Environment setup¶

In [ ]:
# Load the packages
import gc
import scanpy as sc
import decoupler as dc
import scanpy.external as sce
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set the system path
import os
import sys
sys.path.append(os.path.abspath('../../'))

from Code.Utils.Preprocessing_scanpy import *

# Plotting options, change to your liking
sc.settings.set_figure_params(dpi=300, frameon=False)
sc.set_figure_params(dpi=300)
sc.set_figure_params(figsize=(4, 4))

Load Data¶

In [ ]:
# Load example dataset
src_dir = '../../data/Preprocessed_data'
scrna_path = '../data/Time_series_scRNA/GSE162045_RAW/PC9'
batch_ids = ['GSM4932159_sample1', 'GSM4932160_sample2', 'GSM4932161_sample3', 'GSM4932162_sample4']

adatas = {}
for batch_id in batch_ids:
    print(batch_id)
    adata = load_adata(scrna_path, batch_id)
    adatas[batch_id] = adata
    del adata
    gc.collect()
GSM4932159_sample1
GSM4932160_sample2
GSM4932161_sample3
GSM4932162_sample4

Preprocessing¶

Quality control¶

In [ ]:
# Perform basic QC on each sample independently and concatenate them
for batch_id in batch_ids:
    print(batch_id)
    adatas[batch_id] = basic_qc(adatas[batch_id])
    gc.collect()

adata = sc.concat(list(adatas.values()), 
                  # join='outer'
                  )
GSM4932159_sample1
No description has been provided for this image
Before cell filtering:
No description has been provided for this image
After cell filtering:
No description has been provided for this image
GSM4932160_sample2
No description has been provided for this image
Before cell filtering:
No description has been provided for this image
After cell filtering:
No description has been provided for this image
GSM4932161_sample3
No description has been provided for this image
Before cell filtering:
No description has been provided for this image
After cell filtering:
No description has been provided for this image
GSM4932162_sample4
No description has been provided for this image
Before cell filtering:
No description has been provided for this image
After cell filtering:
No description has been provided for this image
In [ ]:
# Visualize the highest expressed genes and basic QC metrics for the concatenated dataset
sc.pl.highest_expr_genes(adata, n_top=20, showfliers=False)

sc.pl.violin(
            adata,
            ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
            jitter=0.4,
            multi_panel=True,
            )

sc.pl.scatter(adata, x="total_counts", y="pct_counts_mt")

sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7463 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
In [ ]:
x = adata.obs['n_genes_by_counts'].values
x_min = np.max([0, np.median(x)-3*np.std(x)])
x_max = np.median(x)+3*np.std(x)
In [ ]:
sc.pl.violin(
            adata,
            ["n_genes_by_counts"],
            jitter=0.4,
            multi_panel=True,
            )
No description has been provided for this image
In [ ]:
adata = adata[adata.obs.n_genes_by_counts > x_min, :] # adopt to violin plot
adata = adata[adata.obs.n_genes_by_counts < x_max, :] # adopt to violin plot
sc.pl.violin(
            adata,
            ["n_genes_by_counts"],
            jitter=0.4,
            multi_panel=True,
            )
No description has been provided for this image

Noramlize data¶

In [ ]:
adata = Normalized_data(adata)
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'mean', 'std'
    uns: 'log1p'
    layers: 'raw', 'norm', 'log_norm', 'scale_data'

Identify highly variable genes¶

In [ ]:
# Identify the highly variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=3000)
sc.pl.highly_variable_genes(adata)
No description has been provided for this image

Batch correction¶

UMAP visualization without batch correction¶

In [ ]:
batch_correct_method='raw'

tmp_adata = adata.copy()

# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')

# Batch correction by harmony
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)

# Generate UMAP features
sc.tl.umap(tmp_adata)

# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)

# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
        frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
No description has been provided for this image

Harmony batch correction¶

In [ ]:
batch_correct_method='harmony'

tmp_adata = adata.copy()

# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')

# Batch correction by harmony
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)

# Generate UMAP features
sc.tl.umap(tmp_adata)

# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)

# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
        frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
2024-06-03 14:12:01,353 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-06-03 14:12:02,282 - harmonypy - INFO - sklearn.KMeans initialization complete.
2024-06-03 14:12:02,294 - harmonypy - INFO - Iteration 1 of 10
2024-06-03 14:12:02,893 - harmonypy - INFO - Iteration 2 of 10
2024-06-03 14:12:03,422 - harmonypy - INFO - Converged after 2 iterations
No description has been provided for this image

Scanorama batch correction¶

In [ ]:
batch_correct_method='scanorama'

tmp_adata = adata.copy()

# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')

# Batch correction by harmony
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)

# Generate UMAP features
sc.tl.umap(tmp_adata)

# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)

# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
        frameon=False, legend_fontweight='normal', legend_fontsize=15)

tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
Processing datasets GSM4932160_sample2 <=> GSM4932161_sample3
Processing datasets GSM4932161_sample3 <=> GSM4932162_sample4
Processing datasets GSM4932159_sample1 <=> GSM4932160_sample2
Processing datasets GSM4932160_sample2 <=> GSM4932162_sample4
Processing datasets GSM4932159_sample1 <=> GSM4932161_sample3
No description has been provided for this image

BBKNN batch correction¶

In [ ]:
batch_correct_method='bbknn'
tmp_adata = adata.copy()

# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')

# Batch correction by BBKNN
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)

# Generate UMAP features
sc.tl.umap(tmp_adata)

# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)

# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
        frameon=False, legend_fontweight='normal', legend_fontsize=15)

tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
WARNING: consider updating your call to make use of `computation`
No description has been provided for this image

MNN batch correction¶

In [ ]:
batch_correct_method='mnn'
tmp_adata = adata.copy()

# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')

# Batch correction by MNN
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)

# Generate UMAP features
sc.tl.umap(tmp_adata)

# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)

# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
        frameon=False, legend_fontweight='normal', legend_fontsize=15)

tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
No description has been provided for this image