Environment setup¶
In [ ]:
# Load the packages
import gc
import scanpy as sc
import decoupler as dc
import scanpy.external as sce
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Set the system path
import os
import sys
sys.path.append(os.path.abspath('../../'))
from Code.Utils.Preprocessing_scanpy import *
# Plotting options, change to your liking
sc.set_figure_params(dpi=300, frameon=False, figsize=(4, 4))
Load Data¶
In [ ]:
# Load example dataset
src_dir = '../../data/Preprocessed_data'
scrna_path = '../data/Time_series_scRNA/GSE162045_RAW/PC9'
batch_ids = ['GSM4932159_sample1', 'GSM4932160_sample2', 'GSM4932161_sample3', 'GSM4932162_sample4']
adatas = {}
for batch_id in batch_ids:
    print(batch_id)
    adata = load_adata(scrna_path, batch_id)
    adatas[batch_id] = adata
    del adata
    gc.collect()
GSM4932159_sample1
GSM4932160_sample2
GSM4932161_sample3
GSM4932162_sample4
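`load_adata` comes from `Code.Utils.Preprocessing_scanpy` and is not shown in this notebook. For readers without access to that module, here is a minimal sketch of what such a loader might look like, assuming each GSM sample is stored as a 10x-style matrix/barcodes/features directory and that the batch label is recorded in `.obs['batch']`; both the file layout and the function body are assumptions, not the actual implementation.

import scanpy as sc

def load_adata_sketch(scrna_path, batch_id):
    """Hypothetical stand-in for load_adata (assumptions, not the real helper)."""
    # Assumes <scrna_path>/<batch_id>/ holds 10x CellRanger output
    # (matrix.mtx.gz, barcodes.tsv.gz, features.tsv.gz).
    adata = sc.read_10x_mtx(f'{scrna_path}/{batch_id}', var_names='gene_symbols')
    adata.var_names_make_unique()      # avoid duplicated gene symbols
    adata.obs['batch'] = batch_id      # batch label used later for integration
    return adata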
Preprocessing¶
Quality control¶
In [ ]:
# Perform basic QC on each sample independently and concatenate them
for batch_id in batch_ids:
    print(batch_id)
    adatas[batch_id] = basic_qc(adatas[batch_id])
    gc.collect()
adata = sc.concat(
    list(adatas.values()),
    # join='outer'
)
GSM4932159_sample1
Before cell filtering:
After cell filtering:
GSM4932160_sample2
Before cell filtering:
After cell filtering:
GSM4932161_sample3
Before cell filtering:
After cell filtering:
GSM4932162_sample4
Before cell filtering:
After cell filtering:
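`basic_qc` also lives in the utility module; judging from the "Before/After cell filtering" messages above, it annotates standard QC metrics and removes low-quality cells. A minimal sketch under those assumptions follows; the thresholds and the function body are illustrative, not the actual implementation.

import scanpy as sc

def basic_qc_sketch(adata, min_genes=200, min_cells=3, max_pct_mt=20):
    """Hypothetical stand-in for basic_qc; thresholds are assumed."""
    adata.var['mt'] = adata.var_names.str.startswith('MT-')   # human mitochondrial genes
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
                               log1p=False, inplace=True)
    print('Before cell filtering:', adata.shape)
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    adata = adata[adata.obs['pct_counts_mt'] < max_pct_mt].copy()
    print('After cell filtering:', adata.shape)
    return adata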
In [ ]:
# Visualize the highest expressed genes and basic QC metrics for the concatenated dataset
sc.pl.highest_expr_genes(adata, n_top=20, showfliers=False)
sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.4,
    multi_panel=True,
)
sc.pl.scatter(adata, x="total_counts", y="pct_counts_mt")
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts")
In [ ]:
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7463 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
In [ ]:
# Gate n_genes_by_counts at median ± 3 standard deviations (floored at 0)
x = adata.obs['n_genes_by_counts'].values
x_min = np.max([0, np.median(x) - 3 * np.std(x)])
x_max = np.median(x) + 3 * np.std(x)
In [ ]:
sc.pl.violin(
    adata,
    ["n_genes_by_counts"],
    jitter=0.4,
    multi_panel=True,
)
In [ ]:
adata = adata[adata.obs.n_genes_by_counts > x_min, :]  # adjust the threshold to match the violin plot above
adata = adata[adata.obs.n_genes_by_counts < x_max, :]  # adjust the threshold to match the violin plot above
sc.pl.violin(
    adata,
    ["n_genes_by_counts"],
    jitter=0.4,
    multi_panel=True,
)
Normalize data¶
In [ ]:
adata = Normalized_data(adata)
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'mean', 'std'
    uns: 'log1p'
    layers: 'raw', 'norm', 'log_norm', 'scale_data'
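`Normalized_data` is another utility-module helper. The layers it leaves behind ('raw', 'norm', 'log_norm', 'scale_data'), the 'mean'/'std' columns in `.var`, and the 'log1p' entry in `.uns` suggest a standard normalize → log1p → scale pipeline with a snapshot of each stage. A minimal sketch under that assumption follows; the parameter values, the order of steps, and which matrix ends up in `.X` are guesses, not the real implementation.

import scanpy as sc

def Normalized_data_sketch(adata, target_sum=1e4, max_value=10):
    """Hypothetical stand-in for Normalized_data; reconstructed from the layers above."""
    adata.layers['raw'] = adata.X.copy()                    # raw counts
    sc.pp.normalize_total(adata, target_sum=target_sum)     # depth normalization
    adata.layers['norm'] = adata.X.copy()
    sc.pp.log1p(adata)                                      # records 'log1p' in .uns
    adata.layers['log_norm'] = adata.X.copy()
    sc.pp.scale(adata, max_value=max_value)                 # adds 'mean'/'std' to .var
    adata.layers['scale_data'] = adata.X.copy()
    adata.X = adata.layers['log_norm'].copy()               # assume log-normalized data stays in .X
    return adata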
Identify highly variable genes¶
In [ ]:
# Identify the highly variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=3000)
sc.pl.highly_variable_genes(adata)
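With the `highly_variable` annotation in `.var`, scanpy's PCA restricts itself to those genes by default, so the batch-correction cells below already operate in HVG space. If you prefer to subset explicitly (an optional step, not part of the original workflow), a minimal sketch:

# Optional: keep the full gene space in .raw and restrict adata to the 3000 HVGs
adata.raw = adata
adata = adata[:, adata.var['highly_variable']].copy()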
Batch correction¶
UMAP visualization without batch correction¶
In [ ]:
batch_correct_method='raw'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# No batch correction: 'raw' keeps the uncorrected PCA embedding
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
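`Batch_correct` is a wrapper from the utility module that dispatches to the selected integration backend and builds the neighborhood graph that `sc.tl.umap` and `sc.tl.leiden` need. A minimal sketch of how such a dispatcher could look with `scanpy.external`, assuming a 'batch' column in `.obs`; the function body, representation keys, and defaults are assumptions, not the actual implementation.

import scanpy as sc
import scanpy.external as sce

def Batch_correct_sketch(adata, method='raw', batch_key='batch', n_pcs=30):
    """Hypothetical stand-in for Batch_correct (assumptions, not the real wrapper)."""
    if method == 'harmony':
        sce.pp.harmony_integrate(adata, key=batch_key)            # writes X_pca_harmony
        sc.pp.neighbors(adata, use_rep='X_pca_harmony', n_pcs=n_pcs)
    elif method == 'scanorama':
        sce.pp.scanorama_integrate(adata, key=batch_key)          # writes X_scanorama
        sc.pp.neighbors(adata, use_rep='X_scanorama', n_pcs=n_pcs)
    elif method == 'bbknn':
        sce.pp.bbknn(adata, batch_key=batch_key)                  # replaces the neighbor graph
    elif method == 'mnn':
        # mnn_correct returns a tuple; its first element is the corrected data
        adata = sce.pp.mnn_correct(adata, batch_key=batch_key, do_concatenate=True)[0]
        sc.tl.pca(adata, svd_solver='arpack')
        sc.pp.neighbors(adata, n_pcs=n_pcs)
    else:  # 'raw': no correction, neighbors on the uncorrected PCA
        sc.pp.neighbors(adata, n_pcs=n_pcs)
    return adata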
Harmony batch correction¶
In [ ]:
batch_correct_method='harmony'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# Batch correction by harmony
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
2024-06-03 14:12:01,353 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-06-03 14:12:02,282 - harmonypy - INFO - sklearn.KMeans initialization complete.
2024-06-03 14:12:02,294 - harmonypy - INFO - Iteration 1 of 10
2024-06-03 14:12:02,893 - harmonypy - INFO - Iteration 2 of 10
2024-06-03 14:12:03,422 - harmonypy - INFO - Converged after 2 iterations
Scanorama batch correction¶
In [ ]:
batch_correct_method='scanorama'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# Batch correction by Scanorama
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
Processing datasets GSM4932160_sample2 <=> GSM4932161_sample3
Processing datasets GSM4932161_sample3 <=> GSM4932162_sample4
Processing datasets GSM4932159_sample1 <=> GSM4932160_sample2
Processing datasets GSM4932160_sample2 <=> GSM4932162_sample4
Processing datasets GSM4932159_sample1 <=> GSM4932161_sample3
BBKNN batch correction¶
In [ ]:
batch_correct_method='bbknn'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# Batch correction by BBKNN
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
WARNING: consider updating your call to make use of `computation`
MNN batch correction¶
In [ ]:
batch_correct_method='mnn'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# Batch correction by MNN
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')