Environment setup¶
In [ ]:
# Load the packages
import gc
import scanpy as sc
import decoupler as dc
import scanpy.external as sce
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Set the system path
import os
import sys
sys.path.append(os.path.abspath('../../'))
from Code.Utils.Preprocessing_scanpy import *
# Plotting options, change to your liking
sc.set_figure_params(dpi=300, frameon=False, figsize=(4, 4))
Load Data¶
In [ ]:
# Load example dataset
src_dir = '../../data/Preprocessed_data'
scrna_path = '../data/Time_series_scRNA/GSE162045_RAW/PC9'
batch_ids = ['GSM4932159_sample1', 'GSM4932160_sample2', 'GSM4932161_sample3', 'GSM4932162_sample4']
adatas = {}
for batch_id in batch_ids:
    print(batch_id)
    adata = load_adata(scrna_path, batch_id)
    adatas[batch_id] = adata
    del adata
    gc.collect()
GSM4932159_sample1
GSM4932160_sample2
GSM4932161_sample3
GSM4932162_sample4
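`load_adata` comes from `Code.Utils.Preprocessing_scanpy` and is not shown in this notebook. For readers without access to that module, here is a minimal sketch of what such a loader might look like, assuming each GSM sample is stored as a 10x-style matrix/barcodes/features directory and that the batch label is recorded in `.obs['batch']`; both the file layout and the function body are assumptions, not the actual implementation.

import scanpy as sc

def load_adata_sketch(scrna_path, batch_id):
    """Hypothetical stand-in for load_adata (assumptions, not the real helper)."""
    # Assumes <scrna_path>/<batch_id>/ holds 10x CellRanger output
    # (matrix.mtx.gz, barcodes.tsv.gz, features.tsv.gz).
    adata = sc.read_10x_mtx(f'{scrna_path}/{batch_id}', var_names='gene_symbols')
    adata.var_names_make_unique()      # avoid duplicated gene symbols
    adata.obs['batch'] = batch_id      # batch label used later for integration
    return adata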
Preprocessing¶
Quality control¶
In [ ]:
# Perform basic QC on each sample independently and concatenate them
for batch_id in batch_ids:
    print(batch_id)
    adatas[batch_id] = basic_qc(adatas[batch_id])
    gc.collect()
adata = sc.concat(
    list(adatas.values()),
    # join='outer'
)
GSM4932159_sample1
Before cell filtering:
After cell filtering:
GSM4932160_sample2
Before cell filtering:
After cell filtering:
GSM4932161_sample3
Before cell filtering:
After cell filtering:
GSM4932162_sample4
Before cell filtering:
After cell filtering:
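`basic_qc` also lives in the utility module; judging from the "Before/After cell filtering" messages above, it annotates standard QC metrics and removes low-quality cells. A minimal sketch under those assumptions follows; the thresholds and the function body are illustrative, not the actual implementation.

import scanpy as sc

def basic_qc_sketch(adata, min_genes=200, min_cells=3, max_pct_mt=20):
    """Hypothetical stand-in for basic_qc; thresholds are assumed."""
    adata.var['mt'] = adata.var_names.str.startswith('MT-')   # human mitochondrial genes
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
                               log1p=False, inplace=True)
    print('Before cell filtering:', adata.shape)
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    adata = adata[adata.obs['pct_counts_mt'] < max_pct_mt].copy()
    print('After cell filtering:', adata.shape)
    return adata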
In [ ]:
# Visualize the highest expressed genes and basic QC metrics for the concatenated dataset
sc.pl.highest_expr_genes(adata, n_top=20, showfliers=False)
sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.4,
    multi_panel=True,
)
sc.pl.scatter(adata, x="total_counts", y="pct_counts_mt")
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts")
In [ ]:
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7463 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
In [ ]:
# Gate n_genes_by_counts at median ± 3 standard deviations (floored at 0)
x = adata.obs['n_genes_by_counts'].values
x_min = np.max([0, np.median(x) - 3 * np.std(x)])
x_max = np.median(x) + 3 * np.std(x)
In [ ]:
sc.pl.violin(
    adata,
    ["n_genes_by_counts"],
    jitter=0.4,
    multi_panel=True,
)
In [ ]:
adata = adata[adata.obs.n_genes_by_counts > x_min, :]  # adjust the threshold to match the violin plot above
adata = adata[adata.obs.n_genes_by_counts < x_max, :]  # adjust the threshold to match the violin plot above
sc.pl.violin(
    adata,
    ["n_genes_by_counts"],
    jitter=0.4,
    multi_panel=True,
)
Normalize data¶
In [ ]:
adata = Normalized_data(adata)
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'mean', 'std'
    uns: 'log1p'
    layers: 'raw', 'norm', 'log_norm', 'scale_data'
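`Normalized_data` is another utility-module helper. The layers it leaves behind ('raw', 'norm', 'log_norm', 'scale_data'), the 'mean'/'std' columns in `.var`, and the 'log1p' entry in `.uns` suggest a standard normalize → log1p → scale pipeline with a snapshot of each stage. A minimal sketch under that assumption follows; the parameter values, the order of steps, and which matrix ends up in `.X` are guesses, not the real implementation.

import scanpy as sc

def Normalized_data_sketch(adata, target_sum=1e4, max_value=10):
    """Hypothetical stand-in for Normalized_data; reconstructed from the layers above."""
    adata.layers['raw'] = adata.X.copy()                    # raw counts
    sc.pp.normalize_total(adata, target_sum=target_sum)     # depth normalization
    adata.layers['norm'] = adata.X.copy()
    sc.pp.log1p(adata)                                      # records 'log1p' in .uns
    adata.layers['log_norm'] = adata.X.copy()
    sc.pp.scale(adata, max_value=max_value)                 # adds 'mean'/'std' to .var
    adata.layers['scale_data'] = adata.X.copy()
    adata.X = adata.layers['log_norm'].copy()               # assume log-normalized data stays in .X
    return adata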
Identify highly variable genes¶
In [ ]:
# Identify the highly variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=3000)
sc.pl.highly_variable_genes(adata)
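With the `highly_variable` annotation in `.var`, scanpy's PCA restricts itself to those genes by default, so the batch-correction cells below already operate in HVG space. If you prefer to subset explicitly (an optional step, not part of the original workflow), a minimal sketch:

# Optional: keep the full gene space in .raw and restrict adata to the 3000 HVGs
adata.raw = adata
adata = adata[:, adata.var['highly_variable']].copy()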
Batch correction¶
UMAP visualization without batch correction¶
In [ ]:
batch_correct_method='raw'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# No batch correction: 'raw' keeps the uncorrected PCA embedding
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
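`Batch_correct` is a wrapper from the utility module that dispatches to the selected integration backend and builds the neighborhood graph that `sc.tl.umap` and `sc.tl.leiden` need. A minimal sketch of how such a dispatcher could look with `scanpy.external`, assuming a 'batch' column in `.obs`; the function body, representation keys, and defaults are assumptions, not the actual implementation.

import scanpy as sc
import scanpy.external as sce

def Batch_correct_sketch(adata, method='raw', batch_key='batch', n_pcs=30):
    """Hypothetical stand-in for Batch_correct (assumptions, not the real wrapper)."""
    if method == 'harmony':
        sce.pp.harmony_integrate(adata, key=batch_key)            # writes X_pca_harmony
        sc.pp.neighbors(adata, use_rep='X_pca_harmony', n_pcs=n_pcs)
    elif method == 'scanorama':
        sce.pp.scanorama_integrate(adata, key=batch_key)          # writes X_scanorama
        sc.pp.neighbors(adata, use_rep='X_scanorama', n_pcs=n_pcs)
    elif method == 'bbknn':
        sce.pp.bbknn(adata, batch_key=batch_key)                  # replaces the neighbor graph
    elif method == 'mnn':
        # mnn_correct returns a tuple; its first element is the corrected data
        adata = sce.pp.mnn_correct(adata, batch_key=batch_key, do_concatenate=True)[0]
        sc.tl.pca(adata, svd_solver='arpack')
        sc.pp.neighbors(adata, n_pcs=n_pcs)
    else:  # 'raw': no correction, neighbors on the uncorrected PCA
        sc.pp.neighbors(adata, n_pcs=n_pcs)
    return adata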
Harmony batch correction¶
In [ ]:
batch_correct_method='harmony'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# Batch correction by harmony
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
2024-06-03 14:12:01,353 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-06-03 14:12:02,282 - harmonypy - INFO - sklearn.KMeans initialization complete.
2024-06-03 14:12:02,294 - harmonypy - INFO - Iteration 1 of 10
2024-06-03 14:12:02,893 - harmonypy - INFO - Iteration 2 of 10
2024-06-03 14:12:03,422 - harmonypy - INFO - Converged after 2 iterations
Scanorama batch correction¶
In [ ]:
batch_correct_method='scanorama'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# Batch correction by Scanorama
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
Processing datasets GSM4932160_sample2 <=> GSM4932161_sample3
Processing datasets GSM4932161_sample3 <=> GSM4932162_sample4
Processing datasets GSM4932159_sample1 <=> GSM4932160_sample2
Processing datasets GSM4932160_sample2 <=> GSM4932162_sample4
Processing datasets GSM4932159_sample1 <=> GSM4932161_sample3
BBKNN batch correction¶
In [ ]:
batch_correct_method='bbknn'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# Batch correction by BBKNN
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')
WARNING: consider updating your call to make use of `computation`
MNN batch correction¶
In [ ]:
batch_correct_method='mnn'
tmp_adata = adata.copy()
# Generate PCA features
sc.tl.pca(tmp_adata, svd_solver='arpack')
# Batch correction by MNN
tmp_adata = Batch_correct(tmp_adata, method=batch_correct_method)
# Generate UMAP features
sc.tl.umap(tmp_adata)
# Run leiden clustering algorithm
sc.tl.leiden(tmp_adata)
# Visualize
sc.pl.umap(tmp_adata, color='leiden', title=f'{batch_correct_method} UMAP',
           frameon=False, legend_fontweight='normal', legend_fontsize=15)
tmp_adata.write(f'{src_dir}/preprocessed_{batch_correct_method}.h5ad')