Environment setup¶
In [ ]:
# Load the packages
import os
import sys
import gc
import scanpy as sc
import decoupler as dc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Set the system path
import os
import sys
sys.path.append(os.path.abspath('../../'))
from Code.Utils.Preprocessing_scanpy import *
# Plotting options, change to your liking
sc.settings.set_figure_params(dpi=300, frameon=False)
sc.set_figure_params(dpi=300)
sc.set_figure_params(figsize=(6, 4))
Load Data¶
In [ ]:
# Load example dataset
adata = sc.read('../../data/Preprocessed_data/preprocessed_harmony.h5ad')
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 33694 obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden' var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm' uns: 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap' varm: 'PCs' layers: 'log_norm', 'norm', 'raw', 'scale_data' obsp: 'connectivities', 'distances'
Load marker gene database¶
Use cell type specific marker genes to annotate single cell clusters. Marker genes that are mainly expressed exclusively by a specific cell type, making them useful to distinguish heterogeneous groups of cells. Marker genes were discovered and annotated in previous studies and there are some resources that collect and curate them.
In [ ]:
Marker_DB = 'CellMakers' # 'CellMakers' or 'CellMarkers'
In [ ]:
if Marker_DB == 'PanglaoDB':
# Query Omnipath and get PanglaoDB
markers = dc.get_resource('PanglaoDB')
# Human only (general)
cell_markers = markers[(markers['organ']=='Immune system') & markers['canonical_marker'] & markers['human'] & (markers['human_sensitivity'] > 0.5)]
# Remove duplicated entries
cell_markers = cell_markers[~cell_markers.duplicated(['cell_type', 'genesymbol'])]
elif Marker_DB == 'CellMakers':
# Load cell marker database
markers = pd.read_csv('../../data/CellMarker_DB/Cell_marker_Seq.csv')
# Subset to human only. Can add more condition constraints here
markers = markers[(markers['species']=='Human')]
markers = markers.drop('Symbol', axis=1).rename(columns={'marker': 'genesymbol'})
markers = markers.drop('cell_type', axis=1).rename(columns={'cell_name': 'cell_type'})
is_d = markers.duplicated(['cell_type', 'genesymbol']).values
cell_markers = markers[~is_d]
cell_markers.head(5)
Out[ ]:
species | tissue_class | tissue_type | uberonongology_id | cancer_type | cell_type | cellontology_id | genesymbol | GeneID | Genetype | Genename | UNIPROTID | technology_seq | marker_source | PMID | Title | journal | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Human | Abdomen | Abdominal fat pad | NaN | Normal | Brown adipocyte | CL_0000449 | FABP4 | 2167.0 | protein_coding | fatty acid binding protein 4 | E7DVW4 | sci-RNA-seq | Experiment | 32355218.0 | Single-cell transcriptional networks in differ... | Nature communications | 2020.0 |
1 | Human | Abdomen | Abdominal fat pad | NaN | Normal | Brown adipocyte | CL_0000449 | PDGFRα | NaN | NaN | NaN | NaN | sci-RNA-seq | Experiment | 32355218.0 | Single-cell transcriptional networks in differ... | Nature communications | 2020.0 |
2 | Human | Abdomen | Abdominal fat pad | NaN | Normal | Brown adipocyte | CL_0000449 | UCP1 | 7350.0 | protein_coding | uncoupling protein 1 | P25874 | sci-RNA-seq | Experiment | 32355218.0 | Single-cell transcriptional networks in differ... | Nature communications | 2020.0 |
22 | Human | Adipose tissue | Adipose tissue | UBERON_0001013 | Normal | Stem cell | CL_0000034 | FABP4 | 2167.0 | protein_coding | fatty acid binding protein 4 | E7DVW4 | Single-cell sequencing | Experiment | 30639037.0 | Patient adipose stem cell-derived adipocytes r... | Cell stem cell | 2019.0 |
23 | Human | Adipose tissue | Adipose tissue | UBERON_0001013 | Normal | Pan fibroblast | NaN | vimentin | NaN | NaN | NaN | NaN | Single-cell sequencing | Experiment | 31053654.0 | Distinct immunocyte-promoting and adipocyte-ge... | Science immunology | 2019.0 |
Automatic annotation¶
Enrichment with Over Representation Analysis¶
ORA (Over representation analysis): the top 5% of expressed genes by sample are selected as the set of interest (S)
In [ ]:
dc.run_ora(
mat=adata,
net=cell_markers,
source='cell_type',
target='genesymbol',
min_n=3,
verbose=True,
use_raw=False
)
12390 features of mat are empty, they will be removed. Running ora on mat with 7391 samples and 21304 targets for 418 sources.
100%|██████████| 7391/7391 [00:20<00:00, 360.76it/s]
In [ ]:
acts = dc.get_acts(adata, obsm_key='ora_estimate')
# We need to remove inf and set them to the maximum value observed for pvals=0
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e
acts
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 418 obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden' uns: 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
In [ ]:
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='wilcoxon')
df.head(5)
Out[ ]:
group | reference | names | statistic | meanchange | pvals | pvals_adj | |
---|---|---|---|---|---|---|---|
0 | 0 | rest | Fibrocartilage chondrocyte | 19.640788 | 0.316777 | 6.931426e-86 | 4.139052e-84 |
1 | 0 | rest | Stromal cell | 18.523929 | 0.141692 | 1.324114e-76 | 6.918497e-75 |
2 | 0 | rest | Mesangial cell | 16.388841 | 0.205480 | 2.297858e-60 | 7.388499e-59 |
3 | 0 | rest | m-Pericyte | 15.273152 | 0.296484 | 1.154554e-52 | 3.217357e-51 |
4 | 0 | rest | Microglial cell | 14.907150 | 0.156550 | 2.961248e-50 | 7.736260e-49 |
... | ... | ... | ... | ... | ... | ... | ... |
3757 | 8 | rest | Myofibroblast | -4.206916 | -0.129191 | 2.588788e-05 | 4.704841e-04 |
3758 | 8 | rest | Enterocyte | -4.270273 | -0.110944 | 1.952341e-05 | 3.886088e-04 |
3759 | 8 | rest | Stromal cell | -4.777971 | -0.106471 | 1.770730e-06 | 4.353913e-05 |
3760 | 8 | rest | Smooth muscle cell | -5.195685 | -0.151648 | 2.039674e-07 | 6.558335e-06 |
3761 | 8 | rest | Myeloid cell | -7.040084 | -0.093089 | 1.921246e-12 | 2.676935e-10 |
3762 rows × 7 columns
In [ ]:
n_ctypes = 5
ctypes_dict = df.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict
Out[ ]:
{'0': ['Fibrocartilage chondrocyte', 'Stromal cell', 'Mesangial cell', 'm-Pericyte', 'Microglial cell'], '1': ['Interstitial progenitor cell', 'Transit-amplifying cell', 'Inner radial glial cell', 'Proliferative cell', 'Neuronal progenitor cell'], '2': ['Ductal cell', 'Pelvic epithelial cell', 'Plasma cell', 'Cholangiocyte', 'Monocyte'], '3': ['Interstitial progenitor cell', 'Cartilage progenitor cell', 'Nephron progenitor cell', 'Transit-amplifying cell', 'Distal cell'], '4': ['CD4+ T cell', 'Microglial cell', 'Cancer-associated fibroblast', 'Monocyte', 'Plasma cell'], '5': ['Loop progenitor cell', 'Neural progenitor cell', 'Proliferative ductal cell', 'Exhausted T(Tex) cell', 'Proliferative neural progenitor cell'], '6': ['Memory T cell', 'Plasma cell', 'Macrophage', 'Monocyte', 'Myeloid cell'], '7': ['Cartilage progenitor cell', 'Nephron progenitor cell', 'Proliferative ductal cell', 'Cycling B cell', 'Exhausted T(Tex) cell'], '8': ['Regulatory chondrocyte', 'Cancer stem cell', 'Retinal pigment epithelial cell', 'M2 macrophage', 'Germ cell']}
In [ ]:
annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
print(annotation_dict)
# Add cell type column based on annotation
adata.obs['cell_type'] = [annotation_dict[clust] for clust in adata.obs['leiden']]
acts.obs['cell_type'] = [annotation_dict[clust] for clust in adata.obs['leiden']]
{'0': 'Fibrocartilage chondrocyte', '1': 'Interstitial progenitor cell', '2': 'Ductal cell', '3': 'Interstitial progenitor cell', '4': 'CD4+ T cell', '5': 'Loop progenitor cell', '6': 'Memory T cell', '7': 'Cartilage progenitor cell', '8': 'Regulatory chondrocyte'}
In [ ]:
# Visualize
sc.pl.matrixplot(acts, ctypes_dict, 'cell_type', dendrogram=True, standard_scale='var',
colorbar_title='Z-scaled scores', cmap='RdBu_r')
sc.pl.umap(adata, color=['cell_type'])
sc.pl.umap(adata, color=['batch'])
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different. categories: CD4+ T cell, Cartilage progenitor cell, Ductal cell, etc. var_group_labels: 0, 1, 2, etc.
Save the results¶
In [ ]:
adata.write('../../data/Preprocessed_data/harmony_cellmarkers.h5ad')
adata.obs.to_csv('../../data/Preprocessed_data/harmony_cellmarkers_meta.csv')
adata.var.to_csv('../../data/Preprocessed_data/harmony_cellmarkers_gene.csv')
pd.DataFrame(adata.layers['raw'].toarray()).to_csv('../../data/Preprocessed_data/harmony_cellmarkers_mtx.csv')