Environment setup¶

In [ ]:
# Load the packages
import os
import sys
import gc
import scanpy as sc
import decoupler as dc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set the system path
import os
import sys
sys.path.append(os.path.abspath('../../'))

from Code.Utils.Preprocessing_scanpy import *

# Plotting options, change to your liking
sc.settings.set_figure_params(dpi=300, frameon=False)
sc.set_figure_params(dpi=300)
sc.set_figure_params(figsize=(6, 4))

Load Data¶

In [ ]:
# Load example dataset
adata = sc.read('../../data/Preprocessed_data/preprocessed_harmony.h5ad')
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'log_norm', 'norm', 'raw', 'scale_data'
    obsp: 'connectivities', 'distances'

Load marker gene database¶

Use cell type specific marker genes to annotate single cell clusters. Marker genes that are mainly expressed exclusively by a specific cell type, making them useful to distinguish heterogeneous groups of cells. Marker genes were discovered and annotated in previous studies and there are some resources that collect and curate them.

In [ ]:
Marker_DB = 'CellMakers' # 'CellMakers' or 'CellMarkers'
In [ ]:
if Marker_DB == 'PanglaoDB':
    # Query Omnipath and get PanglaoDB
    markers = dc.get_resource('PanglaoDB')

    # Human only (general)
    cell_markers = markers[(markers['organ']=='Immune system') & markers['canonical_marker'] & markers['human'] & (markers['human_sensitivity'] > 0.5)]

    # Remove duplicated entries
    cell_markers = cell_markers[~cell_markers.duplicated(['cell_type', 'genesymbol'])]

elif Marker_DB == 'CellMakers':
    # Load cell marker database
    markers = pd.read_csv('../../data/CellMarker_DB/Cell_marker_Seq.csv')
    # Subset to human only. Can add more condition constraints here
    markers = markers[(markers['species']=='Human')]
    markers = markers.drop('Symbol', axis=1).rename(columns={'marker': 'genesymbol'})
    markers = markers.drop('cell_type', axis=1).rename(columns={'cell_name': 'cell_type'})
    is_d = markers.duplicated(['cell_type', 'genesymbol']).values
    cell_markers = markers[~is_d]

cell_markers.head(5)
Out[ ]:
species tissue_class tissue_type uberonongology_id cancer_type cell_type cellontology_id genesymbol GeneID Genetype Genename UNIPROTID technology_seq marker_source PMID Title journal year
0 Human Abdomen Abdominal fat pad NaN Normal Brown adipocyte CL_0000449 FABP4 2167.0 protein_coding fatty acid binding protein 4 E7DVW4 sci-RNA-seq Experiment 32355218.0 Single-cell transcriptional networks in differ... Nature communications 2020.0
1 Human Abdomen Abdominal fat pad NaN Normal Brown adipocyte CL_0000449 PDGFRα NaN NaN NaN NaN sci-RNA-seq Experiment 32355218.0 Single-cell transcriptional networks in differ... Nature communications 2020.0
2 Human Abdomen Abdominal fat pad NaN Normal Brown adipocyte CL_0000449 UCP1 7350.0 protein_coding uncoupling protein 1 P25874 sci-RNA-seq Experiment 32355218.0 Single-cell transcriptional networks in differ... Nature communications 2020.0
22 Human Adipose tissue Adipose tissue UBERON_0001013 Normal Stem cell CL_0000034 FABP4 2167.0 protein_coding fatty acid binding protein 4 E7DVW4 Single-cell sequencing Experiment 30639037.0 Patient adipose stem cell-derived adipocytes r... Cell stem cell 2019.0
23 Human Adipose tissue Adipose tissue UBERON_0001013 Normal Pan fibroblast NaN vimentin NaN NaN NaN NaN Single-cell sequencing Experiment 31053654.0 Distinct immunocyte-promoting and adipocyte-ge... Science immunology 2019.0

Automatic annotation¶

Enrichment with Over Representation Analysis¶

ORA (Over representation analysis): the top 5% of expressed genes by sample are selected as the set of interest (S)

In [ ]:
dc.run_ora(
    mat=adata,
    net=cell_markers,
    source='cell_type',
    target='genesymbol',
    min_n=3,
    verbose=True,
    use_raw=False
)
12390 features of mat are empty, they will be removed.
Running ora on mat with 7391 samples and 21304 targets for 418 sources.
100%|██████████| 7391/7391 [00:20<00:00, 360.76it/s]
In [ ]:
acts = dc.get_acts(adata, obsm_key='ora_estimate')

# We need to remove inf and set them to the maximum value observed for pvals=0
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e
acts
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 418
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    uns: 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
In [ ]:
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='wilcoxon')
df.head(5)
Out[ ]:
group reference names statistic meanchange pvals pvals_adj
0 0 rest Fibrocartilage chondrocyte 19.640788 0.316777 6.931426e-86 4.139052e-84
1 0 rest Stromal cell 18.523929 0.141692 1.324114e-76 6.918497e-75
2 0 rest Mesangial cell 16.388841 0.205480 2.297858e-60 7.388499e-59
3 0 rest m-Pericyte 15.273152 0.296484 1.154554e-52 3.217357e-51
4 0 rest Microglial cell 14.907150 0.156550 2.961248e-50 7.736260e-49
... ... ... ... ... ... ... ...
3757 8 rest Myofibroblast -4.206916 -0.129191 2.588788e-05 4.704841e-04
3758 8 rest Enterocyte -4.270273 -0.110944 1.952341e-05 3.886088e-04
3759 8 rest Stromal cell -4.777971 -0.106471 1.770730e-06 4.353913e-05
3760 8 rest Smooth muscle cell -5.195685 -0.151648 2.039674e-07 6.558335e-06
3761 8 rest Myeloid cell -7.040084 -0.093089 1.921246e-12 2.676935e-10

3762 rows × 7 columns

In [ ]:
n_ctypes = 5
ctypes_dict = df.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict
Out[ ]:
{'0': ['Fibrocartilage chondrocyte',
  'Stromal cell',
  'Mesangial cell',
  'm-Pericyte',
  'Microglial cell'],
 '1': ['Interstitial progenitor cell',
  'Transit-amplifying cell',
  'Inner radial glial cell',
  'Proliferative cell',
  'Neuronal progenitor cell'],
 '2': ['Ductal cell',
  'Pelvic epithelial cell',
  'Plasma cell',
  'Cholangiocyte',
  'Monocyte'],
 '3': ['Interstitial progenitor cell',
  'Cartilage progenitor cell',
  'Nephron progenitor cell',
  'Transit-amplifying cell',
  'Distal cell'],
 '4': ['CD4+ T cell',
  'Microglial cell',
  'Cancer-associated fibroblast',
  'Monocyte',
  'Plasma cell'],
 '5': ['Loop progenitor cell',
  'Neural progenitor cell',
  'Proliferative ductal cell',
  'Exhausted T(Tex) cell',
  'Proliferative neural progenitor cell'],
 '6': ['Memory T cell',
  'Plasma cell',
  'Macrophage',
  'Monocyte',
  'Myeloid cell'],
 '7': ['Cartilage progenitor cell',
  'Nephron progenitor cell',
  'Proliferative ductal cell',
  'Cycling B cell',
  'Exhausted T(Tex) cell'],
 '8': ['Regulatory chondrocyte',
  'Cancer stem cell',
  'Retinal pigment epithelial cell',
  'M2 macrophage',
  'Germ cell']}
In [ ]:
annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
print(annotation_dict)

# Add cell type column based on annotation
adata.obs['cell_type'] = [annotation_dict[clust] for clust in adata.obs['leiden']]
acts.obs['cell_type'] = [annotation_dict[clust] for clust in adata.obs['leiden']]
{'0': 'Fibrocartilage chondrocyte', '1': 'Interstitial progenitor cell', '2': 'Ductal cell', '3': 'Interstitial progenitor cell', '4': 'CD4+ T cell', '5': 'Loop progenitor cell', '6': 'Memory T cell', '7': 'Cartilage progenitor cell', '8': 'Regulatory chondrocyte'}
In [ ]:
# Visualize
sc.pl.matrixplot(acts, ctypes_dict, 'cell_type', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')
sc.pl.umap(adata, color=['cell_type'])
sc.pl.umap(adata, color=['batch'])
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: CD4+ T cell, Cartilage progenitor cell, Ductal cell, etc.
var_group_labels: 0, 1, 2, etc.
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Save the results¶

In [ ]:
adata.write('../../data/Preprocessed_data/harmony_cellmarkers.h5ad')
adata.obs.to_csv('../../data/Preprocessed_data/harmony_cellmarkers_meta.csv')
adata.var.to_csv('../../data/Preprocessed_data/harmony_cellmarkers_gene.csv')
pd.DataFrame(adata.layers['raw'].toarray()).to_csv('../../data/Preprocessed_data/harmony_cellmarkers_mtx.csv')