# Load the packages
import os
import sys
import gc
import scanpy as sc
import decoupler as dc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set the system path
import os
import sys
sys.path.append(os.path.abspath('../../'))

from Code.Utils.Preprocessing_scanpy import *

# Plotting options, change to your liking
sc.settings.set_figure_params(dpi=300, frameon=False)
sc.set_figure_params(dpi=300)
sc.set_figure_params(figsize=(6, 4))

# Load example dataset
adata = sc.read('../../data/Preprocessed_data/preprocessed_harmony.h5ad')
adata

AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'log_norm', 'norm', 'raw', 'scale_data'
    obsp: 'connectivities', 'distances'

Marker_DB = 'CellMakers' # 'CellMakers' or 'CellMarkers'

if Marker_DB == 'PanglaoDB':
    # Query Omnipath and get PanglaoDB
    markers = dc.get_resource('PanglaoDB')

    # Human only (general)
    cell_markers = markers[(markers['organ']=='Immune system') & markers['canonical_marker'] & markers['human'] & (markers['human_sensitivity'] > 0.5)]

    # Remove duplicated entries
    cell_markers = cell_markers[~cell_markers.duplicated(['cell_type', 'genesymbol'])]

elif Marker_DB == 'CellMakers':
    # Load cell marker database
    markers = pd.read_csv('../../data/CellMarker_DB/Cell_marker_Seq.csv')
    # Subset to human only. Can add more condition constraints here
    markers = markers[(markers['species']=='Human')]
    markers = markers.drop('Symbol', axis=1).rename(columns={'marker': 'genesymbol'})
    markers = markers.drop('cell_type', axis=1).rename(columns={'cell_name': 'cell_type'})
    is_d = markers.duplicated(['cell_type', 'genesymbol']).values
    cell_markers = markers[~is_d]

cell_markers.head(5)

dc.run_ora(
    mat=adata,
    net=cell_markers,
    source='cell_type',
    target='genesymbol',
    min_n=3,
    verbose=True,
    use_raw=False
)

12390 features of mat are empty, they will be removed.
Running ora on mat with 7391 samples and 21304 targets for 418 sources.

100%|██████████| 7391/7391 [00:20<00:00, 360.76it/s]

acts = dc.get_acts(adata, obsm_key='ora_estimate')

# We need to remove inf and set them to the maximum value observed for pvals=0
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e
acts

AnnData object with n_obs × n_vars = 7391 × 418
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    uns: 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'

df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='wilcoxon')
df.head(5)

n_ctypes = 5
ctypes_dict = df.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict

{'0': ['Fibrocartilage chondrocyte',
  'Stromal cell',
  'Mesangial cell',
  'm-Pericyte',
  'Microglial cell'],
 '1': ['Interstitial progenitor cell',
  'Transit-amplifying cell',
  'Inner radial glial cell',
  'Proliferative cell',
  'Neuronal progenitor cell'],
 '2': ['Ductal cell',
  'Pelvic epithelial cell',
  'Plasma cell',
  'Cholangiocyte',
  'Monocyte'],
 '3': ['Interstitial progenitor cell',
  'Cartilage progenitor cell',
  'Nephron progenitor cell',
  'Transit-amplifying cell',
  'Distal cell'],
 '4': ['CD4+ T cell',
  'Microglial cell',
  'Cancer-associated fibroblast',
  'Monocyte',
  'Plasma cell'],
 '5': ['Loop progenitor cell',
  'Neural progenitor cell',
  'Proliferative ductal cell',
  'Exhausted T(Tex) cell',
  'Proliferative neural progenitor cell'],
 '6': ['Memory T cell',
  'Plasma cell',
  'Macrophage',
  'Monocyte',
  'Myeloid cell'],
 '7': ['Cartilage progenitor cell',
  'Nephron progenitor cell',
  'Proliferative ductal cell',
  'Cycling B cell',
  'Exhausted T(Tex) cell'],
 '8': ['Regulatory chondrocyte',
  'Cancer stem cell',
  'Retinal pigment epithelial cell',
  'M2 macrophage',
  'Germ cell']}

annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
print(annotation_dict)

# Add cell type column based on annotation
adata.obs['cell_type'] = [annotation_dict[clust] for clust in adata.obs['leiden']]
acts.obs['cell_type'] = [annotation_dict[clust] for clust in adata.obs['leiden']]

{'0': 'Fibrocartilage chondrocyte', '1': 'Interstitial progenitor cell', '2': 'Ductal cell', '3': 'Interstitial progenitor cell', '4': 'CD4+ T cell', '5': 'Loop progenitor cell', '6': 'Memory T cell', '7': 'Cartilage progenitor cell', '8': 'Regulatory chondrocyte'}

# Visualize
sc.pl.matrixplot(acts, ctypes_dict, 'cell_type', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')
sc.pl.umap(adata, color=['cell_type'])
sc.pl.umap(adata, color=['batch'])

WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: CD4+ T cell, Cartilage progenitor cell, Ductal cell, etc.
var_group_labels: 0, 1, 2, etc.

adata.write('../../data/Preprocessed_data/harmony_cellmarkers.h5ad')
adata.obs.to_csv('../../data/Preprocessed_data/harmony_cellmarkers_meta.csv')
adata.var.to_csv('../../data/Preprocessed_data/harmony_cellmarkers_gene.csv')
pd.DataFrame(adata.layers['raw'].toarray()).to_csv('../../data/Preprocessed_data/harmony_cellmarkers_mtx.csv')

	species	tissue_class	tissue_type	uberonongology_id	cancer_type	cell_type	cellontology_id	genesymbol	GeneID	Genetype	Genename	UNIPROTID	technology_seq	marker_source	PMID	Title	journal	year
0	Human	Abdomen	Abdominal fat pad	NaN	Normal	Brown adipocyte	CL_0000449	FABP4	2167.0	protein_coding	fatty acid binding protein 4	E7DVW4	sci-RNA-seq	Experiment	32355218.0	Single-cell transcriptional networks in differ...	Nature communications	2020.0
1	Human	Abdomen	Abdominal fat pad	NaN	Normal	Brown adipocyte	CL_0000449	PDGFRα	NaN	NaN	NaN	NaN	sci-RNA-seq	Experiment	32355218.0	Single-cell transcriptional networks in differ...	Nature communications	2020.0
2	Human	Abdomen	Abdominal fat pad	NaN	Normal	Brown adipocyte	CL_0000449	UCP1	7350.0	protein_coding	uncoupling protein 1	P25874	sci-RNA-seq	Experiment	32355218.0	Single-cell transcriptional networks in differ...	Nature communications	2020.0
22	Human	Adipose tissue	Adipose tissue	UBERON_0001013	Normal	Stem cell	CL_0000034	FABP4	2167.0	protein_coding	fatty acid binding protein 4	E7DVW4	Single-cell sequencing	Experiment	30639037.0	Patient adipose stem cell-derived adipocytes r...	Cell stem cell	2019.0
23	Human	Adipose tissue	Adipose tissue	UBERON_0001013	Normal	Pan fibroblast	NaN	vimentin	NaN	NaN	NaN	NaN	Single-cell sequencing	Experiment	31053654.0	Distinct immunocyte-promoting and adipocyte-ge...	Science immunology	2019.0

	group	reference	names	statistic	meanchange	pvals	pvals_adj
0	0	rest	Fibrocartilage chondrocyte	19.640788	0.316777	6.931426e-86	4.139052e-84
1	0	rest	Stromal cell	18.523929	0.141692	1.324114e-76	6.918497e-75
2	0	rest	Mesangial cell	16.388841	0.205480	2.297858e-60	7.388499e-59
3	0	rest	m-Pericyte	15.273152	0.296484	1.154554e-52	3.217357e-51
4	0	rest	Microglial cell	14.907150	0.156550	2.961248e-50	7.736260e-49
...	...	...	...	...	...	...	...
3757	8	rest	Myofibroblast	-4.206916	-0.129191	2.588788e-05	4.704841e-04
3758	8	rest	Enterocyte	-4.270273	-0.110944	1.952341e-05	3.886088e-04
3759	8	rest	Stromal cell	-4.777971	-0.106471	1.770730e-06	4.353913e-05
3760	8	rest	Smooth muscle cell	-5.195685	-0.151648	2.039674e-07	6.558335e-06
3761	8	rest	Myeloid cell	-7.040084	-0.093089	1.921246e-12	2.676935e-10

Environment setup¶

Load Data¶

Load marker gene database¶

Automatic annotation¶

Enrichment with Over Representation Analysis¶

Save the results¶