Set up¶
- Tool: CellphoneDB, liana, decoupler
- Github: None
- Paper: https://www.sc-best-practices.org/conditions/gsea_pathway.html#
In [ ]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import decoupler
from pathlib import Path
# Filtering warnings from current version of matplotlib
import warnings
warnings.filterwarnings('ignore')
In [ ]:
# # Downloading reactome pathways
# from pathlib import Path
# if not Path("c2.cp.reactome.v7.5.1.symbols.gmt").is_file():
# !wget -O 'c2.cp.reactome.v7.5.1.symbols.gmt' https://figshare.com/ndownloader/files/35233771
In [ ]:
sc.settings.set_figure_params(dpi=200, frameon=False)
sc.set_figure_params(dpi=200)
sc.set_figure_params(figsize=(4, 4))
In [ ]:
src_dir = '../../data/Preprocessed_data'
Load data¶
In [ ]:
adata = sc.read(f'{src_dir}/harmony_cellmarkers.h5ad')
adata
Out[Â ]:
AnnData object with n_obs × n_vars = 7391 × 33694 obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'cell_type' var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm' uns: 'batch_colors', 'cell_type_colors', 'dendrogram_cell_type', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals' varm: 'PCs' layers: 'log_norm', 'norm', 'raw', 'scale_data' obsp: 'connectivities', 'distances'
In [ ]:
sc.pl.umap(
adata,
color=["batch", "cell_type"],
frameon=False,
ncols=2,
)
In [ ]:
adata.obs["group"] = adata.obs.batch.astype("string") + "_" + adata.obs.cell_type
In [ ]:
def gmt_to_decoupler(pth: Path) -> pd.DataFrame:
"""
Parse a gmt file to a decoupler pathway dataframe.
"""
from itertools import chain, repeat
pathways = {}
with Path(pth).open("r") as f:
for line in f:
name, _, *genes = line.strip().split("\t")
pathways[name] = genes
return pd.DataFrame.from_records(
chain.from_iterable(zip(repeat(k), v) for k, v in pathways.items()),
columns=["geneset", "genesymbol"],
)
In [ ]:
reactome = gmt_to_decoupler('../../data/Pathway_DB/c2.cp.reactome.v7.5.1.symbols.gmt')
In [ ]:
# There are lots of available database, we use a fixed one for reproducibility
decoupler.show_resources()
Out[Â ]:
['Adhesome', 'Almen2009', 'Baccin2019', 'CORUM_Funcat', 'CORUM_GO', 'CSPA', 'CSPA_celltype', 'CancerDrugsDB', 'CancerGeneCensus', 'CancerSEA', 'CellCall', 'CellCellInteractions', 'CellChatDB', 'CellChatDB_complex', 'CellPhoneDB', 'CellPhoneDB_complex', 'CellTalkDB', 'CellTypist', 'Cellinker', 'Cellinker_complex', 'ComPPI', 'CytoSig', 'DGIdb', 'DisGeNet', 'EMBRACE', 'Exocarta', 'GO_Intercell', 'GPCRdb', 'Guide2Pharma', 'HGNC', 'HPA_secretome', 'HPA_subcellular', 'HPA_tissue', 'HPMR', 'HumanCellMap', 'ICELLNET', 'ICELLNET_complex', 'IntOGen', 'Integrins', 'InterPro', 'KEGG-PC', 'Kirouac2010', 'LOCATE', 'LRdb', 'Lambert2018', 'MCAM', 'MSigDB', 'Matrisome', 'MatrixDB', 'Membranome', 'NetPath', 'OPM', 'PROGENy', 'PanglaoDB', 'Phobius', 'Phosphatome', 'Ramilowski2015', 'Ramilowski_location', 'SIGNOR', 'SignaLink_function', 'SignaLink_pathway', 'Surfaceome', 'TCDB', 'TFcensus', 'TopDB', 'UniProt_family', 'UniProt_keyword', 'UniProt_location', 'UniProt_tissue', 'UniProt_topology', 'Vesiclepedia', 'Wang', 'Zhong2015', 'connectomeDB2020', 'iTALK', 'kinase.com', 'scConnect', 'scConnect_complex', 'talklr']
In [ ]:
# Retrieving via python
msigdb = decoupler.get_resource("MSigDB")
# Get reactome pathways
reactome = msigdb.query("collection == 'reactome_pathways'")
# Filter duplicates
reactome = reactome[~reactome.duplicated(("geneset", "genesymbol"))]
Analysis¶
Running GSEA¶
In [ ]:
# Filtering genesets to match behaviour of fgsea
geneset_size = reactome.groupby("geneset").size()
gsea_genesets = geneset_size.index[(geneset_size > 15) & (geneset_size < 500)]
In [ ]:
# find DE genes by t-test
sc.tl.rank_genes_groups(adata, "cell_type", method="t-test", key_added="t-test")
In [ ]:
adata
Out[Â ]:
AnnData object with n_obs × n_vars = 7391 × 33694 obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'cell_type', 'group' var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm' uns: 'batch_colors', 'cell_type_colors', 'dendrogram_cell_type', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap', 't-test' obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals' varm: 'PCs' layers: 'log_norm', 'norm', 'raw', 'scale_data' obsp: 'connectivities', 'distances'
Cluster-level gene set enrichment analysis¶
In [ ]:
celltype_conditions = adata.obs["cell_type"].unique()
for celltype_condition in celltype_conditions[:2]:
tmp = sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test").head()
tmp
Out[Â ]:
names | scores | logfoldchanges | pvals | pvals_adj | |
---|---|---|---|---|---|
0 | PTTG1 | 60.000668 | NaN | 3.530612e-278 | 5.948023e-274 |
1 | CDKN3 | 56.458328 | NaN | 1.684712e-285 | 5.676468e-281 |
2 | CKS1B | 49.613487 | NaN | 2.882141e-239 | 2.427771e-235 |
3 | BIRC5 | 46.635349 | NaN | 4.412753e-199 | 2.124047e-195 |
4 | CCNB1 | 46.380829 | NaN | 3.782304e-178 | 1.416011e-174 |
In [ ]:
# extract scores
t_stats = (
# Get dataframe of DE results for condition vs. rest
sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test")
# Subset to highly variable genes
.set_index("names")
.loc[adata.var["highly_variable"]]
# Sort by absolute score
.sort_values("scores", key=np.abs, ascending=False)
# Format for decoupler
[["scores"]]
.rename_axis([celltype_condition], axis=1)
)
t_stats
Out[Â ]:
Cartilage progenitor cell | scores |
---|---|
names | |
BIRC5 | 46.635349 |
CENPF | 46.066380 |
HSP90AA1 | 40.889629 |
SMC4 | 37.812042 |
DYNLL1 | 33.200161 |
... | ... |
TFDP2 | -0.005575 |
ZBTB33 | 0.004323 |
RAB42 | -0.004106 |
RP11-218C14.8 | 0.003311 |
CD46 | 0.000575 |
3000 rows × 1 columns
In [ ]:
scores, norm, pvals = decoupler.run_gsea(
t_stats.T,
reactome[reactome["geneset"].isin(gsea_genesets)],
source="geneset",
target="genesymbol",
)
gsea_results = (
pd.concat({"score": scores.T, "norm": norm.T, "pval": pvals.T}, axis=1)
.droplevel(level=1, axis=1)
.sort_values("score", ascending=False)
)
In [ ]:
'''
pval: p-value of the enrichment test
score: enrichment score
norm: normalized enrichment score
'''
tmp = gsea_results[gsea_results.pval<=0.05].assign(
**{"-log10(pval)": lambda x: -np.log10(x["pval"])}
).replace(np.inf, np.nan).dropna().head(5).sort_values("-log10(pval)", ascending=False)
tmp.reset_index(inplace=True)
tmp
Out[Â ]:
source | score | norm | pval | -log10(pval) | |
---|---|---|---|---|---|
0 | REACTOME_MEIOSIS | 0.876995 | 1.746298 | 0.001701 | 2.769377 |
1 | REACTOME_POLO_LIKE_KINASE_MEDIATED_EVENTS | 0.921375 | 1.732780 | 0.001715 | 2.765669 |
2 | REACTOME_SEMA3A_PAK_DEPENDENT_AXON_REPULSION | 0.911443 | 1.774311 | 0.001818 | 2.740363 |
3 | REACTOME_RHOBTB_GTPASE_CYCLE | 0.909893 | 1.765635 | 0.001876 | 2.726727 |
4 | REACTOME_ATTENUATION_PHASE | 0.912303 | 1.674935 | 0.003436 | 2.463893 |
In [ ]:
tmp.plot.barh(x="source", y="-log10(pval)", title="Top 5 pathways")
Out[Â ]:
<Axes: title={'center': 'Top 5 pathways'}, ylabel='source'>
Cell-level pathway activity scoring using AUCell¶
In [ ]:
# Using AUCell to score the activity level of pathways and gene sets in each individual cell
# It is based on absolute gene expression in the cell, regardless of expression of genes in the other cells
decoupler.run_aucell(
adata,
reactome,
source="geneset",
target="genesymbol",
use_raw=False,
)
In [ ]:
# Annotate the activity level of these pathways in each of the cells on the UMAP
ifn_pathways = [
"REACTOME_INTERFERON_SIGNALING",
"REACTOME_INTERFERON_ALPHA_BETA_SIGNALING",
"REACTOME_INTERFERON_GAMMA_SIGNALING",
]
adata.obs[ifn_pathways] = adata.obsm["aucell_estimate"][ifn_pathways]
In [ ]:
# Plot the scores on the umap
sc.pl.umap(
adata,
color=["batch", "cell_type"] + ifn_pathways,
frameon=False,
ncols=2,
wspace=0.3,
)
Pseudo-bulks¶
In [ ]:
def subsampled_summation(
adata,
groupby,
*,
n_samples_per_group,
n_cells,
random_state,
layer,
):
"""
Sum sample of X per condition.
Drops conditions which don't have enough samples.
Parameters
----------
adata
AnnData to sum expression of
groupby
Keys in obs to groupby
n_samples_per_group
Number of samples to take per group
n_cells
Number of cells to take per sample
random_state
Random state to use when sampling cells
layer
Which layer of adata to use
Returns
-------
AnnData with same var as original, obs with columns from groupby, and X.
"""
from scipy import sparse
from sklearn.utils import check_random_state
# Checks
if isinstance(groupby, str):
groupby = [groupby]
random_state = check_random_state(random_state)
indices = []
labels = []
grouped = adata.obs.groupby(groupby)
for k, inds in grouped.indices.items():
# Check size of group
if len(inds) < (n_cells * n_samples_per_group):
continue
# Sample from group
condition_inds = random_state.choice(
inds, n_cells * n_samples_per_group, replace=False
)
for i, sample_condition_inds in enumerate(np.split(condition_inds, 3)):
if isinstance(k, tuple):
labels.append((*k, i))
else: # only grouping by one variable
labels.append((k, i))
indices.append(sample_condition_inds)
# obs of output AnnData
new_obs = pd.DataFrame.from_records(
labels,
columns=[*groupby, "sample"],
index=["-".join(map(str, l)) for l in labels],
)
n_out = len(labels)
# Make indicator matrix
indptr = np.arange(0, (n_out + 1) * n_cells, n_cells)
indicator = sparse.csr_matrix(
(
np.ones(n_out * n_cells, dtype=bool),
np.concatenate(indices),
indptr,
),
shape=(len(labels), adata.n_obs),
)
return ad.AnnData(
X=indicator @ sc.get._get_obs_rep(adata, layer=layer),
obs=new_obs,
var=adata.var.copy(),
)
In [ ]:
pb_data = subsampled_summation(
adata, ["cell_type", "batch"], n_cells=75, n_samples_per_group=3, layer="raw", random_state=0
)
pb_data
Out[Â ]:
AnnData object with n_obs × n_vars = 36 × 33694 obs: 'cell_type', 'batch', 'sample' var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
In [ ]:
pb_data.obs["lib_size"] = pb_data.X.sum(1)
pb_data.layers["counts"] = pb_data.X.copy()
In [ ]:
sc.pp.normalize_total(pb_data)
sc.pp.log1p(pb_data)
sc.pp.pca(pb_data)
In [ ]:
In [ ]:
sc.pl.pca(pb_data, color=["cell_type", "batch", "lib_size"], ncols=1, size=250)
In [ ]:
sc.tl.rank_genes_groups(pb_data, "cell_type", method="t-test", key_added="t-test")
In [ ]:
celltype_conditions = adata.obs["cell_type"].unique()
for celltype_condition in celltype_conditions[:2]:
tmp = sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test").head()
tmp
Out[Â ]:
names | scores | logfoldchanges | pvals | pvals_adj | |
---|---|---|---|---|---|
0 | PTTG1 | 60.000668 | NaN | 3.530612e-278 | 5.948023e-274 |
1 | CDKN3 | 56.458328 | NaN | 1.684712e-285 | 5.676468e-281 |
2 | CKS1B | 49.613487 | NaN | 2.882141e-239 | 2.427771e-235 |
3 | BIRC5 | 46.635349 | NaN | 4.412753e-199 | 2.124047e-195 |
4 | CCNB1 | 46.380829 | NaN | 3.782304e-178 | 1.416011e-174 |
In [ ]:
# extract scores
t_stats = (
# Get dataframe of DE results for condition vs. rest
sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test")
# Subset to highly variable genes
.set_index("names")
.loc[adata.var["highly_variable"]]
# Sort by absolute score
.sort_values("scores", key=np.abs, ascending=False)
# Format for decoupler
[["scores"]]
.rename_axis([celltype_condition], axis=1)
)
t_stats
Out[Â ]:
Cartilage progenitor cell | scores |
---|---|
names | |
BIRC5 | 46.635349 |
CENPF | 46.066380 |
HSP90AA1 | 40.889629 |
SMC4 | 37.812042 |
DYNLL1 | 33.200161 |
... | ... |
TFDP2 | -0.005575 |
ZBTB33 | 0.004323 |
RAB42 | -0.004106 |
RP11-218C14.8 | 0.003311 |
CD46 | 0.000575 |
3000 rows × 1 columns
In [ ]:
scores, norm, pvals = decoupler.run_gsea(
t_stats.T,
reactome[reactome["geneset"].isin(gsea_genesets)],
source="geneset",
target="genesymbol",
)
gsea_results = (
pd.concat({"score": scores.T, "norm": norm.T, "pval": pvals.T}, axis=1)
.droplevel(level=1, axis=1)
.sort_values("score", ascending=False)
)
In [ ]:
gsea_results
Out[Â ]:
score | norm | pval | |
---|---|---|---|
source | |||
REACTOME_HSF1_ACTIVATION | 0.936227 | 1.695460 | 0.000000 |
REACTOME_POLO_LIKE_KINASE_MEDIATED_EVENTS | 0.921375 | 1.732780 | 0.001715 |
REACTOME_ATTENUATION_PHASE | 0.912303 | 1.674935 | 0.003436 |
REACTOME_SEMA3A_PAK_DEPENDENT_AXON_REPULSION | 0.911443 | 1.774311 | 0.001818 |
REACTOME_RHOBTB_GTPASE_CYCLE | 0.909893 | 1.765635 | 0.001876 |
... | ... | ... | ... |
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION | -0.887341 | -2.831174 | 0.000000 |
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY | -0.889023 | -2.865755 | 0.000000 |
REACTOME_SELENOAMINO_ACID_METABOLISM | -0.891121 | -2.910915 | 0.000000 |
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE | -0.907605 | -2.852917 | 0.000000 |
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION | -0.915861 | -2.911499 | 0.000000 |
699 rows × 3 columns
In [ ]:
'''
pval: p-value of the enrichment test
score: enrichment score
norm: normalized enrichment score
'''
tmp = gsea_results[gsea_results.pval<=0.05].assign(
**{"-log10(pval)": lambda x: -np.log10(x["pval"])}
).replace(np.inf, np.nan).dropna().head(5).sort_values("-log10(pval)", ascending=True)
tmp.reset_index(inplace=True)
tmp
Out[Â ]:
source | score | norm | pval | -log10(pval) | |
---|---|---|---|---|---|
0 | REACTOME_ATTENUATION_PHASE | 0.912303 | 1.674935 | 0.003436 | 2.463893 |
1 | REACTOME_RHOBTB_GTPASE_CYCLE | 0.909893 | 1.765635 | 0.001876 | 2.726727 |
2 | REACTOME_SEMA3A_PAK_DEPENDENT_AXON_REPULSION | 0.911443 | 1.774311 | 0.001818 | 2.740363 |
3 | REACTOME_POLO_LIKE_KINASE_MEDIATED_EVENTS | 0.921375 | 1.732780 | 0.001715 | 2.765669 |
4 | REACTOME_MEIOSIS | 0.876995 | 1.746298 | 0.001701 | 2.769377 |
In [ ]:
tmp.plot.barh(x="source", y="-log10(pval)", title="Top 5 pathways")
Out[Â ]:
<Axes: title={'center': 'Top 5 pathways'}, ylabel='source'>