Set up¶

  • Tool: CellphoneDB, liana, decoupler
  • Github: None
  • Paper: https://www.sc-best-practices.org/conditions/gsea_pathway.html#
In [ ]:
import numpy as np
import pandas as pd

import scanpy as sc
import anndata as ad
import decoupler
from pathlib import Path

# Filtering warnings from current version of matplotlib
import warnings
warnings.filterwarnings('ignore')
In [ ]:
# # Downloading reactome pathways
# from pathlib import Path

# if not Path("c2.cp.reactome.v7.5.1.symbols.gmt").is_file():
#     !wget -O 'c2.cp.reactome.v7.5.1.symbols.gmt' https://figshare.com/ndownloader/files/35233771
In [ ]:
sc.settings.set_figure_params(dpi=200, frameon=False)
sc.set_figure_params(dpi=200)
sc.set_figure_params(figsize=(4, 4))
In [ ]:
src_dir = '../../data/Preprocessed_data'

Load data¶

In [ ]:
adata = sc.read(f'{src_dir}/harmony_cellmarkers.h5ad')
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'cell_type'
    var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'batch_colors', 'cell_type_colors', 'dendrogram_cell_type', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
    varm: 'PCs'
    layers: 'log_norm', 'norm', 'raw', 'scale_data'
    obsp: 'connectivities', 'distances'
In [ ]:
sc.pl.umap(
    adata,
    color=["batch", "cell_type"],
    frameon=False,
    ncols=2,
)
No description has been provided for this image
In [ ]:
adata.obs["group"] = adata.obs.batch.astype("string") + "_" + adata.obs.cell_type
In [ ]:
def gmt_to_decoupler(pth: Path) -> pd.DataFrame:
    """
    Parse a gmt file to a decoupler pathway dataframe.
    """
    from itertools import chain, repeat

    pathways = {}

    with Path(pth).open("r") as f:
        for line in f:
            name, _, *genes = line.strip().split("\t")
            pathways[name] = genes

    return pd.DataFrame.from_records(
        chain.from_iterable(zip(repeat(k), v) for k, v in pathways.items()),
        columns=["geneset", "genesymbol"],
    )
In [ ]:
reactome = gmt_to_decoupler('../../data/Pathway_DB/c2.cp.reactome.v7.5.1.symbols.gmt')
In [ ]:
# There are lots of available database, we use a fixed one for reproducibility
decoupler.show_resources()
Out[ ]:
['Adhesome',
 'Almen2009',
 'Baccin2019',
 'CORUM_Funcat',
 'CORUM_GO',
 'CSPA',
 'CSPA_celltype',
 'CancerDrugsDB',
 'CancerGeneCensus',
 'CancerSEA',
 'CellCall',
 'CellCellInteractions',
 'CellChatDB',
 'CellChatDB_complex',
 'CellPhoneDB',
 'CellPhoneDB_complex',
 'CellTalkDB',
 'CellTypist',
 'Cellinker',
 'Cellinker_complex',
 'ComPPI',
 'CytoSig',
 'DGIdb',
 'DisGeNet',
 'EMBRACE',
 'Exocarta',
 'GO_Intercell',
 'GPCRdb',
 'Guide2Pharma',
 'HGNC',
 'HPA_secretome',
 'HPA_subcellular',
 'HPA_tissue',
 'HPMR',
 'HumanCellMap',
 'ICELLNET',
 'ICELLNET_complex',
 'IntOGen',
 'Integrins',
 'InterPro',
 'KEGG-PC',
 'Kirouac2010',
 'LOCATE',
 'LRdb',
 'Lambert2018',
 'MCAM',
 'MSigDB',
 'Matrisome',
 'MatrixDB',
 'Membranome',
 'NetPath',
 'OPM',
 'PROGENy',
 'PanglaoDB',
 'Phobius',
 'Phosphatome',
 'Ramilowski2015',
 'Ramilowski_location',
 'SIGNOR',
 'SignaLink_function',
 'SignaLink_pathway',
 'Surfaceome',
 'TCDB',
 'TFcensus',
 'TopDB',
 'UniProt_family',
 'UniProt_keyword',
 'UniProt_location',
 'UniProt_tissue',
 'UniProt_topology',
 'Vesiclepedia',
 'Wang',
 'Zhong2015',
 'connectomeDB2020',
 'iTALK',
 'kinase.com',
 'scConnect',
 'scConnect_complex',
 'talklr']
In [ ]:
# Retrieving via python
msigdb = decoupler.get_resource("MSigDB")

# Get reactome pathways
reactome = msigdb.query("collection == 'reactome_pathways'")
# Filter duplicates
reactome = reactome[~reactome.duplicated(("geneset", "genesymbol"))]

Analysis¶

Running GSEA¶

In [ ]:
# Filtering genesets to match behaviour of fgsea
geneset_size = reactome.groupby("geneset").size()
gsea_genesets = geneset_size.index[(geneset_size > 15) & (geneset_size < 500)]
In [ ]:
# find DE genes by t-test
sc.tl.rank_genes_groups(adata, "cell_type", method="t-test", key_added="t-test")
In [ ]:
adata
Out[ ]:
AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'cell_type', 'group'
    var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'batch_colors', 'cell_type_colors', 'dendrogram_cell_type', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap', 't-test'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
    varm: 'PCs'
    layers: 'log_norm', 'norm', 'raw', 'scale_data'
    obsp: 'connectivities', 'distances'

Cluster-level gene set enrichment analysis¶

In [ ]:
celltype_conditions = adata.obs["cell_type"].unique()
for celltype_condition in celltype_conditions[:2]:
    tmp = sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test").head()
tmp
Out[ ]:
names scores logfoldchanges pvals pvals_adj
0 PTTG1 60.000668 NaN 3.530612e-278 5.948023e-274
1 CDKN3 56.458328 NaN 1.684712e-285 5.676468e-281
2 CKS1B 49.613487 NaN 2.882141e-239 2.427771e-235
3 BIRC5 46.635349 NaN 4.412753e-199 2.124047e-195
4 CCNB1 46.380829 NaN 3.782304e-178 1.416011e-174
In [ ]:
# extract scores
t_stats = (
    # Get dataframe of DE results for condition vs. rest
    sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test")
    # Subset to highly variable genes
    .set_index("names")
    .loc[adata.var["highly_variable"]]
    # Sort by absolute score
    .sort_values("scores", key=np.abs, ascending=False)
    # Format for decoupler
    [["scores"]]
    .rename_axis([celltype_condition], axis=1)
)
t_stats
Out[ ]:
Cartilage progenitor cell scores
names
BIRC5 46.635349
CENPF 46.066380
HSP90AA1 40.889629
SMC4 37.812042
DYNLL1 33.200161
... ...
TFDP2 -0.005575
ZBTB33 0.004323
RAB42 -0.004106
RP11-218C14.8 0.003311
CD46 0.000575

3000 rows × 1 columns

In [ ]:
scores, norm, pvals = decoupler.run_gsea(
    t_stats.T,
    reactome[reactome["geneset"].isin(gsea_genesets)],
    source="geneset",
    target="genesymbol",
)

gsea_results = (
    pd.concat({"score": scores.T, "norm": norm.T, "pval": pvals.T}, axis=1)
    .droplevel(level=1, axis=1)
    .sort_values("score", ascending=False)
)
In [ ]:
'''
pval: p-value of the enrichment test
score: enrichment score
norm: normalized enrichment score
'''
tmp = gsea_results[gsea_results.pval<=0.05].assign(
                **{"-log10(pval)": lambda x: -np.log10(x["pval"])}
            ).replace(np.inf, np.nan).dropna().head(5).sort_values("-log10(pval)", ascending=False)
tmp.reset_index(inplace=True)
tmp
Out[ ]:
source score norm pval -log10(pval)
0 REACTOME_MEIOSIS 0.876995 1.746298 0.001701 2.769377
1 REACTOME_POLO_LIKE_KINASE_MEDIATED_EVENTS 0.921375 1.732780 0.001715 2.765669
2 REACTOME_SEMA3A_PAK_DEPENDENT_AXON_REPULSION 0.911443 1.774311 0.001818 2.740363
3 REACTOME_RHOBTB_GTPASE_CYCLE 0.909893 1.765635 0.001876 2.726727
4 REACTOME_ATTENUATION_PHASE 0.912303 1.674935 0.003436 2.463893
In [ ]:
tmp.plot.barh(x="source", y="-log10(pval)", title="Top 5 pathways")
Out[ ]:
<Axes: title={'center': 'Top 5 pathways'}, ylabel='source'>
No description has been provided for this image

Cell-level pathway activity scoring using AUCell¶

In [ ]:
# Using AUCell to score the activity level of pathways and gene sets in each individual cell
# It is based on absolute gene expression in the cell, regardless of expression of genes in the other cells
decoupler.run_aucell(
    adata,
    reactome,
    source="geneset",
    target="genesymbol",
    use_raw=False,
)
In [ ]:
# Annotate the activity level of these pathways in each of the cells on the UMAP
ifn_pathways = [
    "REACTOME_INTERFERON_SIGNALING",
    "REACTOME_INTERFERON_ALPHA_BETA_SIGNALING",
    "REACTOME_INTERFERON_GAMMA_SIGNALING",
]

adata.obs[ifn_pathways] = adata.obsm["aucell_estimate"][ifn_pathways]
In [ ]:
# Plot the scores on the umap
sc.pl.umap(
    adata,
    color=["batch", "cell_type"] + ifn_pathways,
    frameon=False,
    ncols=2,
    wspace=0.3,
)
No description has been provided for this image

Pseudo-bulks¶

In [ ]:
def subsampled_summation(
    adata,
    groupby,
    *,
    n_samples_per_group,
    n_cells,
    random_state,
    layer,
):
    """
    Sum sample of X per condition.

    Drops conditions which don't have enough samples.

    Parameters
    ----------
    adata
        AnnData to sum expression of
    groupby
        Keys in obs to groupby
    n_samples_per_group
        Number of samples to take per group
    n_cells
        Number of cells to take per sample
    random_state
        Random state to use when sampling cells
    layer
        Which layer of adata to use

    Returns
    -------
    AnnData with same var as original, obs with columns from groupby, and X.
    """
    from scipy import sparse
    from sklearn.utils import check_random_state

    # Checks
    if isinstance(groupby, str):
        groupby = [groupby]
    random_state = check_random_state(random_state)

    indices = []
    labels = []

    grouped = adata.obs.groupby(groupby)
    for k, inds in grouped.indices.items():
        # Check size of group
        if len(inds) < (n_cells * n_samples_per_group):
            continue

        # Sample from group
        condition_inds = random_state.choice(
            inds, n_cells * n_samples_per_group, replace=False
        )
        for i, sample_condition_inds in enumerate(np.split(condition_inds, 3)):
            if isinstance(k, tuple):
                labels.append((*k, i))
            else:  # only grouping by one variable
                labels.append((k, i))
            indices.append(sample_condition_inds)

    # obs of output AnnData
    new_obs = pd.DataFrame.from_records(
        labels,
        columns=[*groupby, "sample"],
        index=["-".join(map(str, l)) for l in labels],
    )
    n_out = len(labels)

    # Make indicator matrix
    indptr = np.arange(0, (n_out + 1) * n_cells, n_cells)
    indicator = sparse.csr_matrix(
        (
            np.ones(n_out * n_cells, dtype=bool),
            np.concatenate(indices),
            indptr,
        ),
        shape=(len(labels), adata.n_obs),
    )

    return ad.AnnData(
        X=indicator @ sc.get._get_obs_rep(adata, layer=layer),
        obs=new_obs,
        var=adata.var.copy(),
    )
In [ ]:
pb_data = subsampled_summation(
    adata, ["cell_type", "batch"], n_cells=75, n_samples_per_group=3, layer="raw", random_state=0
)
pb_data
Out[ ]:
AnnData object with n_obs × n_vars = 36 × 33694
    obs: 'cell_type', 'batch', 'sample'
    var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
In [ ]:
pb_data.obs["lib_size"] = pb_data.X.sum(1)
pb_data.layers["counts"] = pb_data.X.copy()
In [ ]:
sc.pp.normalize_total(pb_data)
sc.pp.log1p(pb_data)
sc.pp.pca(pb_data)
In [ ]:
 
In [ ]:
sc.pl.pca(pb_data, color=["cell_type", "batch", "lib_size"], ncols=1, size=250)
No description has been provided for this image
In [ ]:
sc.tl.rank_genes_groups(pb_data, "cell_type", method="t-test", key_added="t-test")
In [ ]:
celltype_conditions = adata.obs["cell_type"].unique()
for celltype_condition in celltype_conditions[:2]:
    tmp = sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test").head()
tmp
Out[ ]:
names scores logfoldchanges pvals pvals_adj
0 PTTG1 60.000668 NaN 3.530612e-278 5.948023e-274
1 CDKN3 56.458328 NaN 1.684712e-285 5.676468e-281
2 CKS1B 49.613487 NaN 2.882141e-239 2.427771e-235
3 BIRC5 46.635349 NaN 4.412753e-199 2.124047e-195
4 CCNB1 46.380829 NaN 3.782304e-178 1.416011e-174
In [ ]:
# extract scores
t_stats = (
    # Get dataframe of DE results for condition vs. rest
    sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test")
    # Subset to highly variable genes
    .set_index("names")
    .loc[adata.var["highly_variable"]]
    # Sort by absolute score
    .sort_values("scores", key=np.abs, ascending=False)
    # Format for decoupler
    [["scores"]]
    .rename_axis([celltype_condition], axis=1)
)
t_stats
Out[ ]:
Cartilage progenitor cell scores
names
BIRC5 46.635349
CENPF 46.066380
HSP90AA1 40.889629
SMC4 37.812042
DYNLL1 33.200161
... ...
TFDP2 -0.005575
ZBTB33 0.004323
RAB42 -0.004106
RP11-218C14.8 0.003311
CD46 0.000575

3000 rows × 1 columns

In [ ]:
scores, norm, pvals = decoupler.run_gsea(
    t_stats.T,
    reactome[reactome["geneset"].isin(gsea_genesets)],
    source="geneset",
    target="genesymbol",
)

gsea_results = (
    pd.concat({"score": scores.T, "norm": norm.T, "pval": pvals.T}, axis=1)
    .droplevel(level=1, axis=1)
    .sort_values("score", ascending=False)
)
In [ ]:
gsea_results
Out[ ]:
score norm pval
source
REACTOME_HSF1_ACTIVATION 0.936227 1.695460 0.000000
REACTOME_POLO_LIKE_KINASE_MEDIATED_EVENTS 0.921375 1.732780 0.001715
REACTOME_ATTENUATION_PHASE 0.912303 1.674935 0.003436
REACTOME_SEMA3A_PAK_DEPENDENT_AXON_REPULSION 0.911443 1.774311 0.001818
REACTOME_RHOBTB_GTPASE_CYCLE 0.909893 1.765635 0.001876
... ... ... ...
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION -0.887341 -2.831174 0.000000
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY -0.889023 -2.865755 0.000000
REACTOME_SELENOAMINO_ACID_METABOLISM -0.891121 -2.910915 0.000000
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE -0.907605 -2.852917 0.000000
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION -0.915861 -2.911499 0.000000

699 rows × 3 columns

In [ ]:
'''
pval: p-value of the enrichment test
score: enrichment score
norm: normalized enrichment score
'''
tmp = gsea_results[gsea_results.pval<=0.05].assign(
                **{"-log10(pval)": lambda x: -np.log10(x["pval"])}
            ).replace(np.inf, np.nan).dropna().head(5).sort_values("-log10(pval)", ascending=True)
tmp.reset_index(inplace=True)
tmp
Out[ ]:
source score norm pval -log10(pval)
0 REACTOME_ATTENUATION_PHASE 0.912303 1.674935 0.003436 2.463893
1 REACTOME_RHOBTB_GTPASE_CYCLE 0.909893 1.765635 0.001876 2.726727
2 REACTOME_SEMA3A_PAK_DEPENDENT_AXON_REPULSION 0.911443 1.774311 0.001818 2.740363
3 REACTOME_POLO_LIKE_KINASE_MEDIATED_EVENTS 0.921375 1.732780 0.001715 2.765669
4 REACTOME_MEIOSIS 0.876995 1.746298 0.001701 2.769377
In [ ]:
tmp.plot.barh(x="source", y="-log10(pval)", title="Top 5 pathways")
Out[ ]:
<Axes: title={'center': 'Top 5 pathways'}, ylabel='source'>
No description has been provided for this image