import numpy as np
import pandas as pd

import scanpy as sc
import anndata as ad
import decoupler
from pathlib import Path

# Filtering warnings from current version of matplotlib
import warnings
warnings.filterwarnings('ignore')

# # Downloading reactome pathways
# from pathlib import Path

# if not Path("c2.cp.reactome.v7.5.1.symbols.gmt").is_file():
#     !wget -O 'c2.cp.reactome.v7.5.1.symbols.gmt' https://figshare.com/ndownloader/files/35233771

sc.settings.set_figure_params(dpi=200, frameon=False)
sc.set_figure_params(dpi=200)
sc.set_figure_params(figsize=(4, 4))

src_dir = '../../data/Preprocessed_data'

adata = sc.read(f'{src_dir}/harmony_cellmarkers.h5ad')
adata

AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'cell_type'
    var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'batch_colors', 'cell_type_colors', 'dendrogram_cell_type', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
    varm: 'PCs'
    layers: 'log_norm', 'norm', 'raw', 'scale_data'
    obsp: 'connectivities', 'distances'

sc.pl.umap(
    adata,
    color=["batch", "cell_type"],
    frameon=False,
    ncols=2,
)

adata.obs["group"] = adata.obs.batch.astype("string") + "_" + adata.obs.cell_type

def gmt_to_decoupler(pth: Path) -> pd.DataFrame:
    """
    Parse a gmt file to a decoupler pathway dataframe.
    """
    from itertools import chain, repeat

    pathways = {}

    with Path(pth).open("r") as f:
        for line in f:
            name, _, *genes = line.strip().split("\t")
            pathways[name] = genes

    return pd.DataFrame.from_records(
        chain.from_iterable(zip(repeat(k), v) for k, v in pathways.items()),
        columns=["geneset", "genesymbol"],
    )

reactome = gmt_to_decoupler('../../data/Pathway_DB/c2.cp.reactome.v7.5.1.symbols.gmt')

# There are lots of available database, we use a fixed one for reproducibility
decoupler.show_resources()

['Adhesome',
 'Almen2009',
 'Baccin2019',
 'CORUM_Funcat',
 'CORUM_GO',
 'CSPA',
 'CSPA_celltype',
 'CancerDrugsDB',
 'CancerGeneCensus',
 'CancerSEA',
 'CellCall',
 'CellCellInteractions',
 'CellChatDB',
 'CellChatDB_complex',
 'CellPhoneDB',
 'CellPhoneDB_complex',
 'CellTalkDB',
 'CellTypist',
 'Cellinker',
 'Cellinker_complex',
 'ComPPI',
 'CytoSig',
 'DGIdb',
 'DisGeNet',
 'EMBRACE',
 'Exocarta',
 'GO_Intercell',
 'GPCRdb',
 'Guide2Pharma',
 'HGNC',
 'HPA_secretome',
 'HPA_subcellular',
 'HPA_tissue',
 'HPMR',
 'HumanCellMap',
 'ICELLNET',
 'ICELLNET_complex',
 'IntOGen',
 'Integrins',
 'InterPro',
 'KEGG-PC',
 'Kirouac2010',
 'LOCATE',
 'LRdb',
 'Lambert2018',
 'MCAM',
 'MSigDB',
 'Matrisome',
 'MatrixDB',
 'Membranome',
 'NetPath',
 'OPM',
 'PROGENy',
 'PanglaoDB',
 'Phobius',
 'Phosphatome',
 'Ramilowski2015',
 'Ramilowski_location',
 'SIGNOR',
 'SignaLink_function',
 'SignaLink_pathway',
 'Surfaceome',
 'TCDB',
 'TFcensus',
 'TopDB',
 'UniProt_family',
 'UniProt_keyword',
 'UniProt_location',
 'UniProt_tissue',
 'UniProt_topology',
 'Vesiclepedia',
 'Wang',
 'Zhong2015',
 'connectomeDB2020',
 'iTALK',
 'kinase.com',
 'scConnect',
 'scConnect_complex',
 'talklr']

# Retrieving via python
msigdb = decoupler.get_resource("MSigDB")

# Get reactome pathways
reactome = msigdb.query("collection == 'reactome_pathways'")
# Filter duplicates
reactome = reactome[~reactome.duplicated(("geneset", "genesymbol"))]

# Filtering genesets to match behaviour of fgsea
geneset_size = reactome.groupby("geneset").size()
gsea_genesets = geneset_size.index[(geneset_size > 15) & (geneset_size < 500)]

# find DE genes by t-test
sc.tl.rank_genes_groups(adata, "cell_type", method="t-test", key_added="t-test")

adata

AnnData object with n_obs × n_vars = 7391 × 33694
    obs: 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'cell_type', 'group'
    var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'batch_colors', 'cell_type_colors', 'dendrogram_cell_type', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap', 't-test'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
    varm: 'PCs'
    layers: 'log_norm', 'norm', 'raw', 'scale_data'
    obsp: 'connectivities', 'distances'

celltype_conditions = adata.obs["cell_type"].unique()
for celltype_condition in celltype_conditions[:2]:
    tmp = sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test").head()
tmp

# extract scores
t_stats = (
    # Get dataframe of DE results for condition vs. rest
    sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test")
    # Subset to highly variable genes
    .set_index("names")
    .loc[adata.var["highly_variable"]]
    # Sort by absolute score
    .sort_values("scores", key=np.abs, ascending=False)
    # Format for decoupler
    [["scores"]]
    .rename_axis([celltype_condition], axis=1)
)
t_stats

scores, norm, pvals = decoupler.run_gsea(
    t_stats.T,
    reactome[reactome["geneset"].isin(gsea_genesets)],
    source="geneset",
    target="genesymbol",
)

gsea_results = (
    pd.concat({"score": scores.T, "norm": norm.T, "pval": pvals.T}, axis=1)
    .droplevel(level=1, axis=1)
    .sort_values("score", ascending=False)
)

'''
pval: p-value of the enrichment test
score: enrichment score
norm: normalized enrichment score
'''
tmp = gsea_results[gsea_results.pval<=0.05].assign(
                **{"-log10(pval)": lambda x: -np.log10(x["pval"])}
            ).replace(np.inf, np.nan).dropna().head(5).sort_values("-log10(pval)", ascending=False)
tmp.reset_index(inplace=True)
tmp

tmp.plot.barh(x="source", y="-log10(pval)", title="Top 5 pathways")

<Axes: title={'center': 'Top 5 pathways'}, ylabel='source'>

# Using AUCell to score the activity level of pathways and gene sets in each individual cell
# It is based on absolute gene expression in the cell, regardless of expression of genes in the other cells
decoupler.run_aucell(
    adata,
    reactome,
    source="geneset",
    target="genesymbol",
    use_raw=False,
)

# Annotate the activity level of these pathways in each of the cells on the UMAP
ifn_pathways = [
    "REACTOME_INTERFERON_SIGNALING",
    "REACTOME_INTERFERON_ALPHA_BETA_SIGNALING",
    "REACTOME_INTERFERON_GAMMA_SIGNALING",
]

adata.obs[ifn_pathways] = adata.obsm["aucell_estimate"][ifn_pathways]

# Plot the scores on the umap
sc.pl.umap(
    adata,
    color=["batch", "cell_type"] + ifn_pathways,
    frameon=False,
    ncols=2,
    wspace=0.3,
)

def subsampled_summation(
    adata,
    groupby,
    *,
    n_samples_per_group,
    n_cells,
    random_state,
    layer,
):
    """
    Sum sample of X per condition.

    Drops conditions which don't have enough samples.

    Parameters
    ----------
    adata
        AnnData to sum expression of
    groupby
        Keys in obs to groupby
    n_samples_per_group
        Number of samples to take per group
    n_cells
        Number of cells to take per sample
    random_state
        Random state to use when sampling cells
    layer
        Which layer of adata to use

    Returns
    -------
    AnnData with same var as original, obs with columns from groupby, and X.
    """
    from scipy import sparse
    from sklearn.utils import check_random_state

    # Checks
    if isinstance(groupby, str):
        groupby = [groupby]
    random_state = check_random_state(random_state)

    indices = []
    labels = []

    grouped = adata.obs.groupby(groupby)
    for k, inds in grouped.indices.items():
        # Check size of group
        if len(inds) < (n_cells * n_samples_per_group):
            continue

        # Sample from group
        condition_inds = random_state.choice(
            inds, n_cells * n_samples_per_group, replace=False
        )
        for i, sample_condition_inds in enumerate(np.split(condition_inds, 3)):
            if isinstance(k, tuple):
                labels.append((*k, i))
            else:  # only grouping by one variable
                labels.append((k, i))
            indices.append(sample_condition_inds)

    # obs of output AnnData
    new_obs = pd.DataFrame.from_records(
        labels,
        columns=[*groupby, "sample"],
        index=["-".join(map(str, l)) for l in labels],
    )
    n_out = len(labels)

    # Make indicator matrix
    indptr = np.arange(0, (n_out + 1) * n_cells, n_cells)
    indicator = sparse.csr_matrix(
        (
            np.ones(n_out * n_cells, dtype=bool),
            np.concatenate(indices),
            indptr,
        ),
        shape=(len(labels), adata.n_obs),
    )

    return ad.AnnData(
        X=indicator @ sc.get._get_obs_rep(adata, layer=layer),
        obs=new_obs,
        var=adata.var.copy(),
    )

pb_data = subsampled_summation(
    adata, ["cell_type", "batch"], n_cells=75, n_samples_per_group=3, layer="raw", random_state=0
)
pb_data

AnnData object with n_obs × n_vars = 36 × 33694
    obs: 'cell_type', 'batch', 'sample'
    var: 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'

pb_data.obs["lib_size"] = pb_data.X.sum(1)
pb_data.layers["counts"] = pb_data.X.copy()

sc.pp.normalize_total(pb_data)
sc.pp.log1p(pb_data)
sc.pp.pca(pb_data)

sc.pl.pca(pb_data, color=["cell_type", "batch", "lib_size"], ncols=1, size=250)

sc.tl.rank_genes_groups(pb_data, "cell_type", method="t-test", key_added="t-test")

celltype_conditions = adata.obs["cell_type"].unique()
for celltype_condition in celltype_conditions[:2]:
    tmp = sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test").head()
tmp

# extract scores
t_stats = (
    # Get dataframe of DE results for condition vs. rest
    sc.get.rank_genes_groups_df(adata, celltype_condition, key="t-test")
    # Subset to highly variable genes
    .set_index("names")
    .loc[adata.var["highly_variable"]]
    # Sort by absolute score
    .sort_values("scores", key=np.abs, ascending=False)
    # Format for decoupler
    [["scores"]]
    .rename_axis([celltype_condition], axis=1)
)
t_stats

scores, norm, pvals = decoupler.run_gsea(
    t_stats.T,
    reactome[reactome["geneset"].isin(gsea_genesets)],
    source="geneset",
    target="genesymbol",
)

gsea_results = (
    pd.concat({"score": scores.T, "norm": norm.T, "pval": pvals.T}, axis=1)
    .droplevel(level=1, axis=1)
    .sort_values("score", ascending=False)
)

gsea_results

'''
pval: p-value of the enrichment test
score: enrichment score
norm: normalized enrichment score
'''
tmp = gsea_results[gsea_results.pval<=0.05].assign(
                **{"-log10(pval)": lambda x: -np.log10(x["pval"])}
            ).replace(np.inf, np.nan).dropna().head(5).sort_values("-log10(pval)", ascending=True)
tmp.reset_index(inplace=True)
tmp

tmp.plot.barh(x="source", y="-log10(pval)", title="Top 5 pathways")

<Axes: title={'center': 'Top 5 pathways'}, ylabel='source'>

	names	scores	logfoldchanges	pvals	pvals_adj
0	PTTG1	60.000668	NaN	3.530612e-278	5.948023e-274
1	CDKN3	56.458328	NaN	1.684712e-285	5.676468e-281
2	CKS1B	49.613487	NaN	2.882141e-239	2.427771e-235
3	BIRC5	46.635349	NaN	4.412753e-199	2.124047e-195
4	CCNB1	46.380829	NaN	3.782304e-178	1.416011e-174

Cartilage progenitor cell	scores
names
BIRC5	46.635349
CENPF	46.066380
HSP90AA1	40.889629
SMC4	37.812042
DYNLL1	33.200161
...	...
TFDP2	-0.005575
ZBTB33	0.004323
RAB42	-0.004106
RP11-218C14.8	0.003311
CD46	0.000575

	source	score	norm	pval	-log10(pval)
0	REACTOME_MEIOSIS	0.876995	1.746298	0.001701	2.769377
1	REACTOME_POLO_LIKE_KINASE_MEDIATED_EVENTS	0.921375	1.732780	0.001715	2.765669
2	REACTOME_SEMA3A_PAK_DEPENDENT_AXON_REPULSION	0.911443	1.774311	0.001818	2.740363
3	REACTOME_RHOBTB_GTPASE_CYCLE	0.909893	1.765635	0.001876	2.726727
4	REACTOME_ATTENUATION_PHASE	0.912303	1.674935	0.003436	2.463893

	names	scores	logfoldchanges	pvals	pvals_adj
0	PTTG1	60.000668	NaN	3.530612e-278	5.948023e-274
1	CDKN3	56.458328	NaN	1.684712e-285	5.676468e-281
2	CKS1B	49.613487	NaN	2.882141e-239	2.427771e-235
3	BIRC5	46.635349	NaN	4.412753e-199	2.124047e-195
4	CCNB1	46.380829	NaN	3.782304e-178	1.416011e-174

Cartilage progenitor cell	scores
names
BIRC5	46.635349
CENPF	46.066380
HSP90AA1	40.889629
SMC4	37.812042
DYNLL1	33.200161
...	...
TFDP2	-0.005575
ZBTB33	0.004323
RAB42	-0.004106
RP11-218C14.8	0.003311
CD46	0.000575

Set up¶

Load data¶

Analysis¶

Running GSEA¶

Cluster-level gene set enrichment analysis¶

Cell-level pathway activity scoring using AUCell¶

Pseudo-bulks¶

	score	norm	pval
source
REACTOME_HSF1_ACTIVATION	0.936227	1.695460	0.000000
REACTOME_POLO_LIKE_KINASE_MEDIATED_EVENTS	0.921375	1.732780	0.001715
REACTOME_ATTENUATION_PHASE	0.912303	1.674935	0.003436
REACTOME_SEMA3A_PAK_DEPENDENT_AXON_REPULSION	0.911443	1.774311	0.001818
REACTOME_RHOBTB_GTPASE_CYCLE	0.909893	1.765635	0.001876
...	...	...	...
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION	-0.887341	-2.831174	0.000000
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY	-0.889023	-2.865755	0.000000
REACTOME_SELENOAMINO_ACID_METABOLISM	-0.891121	-2.910915	0.000000
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE	-0.907605	-2.852917	0.000000
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION	-0.915861	-2.911499	0.000000