# Source code for ncountr.core.pathway

"""Gene set / pathway scoring."""

from __future__ import annotations

from typing import Union

import numpy as np
import pandas as pd

from ncountr.experiment import NanostringExperiment
from ncountr.datasets import get_gene_set



# [docs]
def score_gene_set(
    experiment: NanostringExperiment,
    *,
    gene_set: Union[str, list[str]],
    counts: pd.DataFrame | None = None,
    samples: list[str] | None = None,
    method: str = "zscore_mean",
) -> pd.Series:
    """Score samples for a gene set.

    Counts are transformed with ``log2(x + 1)`` before scoring.

    Parameters
    ----------
    experiment : NanostringExperiment
        Only consulted for defaults: ``experiment.normalized`` (falling back
        to ``experiment.raw_counts`` when it is ``None``) if ``counts`` is
        not given, and ``experiment.samples`` if ``samples`` is not given.
    gene_set : str or list[str]
        A built-in gene set name (e.g. ``"IFN_JAKSTAT"``), resolved through
        ``get_gene_set``, or an explicit list of gene names.  Names missing
        from ``counts.index`` are ignored; repeated names are counted once.
    counts : pd.DataFrame, optional
        Count matrix (genes x samples).  Defaults to normalized or raw counts.
    samples : list[str], optional
        Subset of samples (columns of ``counts``) to score.  Defaults to all
        samples of the experiment.
    method : str
        Scoring method.  Currently ``"zscore_mean"`` (z-score each gene
        across samples, then take the mean z-score per sample).

    Returns
    -------
    pd.Series
        Score per sample, indexed by sample and named ``"pathway_score"``.
        Genes with zero variance across the selected samples do not
        contribute to the score.  Scores are NaN when no gene has a usable
        standard deviation (e.g. when only one sample is scored).

    Raises
    ------
    ValueError
        If ``method`` is not supported, or if none of the genes in the gene
        set are present in ``counts``.
    """
    # Fail fast on an unsupported method, before any data is touched, so the
    # caller is not shown an unrelated data error first.
    if method != "zscore_mean":
        raise ValueError(f"Unknown scoring method: {method!r}")

    genes = get_gene_set(gene_set) if isinstance(gene_set, str) else list(gene_set)

    if counts is None:
        counts = (
            experiment.normalized
            if experiment.normalized is not None
            else experiment.raw_counts
        )

    if samples is None:
        samples = experiment.samples

    # Keep only genes present in the data.  ``dict.fromkeys`` drops repeated
    # names while preserving order; without it a gene listed twice would be
    # selected twice below and weigh double in the per-sample mean.
    available = [g for g in dict.fromkeys(genes) if g in counts.index]
    if not available:
        raise ValueError("No genes from the gene set are present in the data")

    mat = np.log2(counts.loc[available, samples].astype(float) + 1)

    row_mean = mat.mean(axis=1)
    # A zero standard deviation would cause a division by zero; turning it
    # into NaN makes that gene's z-scores NaN, which the mean below skips.
    row_std = mat.std(axis=1).replace(0, np.nan)
    z = mat.subtract(row_mean, axis=0).div(row_std, axis=0)
    scores = z.mean(axis=0)

    scores.name = "pathway_score"
    return scores