# Source code for ncountr.core.normalize

"""Normalization methods for Nanostring nCounter data."""

from __future__ import annotations

from typing import Literal

import numpy as np
import pandas as pd
from scipy import stats as sp_stats

from ncountr.experiment import NanostringExperiment


def _geomean_scale(
    counts: pd.DataFrame, samples: list[str]
) -> dict[str, float]:
    """Return per-sample factors that align sample geometric means.

    For every sample ID in ``samples`` the geometric mean of the strictly
    positive entries of ``counts[sid]`` is taken.  The factor for a sample is
    the geometric mean of all finite per-sample geometric means divided by
    that sample's own geometric mean.

    Samples with no positive entries (or a non-finite geometric mean) get a
    neutral factor of ``1.0``; if no sample has a finite geometric mean,
    every factor is ``1.0``.
    """

    def _positive_geomean(sid):
        # Zeros and negatives are excluded before taking the geometric mean.
        column = counts[sid].values.astype(float)
        positive = column[column > 0]
        return sp_stats.gmean(positive) if positive.size else np.nan

    per_sample = {sid: _positive_geomean(sid) for sid in samples}

    finite = [g for g in per_sample.values() if np.isfinite(g)]
    if not finite:
        # Nothing usable in any sample: leave every sample unscaled.
        return dict.fromkeys(samples, 1.0)

    grand = sp_stats.gmean(finite)

    factors: dict[str, float] = {}
    for sid in samples:
        own = per_sample[sid]
        factors[sid] = grand / own if np.isfinite(own) else 1.0
    return factors


def _apply_scale(
    df: pd.DataFrame, scale: dict[str, float], samples: list[str]
) -> pd.DataFrame:
    """Return a copy of ``df`` with each listed sample column rescaled.

    Every column named in ``samples`` is multiplied by ``scale[sid]``; other
    columns are carried over unchanged.  ``df`` itself is not modified.
    """
    scaled = df.copy()
    # Compute the rescaled columns from the untouched input, then write them
    # into the copy.
    rescaled = {sid: df[sid].mul(scale[sid]) for sid in samples}
    for sid, column in rescaled.items():
        scaled[sid] = column
    return scaled



# [docs]
def normalize(
    experiment: NanostringExperiment,
    *,
    method: Literal["pos_only", "pos_hk", "pos_hk_bg"] = "pos_hk",
    neg_bg: pd.Series | dict[str, float] | None = None,
) -> pd.DataFrame:
    """Normalize raw counts and store the result on the experiment.

    The result is assigned to ``experiment.normalized`` and also returned.

    Parameters
    ----------
    experiment : NanostringExperiment
        Source of ``samples``, ``raw_counts``, ``pos_counts``, ``hk_counts``
        and (for background estimation) ``neg_counts``.
    method : str
        ``"pos_only"`` — positive control normalization only.
        ``"pos_hk"`` — positive control + housekeeping normalization.
        ``"pos_hk_bg"`` — positive control + housekeeping + background subtraction.
    neg_bg : pd.Series or dict, optional
        Per-sample background values, only used for ``"pos_hk_bg"``.
        If omitted, each sample's background is computed from
        ``experiment.neg_counts`` as ``mean + 2 * std`` (population standard
        deviation, NumPy's default ``ddof=0``).  Samples missing from
        ``neg_bg`` are given a background of ``0.0``.

    Returns
    -------
    pd.DataFrame
        Normalized count matrix (genes x samples).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Reject unknown methods up front: without this check a typo would skip
    # both optional steps and silently yield positive-control-only results.
    valid_methods = ("pos_only", "pos_hk", "pos_hk_bg")
    if method not in valid_methods:
        raise ValueError(
            f"Unknown normalization method {method!r}; "
            f"expected one of {valid_methods}."
        )

    samples = experiment.samples

    # Step 1: positive control normalization
    pos_scale = _geomean_scale(experiment.pos_counts, samples)
    normalized = _apply_scale(experiment.raw_counts, pos_scale, samples)

    # Step 2: housekeeping normalization
    if method in ("pos_hk", "pos_hk_bg"):
        # Housekeeping counts are positive-control scaled first, so the HK
        # factors are computed on the same scale as `normalized`.
        hk_pos_scaled = _apply_scale(experiment.hk_counts, pos_scale, samples)
        hk_scale = _geomean_scale(hk_pos_scaled, samples)
        normalized = _apply_scale(normalized, hk_scale, samples)

    # Step 3: background subtraction
    if method == "pos_hk_bg":
        if neg_bg is None:
            # Estimate the background from the negative controls.
            # NOTE(review): this estimate uses the unscaled negative-control
            # counts but is subtracted from the already scaled matrix below;
            # confirm that mixing the two scales is intended.
            bg: dict[str, float] = {}
            for sid in samples:
                vals = experiment.neg_counts[sid].values.astype(float)
                bg[sid] = vals.mean() + 2.0 * vals.std()
            neg_bg = bg

        if isinstance(neg_bg, pd.Series):
            neg_bg = neg_bg.to_dict()

        for sid in samples:
            threshold = neg_bg.get(sid, 0.0)
            # Values at or below the background floor are clamped to zero.
            normalized[sid] = (normalized[sid] - threshold).clip(lower=0)

    experiment.normalized = normalized
    return normalized




# [docs]
def get_scaling_factors(
    experiment: NanostringExperiment,
) -> dict[str, dict[str, float]]:
    """Return the positive-control and housekeeping scaling factors.

    Nothing is assigned on ``experiment``; its ``samples``, ``pos_counts``
    and ``hk_counts`` attributes are only read.  The housekeeping factors are
    computed on housekeeping counts that have first been scaled by the
    positive-control factors.

    Returns
    -------
    dict
        ``{"pos": {sid: factor}, "hk": {sid: factor}}``.
    """
    sample_ids = experiment.samples

    pos_factors = _geomean_scale(experiment.pos_counts, sample_ids)
    hk_factors = _geomean_scale(
        _apply_scale(experiment.hk_counts, pos_factors, sample_ids),
        sample_ids,
    )

    return {"pos": pos_factors, "hk": hk_factors}