Source code for ncountr.core.normalize

"""Normalization methods for Nanostring nCounter data."""

from __future__ import annotations

from typing import Literal

import numpy as np
import pandas as pd
from scipy import stats as sp_stats

from ncountr.experiment import NanostringExperiment


def _geomean_scale(
    counts: pd.DataFrame, samples: list[str]
) -> dict[str, float]:
    """Compute geometric-mean scaling factors across samples.

    Each sample's geometric mean is computed, and the scaling factor brings
    each sample to the grand geometric mean.
    """
    geomeans: dict[str, float] = {}
    for sid in samples:
        vals = counts[sid].values.astype(float)
        vals = vals[vals > 0]
        if len(vals) == 0:
            geomeans[sid] = np.nan
        else:
            geomeans[sid] = sp_stats.gmean(vals)

    valid = [v for v in geomeans.values() if np.isfinite(v)]
    if not valid:
        return {sid: 1.0 for sid in samples}

    grand = sp_stats.gmean(valid)
    return {
        sid: grand / geomeans[sid] if np.isfinite(geomeans[sid]) else 1.0
        for sid in samples
    }


def _apply_scale(
    df: pd.DataFrame, scale: dict[str, float], samples: list[str]
) -> pd.DataFrame:
    """Multiply each sample column by its scaling factor."""
    result = df.copy()
    for sid in samples:
        result[sid] = df[sid] * scale[sid]
    return result


[docs] def normalize( experiment: NanostringExperiment, *, method: Literal["pos_only", "pos_hk", "pos_hk_bg"] = "pos_hk", neg_bg: pd.Series | dict[str, float] | None = None, ) -> pd.DataFrame: """Normalize raw counts and store the result on the experiment. Parameters ---------- experiment : NanostringExperiment method : str ``"pos_only"`` — positive control normalization only. ``"pos_hk"`` — positive control + housekeeping normalization. ``"pos_hk_bg"`` — positive control + housekeeping + background subtraction. neg_bg : pd.Series or dict, optional Per-sample negative background values. Required for ``"pos_hk_bg"``. If not provided, computed from ``experiment.neg_counts``. Returns ------- pd.DataFrame Normalized count matrix (genes x samples). """ samples = experiment.samples # Step 1: positive control normalization pos_scale = _geomean_scale(experiment.pos_counts, samples) normalized = _apply_scale(experiment.raw_counts, pos_scale, samples) if method in ("pos_hk", "pos_hk_bg"): # Also apply pos scaling to housekeeping before computing HK factors hk_pos_scaled = _apply_scale(experiment.hk_counts, pos_scale, samples) hk_scale = _geomean_scale(hk_pos_scaled, samples) normalized = _apply_scale(normalized, hk_scale, samples) if method == "pos_hk_bg": if neg_bg is None: # Compute from negative controls bg: dict[str, float] = {} for sid in samples: vals = experiment.neg_counts[sid].values.astype(float) bg[sid] = vals.mean() + 2.0 * vals.std() neg_bg = bg if isinstance(neg_bg, pd.Series): neg_bg = neg_bg.to_dict() for sid in samples: threshold = neg_bg.get(sid, 0.0) normalized[sid] = (normalized[sid] - threshold).clip(lower=0) experiment.normalized = normalized return normalized
[docs] def get_scaling_factors( experiment: NanostringExperiment, ) -> dict[str, dict[str, float]]: """Compute and return scaling factors without modifying the experiment. Returns ------- dict ``{"pos": {sid: factor}, "hk": {sid: factor}}``. """ samples = experiment.samples pos_scale = _geomean_scale(experiment.pos_counts, samples) hk_pos_scaled = _apply_scale(experiment.hk_counts, pos_scale, samples) hk_scale = _geomean_scale(hk_pos_scaled, samples) return {"pos": pos_scale, "hk": hk_scale}