Source code for ncountr.crossplatform.correlation

"""Cross-platform correlation analysis."""

from __future__ import annotations

import numpy as np
import pandas as pd
from scipy import stats


def per_sample_correlation(
    nanostring: pd.DataFrame,
    external: pd.DataFrame,
    *,
    shared_genes: list[str] | None = None,
    method: str = "spearman",
) -> pd.DataFrame:
    """Compute per-sample correlation between two expression matrices.

    For every sample (column) present in both matrices, the expression
    values of the selected genes are correlated across platforms.

    Parameters
    ----------
    nanostring : pd.DataFrame
        Nanostring expression (genes x samples).
    external : pd.DataFrame
        External expression (genes x samples).
    shared_genes : list[str], optional
        Genes to include. If None, uses the sorted intersection of the two
        indices. When given, every gene is expected to be present in both
        indices.
    method : str
        ``"spearman"`` or ``"pearson"`` (case-insensitive).

    Returns
    -------
    pd.DataFrame
        One row per sample with columns: sample, r, pvalue. Samples are
        visited in sorted order; samples with fewer than three finite
        gene pairs are omitted. The three columns are present even when
        no sample qualifies.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"spearman"`` nor ``"pearson"``.
    """
    corr_funcs = {"spearman": stats.spearmanr, "pearson": stats.pearsonr}
    method_key = str(method).lower()
    if method_key not in corr_funcs:
        # Fail loudly instead of silently falling back to Pearson on a typo.
        raise ValueError(
            f"Unknown correlation method {method!r}; "
            "expected 'spearman' or 'pearson'."
        )
    corr_func = corr_funcs[method_key]

    if shared_genes is None:
        shared_genes = sorted(set(nanostring.index) & set(external.index))
    shared_samples = sorted(set(nanostring.columns) & set(external.columns))

    rows = []
    for sid in shared_samples:
        n_vals = nanostring.loc[shared_genes, sid].values.astype(float)
        e_vals = external.loc[shared_genes, sid].values.astype(float)

        # Keep only gene pairs where both platforms have a finite value.
        mask = np.isfinite(n_vals) & np.isfinite(e_vals)
        if mask.sum() < 3:
            continue

        r, p = corr_func(n_vals[mask], e_vals[mask])
        rows.append({"sample": sid, "r": r, "pvalue": p})

    # Explicit columns keep the schema stable when ``rows`` is empty.
    return pd.DataFrame(rows, columns=["sample", "r", "pvalue"])
def per_gene_correlation(
    nanostring: pd.DataFrame,
    external: pd.DataFrame,
    *,
    shared_samples: list[str] | None = None,
    method: str = "spearman",
    min_samples: int = 4,
) -> pd.DataFrame:
    """Compute per-gene correlation across shared samples.

    For every gene (row) present in both matrices, the expression values
    of the selected samples are correlated across platforms.

    Parameters
    ----------
    nanostring, external : pd.DataFrame
        Expression matrices (genes x samples).
    shared_samples : list[str], optional
        Samples to include. If None, uses the sorted intersection of the
        two column sets. When given, every sample is expected to be present
        in both matrices.
    method : str
        ``"spearman"`` or ``"pearson"`` (case-insensitive).
    min_samples : int
        Minimum number of samples with finite values on both platforms
        required for a gene to be reported.

    Returns
    -------
    pd.DataFrame
        One row per gene with columns: gene, r, pvalue. Genes are visited
        in sorted order. Genes with fewer than ``min_samples`` finite pairs,
        or with zero variance on either platform, are omitted. The three
        columns are present even when no gene qualifies.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"spearman"`` nor ``"pearson"``.
    """
    corr_funcs = {"spearman": stats.spearmanr, "pearson": stats.pearsonr}
    method_key = str(method).lower()
    if method_key not in corr_funcs:
        # Fail loudly instead of silently falling back to Pearson on a typo.
        raise ValueError(
            f"Unknown correlation method {method!r}; "
            "expected 'spearman' or 'pearson'."
        )
    corr_func = corr_funcs[method_key]

    if shared_samples is None:
        shared_samples = sorted(set(nanostring.columns) & set(external.columns))
    shared_genes = sorted(set(nanostring.index) & set(external.index))

    rows = []
    for gene in shared_genes:
        n_vals = nanostring.loc[gene, shared_samples].values.astype(float)
        e_vals = external.loc[gene, shared_samples].values.astype(float)

        # Keep only samples where both platforms have a finite value.
        mask = np.isfinite(n_vals) & np.isfinite(e_vals)
        if mask.sum() < min_samples:
            continue

        n_ok = n_vals[mask]
        e_ok = e_vals[mask]
        # Correlation is undefined for a constant vector; skip such genes.
        if np.std(n_ok) == 0 or np.std(e_ok) == 0:
            continue

        r, p = corr_func(n_ok, e_ok)
        rows.append({"gene": gene, "r": r, "pvalue": p})

    # Explicit columns keep the schema stable when ``rows`` is empty.
    return pd.DataFrame(rows, columns=["gene", "r", "pvalue"])