# Source code for ncountr.io.rcc

"""Parse Nanostring RCC files into a NanostringExperiment."""

from __future__ import annotations

import os
import re
import warnings
from pathlib import Path
from typing import Union

import pandas as pd

from ncountr.experiment import NanostringExperiment


def parse_rcc(filepath: Union[str, Path]) -> dict:
    """Parse a single Nanostring RCC file.

    The file is read line by line. ``<Tag>`` opens a section, ``</Tag>``
    closes it, and only the ``Sample_Attributes``, ``Lane_Attributes`` and
    ``Code_Summary`` sections are interpreted; lines in any other section
    are ignored.

    Parameters
    ----------
    filepath : str or Path
        Path to a ``.RCC`` file.

    Returns
    -------
    dict
        Keys: ``sample`` (sample attributes), ``lane`` (lane attributes),
        ``counts`` (dict of ``(CodeClass, GeneName) -> count``). Attribute
        values are kept as strings; counts are converted to ``int``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the count column of a ``Code_Summary`` row is not an integer.
    """
    result: dict = {"sample": {}, "lane": {}, "counts": {}}
    current_section: str | None = None

    with open(filepath) as f:
        for line in f:
            line = line.strip()

            # Blank lines are padding, not section terminators: only an
            # explicit closing tag ends a section. Resetting the section
            # here would silently drop every remaining row of that section.
            if not line:
                continue

            if line.startswith("<") and not line.startswith("</"):
                current_section = line.strip("<>").lower()
                continue
            if line.startswith("</"):
                current_section = None
                continue

            if current_section == "sample_attributes":
                # Split on the first comma only so values may contain commas.
                parts = line.split(",", 1)
                if len(parts) == 2:
                    result["sample"][parts[0]] = parts[1]
            elif current_section == "lane_attributes":
                parts = line.split(",", 1)
                if len(parts) == 2:
                    result["lane"][parts[0]] = parts[1]
            elif current_section == "code_summary":
                parts = line.split(",")
                # Skip the "CodeClass,Name,Accession,Count" header row and
                # any row too short to carry a count.
                if len(parts) >= 4 and parts[0] != "CodeClass":
                    code_class, name, count = parts[0], parts[1], int(parts[3])
                    result["counts"][(code_class, name)] = count

    return result
def _extract_sample_id(id_regex: re.Pattern[str], raw_id: str) -> str | None:
    """Return the sample ID that *id_regex* finds in *raw_id*, or ``None``."""
    match = id_regex.search(raw_id)
    if match is None:
        return None
    # The first capture group is the documented contract; a pattern without
    # any group falls back to the whole match instead of raising IndexError.
    # An optional group that did not participate yields None (treated by the
    # caller as "no ID").
    return match.group(1) if id_regex.groups else match.group(0)


def read_rcc(
    rcc_dirs: Union[str, Path, list[Union[str, Path]]],
    *,
    file_pattern: str = "*.RCC",
    sample_id_pattern: str = r"(\d+)",
    sample_id_field: str = "ID",
    sample_id_from: str = "field",
    sample_meta: dict[str, dict] | None = None,
) -> NanostringExperiment:
    """Read RCC files from one or more directories into a NanostringExperiment.

    Parameters
    ----------
    rcc_dirs : str, Path, or list thereof
        Directory or directories containing ``.RCC`` files.
    file_pattern : str
        Glob pattern to match RCC files within each directory.
    sample_id_pattern : str
        Regex applied to extract a clean sample ID. The first capture
        group is used; if the pattern has no capture group, the whole
        match is used.
    sample_id_field : str
        Which field in the ``<Sample_Attributes>`` section holds the
        sample ID. Only used when ``sample_id_from="field"``.
    sample_id_from : str
        Where to extract the sample ID from. ``"field"`` (default) uses
        the ``sample_id_field`` from the RCC file. ``"filename"`` applies
        the regex to the filename instead — useful when internal IDs are
        inconsistent across files. Any value other than ``"filename"`` is
        treated like ``"field"``.
    sample_meta : dict[str, dict] | None
        Optional per-sample metadata, keyed by sample ID.

    Returns
    -------
    NanostringExperiment

    Raises
    ------
    FileNotFoundError
        If no file matches ``file_pattern`` in any of ``rcc_dirs``.
    ValueError
        If no file yields a sample ID.

    Warns
    -----
    UserWarning
        When a file is skipped because no sample ID could be extracted,
        or when two files resolve to the same sample ID (the later file
        replaces the earlier one).
    """
    if isinstance(rcc_dirs, (str, Path)):
        rcc_dirs = [rcc_dirs]

    # Collect RCC file paths, sorted within each directory.
    rcc_files: list[Path] = []
    for d in rcc_dirs:
        rcc_files.extend(sorted(Path(d).glob(file_pattern)))

    if not rcc_files:
        raise FileNotFoundError(
            f"No files matching '{file_pattern}' found in {rcc_dirs}"
        )

    id_regex = re.compile(sample_id_pattern)  # compiled once, reused per file

    all_data: dict[str, dict] = {}
    lane_rows: dict[str, dict] = {}
    source_of: dict[str, Path] = {}  # sample ID -> file it was read from

    for fp in rcc_files:
        parsed = parse_rcc(fp)

        if sample_id_from == "filename":
            raw_id = fp.stem  # filename without extension
        else:
            raw_id = parsed["sample"].get(sample_id_field, "")

        sid = _extract_sample_id(id_regex, raw_id)
        if sid is None:
            # Skipping is best-effort behaviour, but it must be visible.
            warnings.warn(
                f"Skipping {fp}: pattern {sample_id_pattern!r} did not yield "
                f"a sample ID from {raw_id!r}",
                stacklevel=2,
            )
            continue

        if sid in all_data:
            # Last file wins; surface the collision so data loss is noticed.
            warnings.warn(
                f"Duplicate sample ID {sid!r}: {fp} replaces {source_of[sid]}",
                stacklevel=2,
            )
        source_of[sid] = fp

        lane = parsed["lane"]
        all_data[sid] = parsed["counts"]
        lane_rows[sid] = {
            "FovCount": int(lane.get("FovCount", 0)),
            "FovCounted": int(lane.get("FovCounted", 0)),
            "BindingDensity": float(lane.get("BindingDensity", 0)),
            "CartridgeID": lane.get("CartridgeID", ""),
        }

    if not all_data:
        raise ValueError("No samples could be parsed from the RCC files")

    # Collect all (code class, gene) pairs seen in any sample.
    all_keys: set[tuple[str, str]] = set()
    for counts in all_data.values():
        all_keys.update(counts)

    def _names(code_class: str) -> list[str]:
        return sorted({name for cls, name in all_keys if cls == code_class})

    # Purely numeric IDs sort numerically and come before all other IDs.
    samples = sorted(all_data, key=lambda x: (0, int(x)) if x.isdigit() else (1, x))

    def _build_df(code_class: str) -> pd.DataFrame:
        # Genes as rows, samples as columns; probes absent from a sample get 0.
        genes = _names(code_class)
        return pd.DataFrame(
            {
                sid: {g: all_data[sid].get((code_class, g), 0) for g in genes}
                for sid in samples
            }
        )

    raw_counts = _build_df("Endogenous")
    pos_counts = _build_df("Positive")
    neg_counts = _build_df("Negative")
    hk_counts = _build_df("Housekeeping")

    # One row per sample, ordered like the count-table columns (rather than
    # in file-discovery order) so the tables line up positionally too.
    lane_df = pd.DataFrame.from_dict(lane_rows, orient="index").reindex(samples)
    lane_df.index.name = "sample"
    lane_df = lane_df.astype(
        {"FovCount": int, "FovCounted": int, "BindingDensity": float}
    )

    # Build sample_meta DataFrame
    if sample_meta:
        # NOTE(review): rows follow sample_meta's own keys and order, not
        # `samples` — confirm NanostringExperiment aligns them by label.
        meta_df = pd.DataFrame.from_dict(sample_meta, orient="index")
        meta_df.index.name = "sample"
    else:
        meta_df = pd.DataFrame(index=pd.Index(samples, name="sample"))

    return NanostringExperiment(
        raw_counts=raw_counts,
        pos_counts=pos_counts,
        neg_counts=neg_counts,
        hk_counts=hk_counts,
        sample_meta=meta_df,
        lane_info=lane_df,
    )