Roestlab · jcharkow · Feb 26, 2025 · Feb 20, 2025 · Feb 20, 2025 · Feb 20, 2025
diff --git a/docs/API.rst b/docs/API.rst
@@ -91,6 +91,7 @@ Classes
    ResultsLoader
    MzMLDataLoader
    SqMassLoader
+   XICParquetDataLoader
    SpectralLibraryLoader
 
 :mod:`massdash.loaders.access`: Classes For Low Level Data Access 
@@ -127,12 +128,13 @@ Classes
    ResultsTSVDataAccess
    TransitionPQPDataAccess
    TransitionTSVDataAccess
+   XICParquetDataAccess
 
 
 Abstract Classes
 ----------------
 
-.. automsummary::
+.. autosummary::
    :nosignatures:
    :toctree: generated/
    :template: class.rst

diff --git a/docs/conf.py b/docs/conf.py
@@ -41,7 +41,7 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build']
 
 autosummary_generate = True
 
@@ -67,4 +67,4 @@ def setup(app):
 
 
-# --- Always execute notebooks -------------
+# --- Never execute notebooks -------------
-nbsphinx_execute = 'always'
+nbsphinx_execute = 'never'
diff --git a/massdash/loaders/XICParquetDataLoader.py b/massdash/loaders/XICParquetDataLoader.py
@@ -0,0 +1,83 @@
+"""
+massdash/loaders/XICParquetDataLoader
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+
+from typing import List, Union
+from pandas.core.api import DataFrame as DataFrame
+import pandas as pd
+
+# Loaders
+from .GenericChromatogramLoader import GenericChromatogramLoader
+from .ResultsLoader import ResultsLoader
+from .access import XICParquetDataAccess
+# Structs
+from ..structs import TransitionGroupCollection
+# Utils
+from massdash.util import LOGGER
+
+class XICParquetDataLoader(GenericChromatogramLoader):
+
+    '''
+    Class for loading chromatograms from XIC Parquet files.
+    One XICParquetDataAccess is opened per entry in self.dataFiles.
+    Inherits from GenericChromatogramLoader
+    '''
+
+    def __init__(self, **kwargs):
+        '''
+        Args:
+            **kwargs: Forwarded unchanged to GenericChromatogramLoader.__init__.
+                NOTE(review): self.dataFiles is read right after the parent constructor runs,
+                so the parent is presumed to set it - confirm against GenericChromatogramLoader.
+        '''
+        super().__init__(**kwargs)
+        self.dataAccess = [XICParquetDataAccess(f) for f in self.dataFiles]
+
+    @ResultsLoader.cache_results
+    def loadTransitionGroupsDf(self, pep_id: str, charge: int) -> pd.DataFrame:
+        '''
+        Loads the chromatogram data for a given peptide ID and charge across all files as one dataframe
+        Args:
+            pep_id (str): Peptide ID
+            charge (int): Charge
+        Returns:
+            pd.DataFrame: Dataframe with columns 'run', 'rt', 'intensity', 'annotation'. Empty (with the same columns) if no file contains the precursor.
+        '''
+        # The access layer looks precursors up by peptide ID with the charge appended
+        precursor = pep_id + str(charge)
+
+        # Same column names as the populated result below, so callers see one schema
+        columns = ['run', 'rt', 'intensity', 'annotation']
+        out = {}
+        for t in self.dataAccess:
+
+            ### Get the chromatograms of this run
+            chroms = t.getTransitionDataDf(precursor)
+
+            # only add if there is data
+            if not chroms.empty:
+                out[t.runName] = chroms
+            else:
+                print(f"Warning: no data found for peptide in transition file {t.filename}")
+
+        if not out:
+            return pd.DataFrame(columns=columns)
+
+        # concat on a dict puts the run name in index level 0 and the per-file row number in level 1;
+        # the run name becomes the 'run' column and the row number is discarded
+        return pd.concat(out).reset_index().drop('level_1', axis=1).rename(columns=dict(level_0='run'))
+
+    @ResultsLoader.cache_results
+    def loadTransitionGroups(self, pep_id: str, charge: int, runNames: Union[None, str, List[str]] =None) -> TransitionGroupCollection:
+        '''
+        Loads the transition group for a given peptide ID and charge across all files
+        Args:
+            pep_id (str): Peptide ID
+            charge (int): Charge
+            runNames (None | str | List[str]): Name of the run to extract the transition group from. If None, all runs are extracted. If str, only the specified run is extracted. If List[str], only the specified runs are extracted (names matching no file are skipped).
+        Returns:
+            TransitionGroupCollection: Collection keyed by run name. A value is None when the access object does not find the precursor in that file.
+        Raises:
+            ValueError: If runNames is not None, a string or a list, or if a single run name is not in self.runNames
+        '''
+
+        out = TransitionGroupCollection()
+
+        if runNames is None:
+            for t in self.dataAccess:
+                out[t.runName] = t.getTransitionData(pep_id, charge)
+        elif isinstance(runNames, str):
+            # NOTE(review): assumes self.runNames is ordered like self.dataAccess - confirm in the parent class
+            t = self.dataAccess[self.runNames.index(runNames)]
+            out[runNames] = t.getTransitionData(pep_id, charge)
+        elif isinstance(runNames, list):
+            # Single pass to index the access objects instead of rescanning them for every requested run
+            accessByRun = {t.runName: t for t in self.dataAccess}
+            for r in runNames:
+                t = accessByRun.get(r)
+                if t is not None:
+                    out[r] = t.getTransitionData(pep_id, charge)
+        else:
+            raise ValueError("runName must be none, a string or list of strings")
+
+        return out
+
+
diff --git a/massdash/loaders/__init__.py b/massdash/loaders/__init__.py
@@ -11,11 +11,13 @@
 from .ResultsLoader import ResultsLoader
 from .SpectralLibraryLoader import SpectralLibraryLoader
 from .SqMassLoader import SqMassLoader
+from .XICParquetDataLoader import XICParquetDataLoader
 
 __all__ = [ 
             "GenericChromatogramLoader",
             "GenericSpectrumLoader",
             "MzMLDataLoader", 
             "ResultsLoader",
             "SpectralLibraryLoader",
-            "SqMassLoader"]
+            "SqMassLoader",
+            "XICParquetDataLoader"]
diff --git a/massdash/loaders/access/XICParquetDataAccess.py b/massdash/loaders/access/XICParquetDataAccess.py
@@ -0,0 +1,99 @@
+"""
+massdash/loaders/access/XICParquetDataAccess
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+from pathlib import Path
+from typing import Union
+
+import pandas as pd
+import pyarrow.compute as pc
+import pyarrow.parquet as pq
+
+# Structs
+from ...structs import Chromatogram, TransitionGroup
+
+class XICParquetDataAccess:
+    '''
+    Class for accessing XIC Parquet files (DIA-NN output)
+
+    Attributes set by the constructor:
+        filename: path the object was created with
+        runName (str): stem of the file name
+        parquet (pq.ParquetFile): open handle on the file
+        index (pd.DataFrame): columns 'pr' and 'info', indexed by 'pr'
+    '''
+    def __init__(self, filename):
+        self.filename = filename
+        self.runName = str(Path(filename).stem)
+        self.parquet = pq.ParquetFile(filename)
+
+        ## Read the index
+        # NOTE(review): the index is read from the second-to-last row group - confirm this matches the file layout
+        self.index = self.parquet.read_row_group(self.parquet.num_row_groups - 2, columns=['pr', 'info']).to_pandas()
+        self.index.index = self.index['pr']
+
+    def getTransitionDataDf(self, precursor_id) -> pd.DataFrame:
+        '''
+        Get the chromatogram data for a given precursor ID
+
+        Args:
+            precursor_id: Precursor identifier as stored in the 'pr' column
+
+        Returns:
+            pd.DataFrame: Dataframe with columns 'rt', 'intensity', 'annotation'. Empty (with the same columns) if the precursor is not in the index.
+        '''
+
+        if precursor_id not in self.index.index:
+            # Same column names as the populated result so both paths share one schema
+            return pd.DataFrame(columns=['rt', 'intensity', 'annotation'])
+
+        out = self._getTransitionDataHelper(precursor_id)
+
+        # Rename columns
+        out.rename(columns={'value': 'intensity', 'feature':'annotation'}, inplace=True)
+
+        # fix annotation name for MS1
+        out.loc[out['annotation'] == 'ms1', 'annotation'] = 'prec'
+
+        return out
+
+    def getTransitionData(self, pep_id, charge) -> Union[TransitionGroup, None]:
+        '''
+        Get the chromatogram data for a given peptide ID and charge
+
+        Args:
+            pep_id (str): Peptide ID
+            charge (int): Charge, appended to pep_id to form the precursor ID
+
+        Returns:
+            TransitionGroup: TransitionGroup object OR None if pep_id, charge is not found
+        '''
+
+        precursor_id = pep_id + str(charge)
+
+        if precursor_id not in self.index.index:
+            print(f"Warning: {precursor_id} not found, returning None")
+            return None
+
+        out = self._getTransitionDataHelper(precursor_id)
+
+        # one chromatogram per feature
+        grps = out.groupby('feature')
+
+        transitions = []
+        precs = []
+        for name, grp in grps:
+            # Each trace is a single chromatogram; only the final container is a TransitionGroup.
+            # NOTE(review): assumes Chromatogram takes (rt, intensity, label) positionally - confirm
+            if name == 'ms1':
+                precs.append(Chromatogram(grp['rt'].values, grp['value'].values, 'prec'))
+            else:
+                transitions.append(Chromatogram(grp['rt'].values, grp['value'].values, name))
+
+        return TransitionGroup(precs, transitions, pep_id, charge)
+
+    def _getTransitionDataHelper(self, precursor_id) -> pd.DataFrame:
+        '''
+        Get the chromatogram data for a given precursor ID
+
+        The precursor must be present in self.index; callers check this first.
+
+        Returns:
+            pd.DataFrame: Dataframe of chromatogram data with columns 'rt', 'value', 'feature'
+        '''
+        ## Get the row group holding the precursor (the 'info' column of the index)
+        idx = self.index.loc[precursor_id, 'info']
+
+        # Read the row group
+        out = self.parquet.read_row_group(idx, columns=['rt', 'value', 'feature', 'pr'])
+
+        # Keep only the rows of the requested precursor, then drop the now-constant 'pr' column
+        out = out.filter(pc.equal(out["pr"], precursor_id))
+        out = out.drop(['pr'])
+
+        return out.to_pandas()
+
+
+
+
+
+
+
+
diff --git a/massdash/loaders/access/__init__.py b/massdash/loaders/access/__init__.py
@@ -12,11 +12,13 @@
 from .SqMassDataAccess import SqMassDataAccess
 from .TransitionPQPDataAccess import TransitionPQPDataAccess
 from .TransitionTSVDataAccess import TransitionTSVDataAccess
+from .XICParquetDataAccess import XICParquetDataAccess
 
 __all__ = [ "GenericResultsAccess",
             "MzMLDataAccess",
             "OSWDataAccess",
             "ResultsTSVDataAccess",
             "SqMassDataAccess",
             "TransitionPQPDataAccess",
-            "TransitionTSVDataAccess"]
+            "TransitionTSVDataAccess",
+            "XICParquetDataAccess"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,7 @@ requires-python = ">=3.10, <=3.12"
 dependencies = [
     "bokeh>3.0",
     "click>=8.1",
+    "pyarrow >= 19.0.1",
     "joblib",
     "matplotlib",
     "numpy>=1.9.0",