Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: DIA-NN XIC parquet access #175

Merged
merged 7 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/API.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Classes
ResultsLoader
MzMLDataLoader
SqMassLoader
XICParquetDataLoader
SpectralLibraryLoader

:mod:`massdash.loaders.access`: Classes For Low Level Data Access
Expand Down Expand Up @@ -127,12 +128,13 @@ Classes
ResultsTSVDataAccess
TransitionPQPDataAccess
TransitionTSVDataAccess
XICParquetDataAccess


Abstract Classes
----------------

.. automsummary::
.. autosummary::
:nosignatures:
:toctree: generated/
:template: class.rst
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build']

autosummary_generate = True

Expand All @@ -67,4 +67,4 @@ def setup(app):


# --- Notebook execution during the docs build (see nbsphinx_execute below) -------------
nbsphinx_execute = 'always'
nbsphinx_execute = 'never'
83 changes: 83 additions & 0 deletions massdash/loaders/XICParquetDataLoader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
massdash/loaders/XICParquetDataLoader
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""


from typing import List, Union
from pandas.core.api import DataFrame as DataFrame
import pandas as pd

# Loaders
from .GenericChromatogramLoader import GenericChromatogramLoader
from .ResultsLoader import ResultsLoader
from .access import XICParquetDataAccess
# Structs
from ..structs import TransitionGroupCollection
# Utils
from massdash.util import LOGGER

class XICParquetDataLoader(GenericChromatogramLoader):
    '''
    Class for loading chromatograms from XIC Parquet files.
    Inherits from GenericChromatogramLoader

    One XICParquetDataAccess object is created per entry in self.dataFiles
    (self.dataFiles is expected to be set up by the parent constructor).
    '''

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # One low-level accessor per chromatogram file.
        self.dataAccess = [XICParquetDataAccess(path) for path in self.dataFiles]

    @ResultsLoader.cache_results
    def loadTransitionGroupsDf(self, pep_id: str, charge: int) -> pd.DataFrame:
        '''
        Loads the chromatogram data for a given peptide ID and charge across all files into one DataFrame
        Args:
            pep_id (str): Peptide ID
            charge (int): Charge
        Returns:
            pd.DataFrame: Concatenated chromatogram data of every run that had data, with the run name in the 'run' column.
                If no run had data, an empty DataFrame with columns 'run_name', 'rt', 'intensity', 'annotation'.
        '''
        # Precursors are looked up by the peptide ID immediately followed by the charge.
        precursor_key = pep_id + str(charge)

        frames_by_run = {}
        for access in self.dataAccess:
            chrom_df = access.getTransitionDataDf(precursor_key)

            # Runs without any data are reported and left out of the result.
            if chrom_df.empty:
                print(f"Warning: no data found for peptide in transition file {access.filename}")
                continue
            frames_by_run[access.runName] = chrom_df

        if not frames_by_run:
            # NOTE(review): the empty frame names the run column 'run_name', while the
            # populated frame below names it 'run' -- confirm which one consumers expect.
            return pd.DataFrame(columns=['run_name', 'rt', 'intensity', 'annotation'])

        # Concatenating a dict puts its keys (run names) into the outer index level;
        # after reset_index() that level is 'level_0' and the per-frame row index is 'level_1'.
        combined = pd.concat(frames_by_run).reset_index()
        combined = combined.drop('level_1', axis=1)
        return combined.rename(columns={'level_0': 'run'})

    @ResultsLoader.cache_results
    def loadTransitionGroups(self, pep_id: str, charge: int, runNames: Union[None, str, List[str]] =None) -> TransitionGroupCollection:
        '''
        Loads the transition group for a given peptide ID and charge across all files
        Args:
            pep_id (str): Peptide ID
            charge (int): Charge
            runNames (None | str | List[str]): Name of the run to extract the transition group from. If None, all runs are extracted. If str, only the specified run is extracted. If List[str], only the specified runs are extracted.
        Returns:
            TransitionGroupCollection: Collection keyed by run name, holding whatever getTransitionData returned for each selected run
        Raises:
            ValueError: If runNames is not None, a string or a list
        '''
        collection = TransitionGroupCollection()

        # All runs
        if runNames is None:
            for access in self.dataAccess:
                collection[access.runName] = access.getTransitionData(pep_id, charge)
            return collection

        # Exactly one run, located by its position in self.runNames
        if isinstance(runNames, str):
            access = self.dataAccess[self.runNames.index(runNames)]
            collection[runNames] = access.getTransitionData(pep_id, charge)
            return collection

        # A selection of runs; requested names matching no accessor are skipped
        if isinstance(runNames, list):
            for requested in runNames:
                for access in self.dataAccess:
                    if access.runName == requested:
                        collection[access.runName] = access.getTransitionData(pep_id, charge)
            return collection

        raise ValueError("runName must be none, a string or list of strings")


4 changes: 3 additions & 1 deletion massdash/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
from .ResultsLoader import ResultsLoader
from .SpectralLibraryLoader import SpectralLibraryLoader
from .SqMassLoader import SqMassLoader
from .XICParquetDataLoader import XICParquetDataLoader

__all__ = [
"GenericChromatogramLoader",
"GenericSpectrumLoader",
"MzMLDataLoader",
"ResultsLoader",
"SpectralLibraryLoader",
"SqMassLoader"]
"SqMassLoader",
"XICParquetDataLoader"]
99 changes: 99 additions & 0 deletions massdash/loaders/access/XICParquetDataAccess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
massdash/loaders/access/XICParquetDataAccess
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""

from pathlib import Path
from typing import Union

import pandas as pd
import pyarrow.compute as pc
import pyarrow.parquet as pq

# Structs
from ...structs import Chromatogram, TransitionGroup

class XICParquetDataAccess:
    '''
    Class for accessing XIC Parquet files (DIA-NN output)

    Attributes:
        filename: Path of the parquet file as passed to the constructor
        runName (str): File name without directory and final suffix
        parquet: Open pyarrow ParquetFile handle
        index (pd.DataFrame): Lookup table with columns 'pr' and 'info', indexed by 'pr'
    '''
    def __init__(self, filename):
        self.filename = filename
        self.runName = str(Path(filename).stem)
        self.parquet = pq.ParquetFile(filename)

        ## Read the index
        # The second-to-last row group is used as the lookup table: 'pr' is the
        # precursor ID and 'info' is used below as the row group number to read.
        # NOTE(review): presumably this layout is fixed by the DIA-NN writer -- confirm.
        self.index = self.parquet.read_row_group(self.parquet.num_row_groups - 2, columns=['pr', 'info']).to_pandas()
        self.index.index = self.index['pr']

    def getTransitionDataDf(self, precursor_id):
        '''
        Get the chromatogram data for a given precursor ID

        Args:
            precursor_id (str): Precursor ID as stored in the 'pr' column

        Returns:
            pd.DataFrame: Columns 'rt', 'intensity', 'annotation'. Empty (with the same columns) if the precursor is not in the index
        '''

        if precursor_id not in self.index.index:
            # Same column names as the populated frame so callers see one schema.
            return pd.DataFrame(columns=['rt', 'intensity', 'annotation'])

        out = self._getTransitionDataHelper(precursor_id)

        # Rename columns
        out = out.rename(columns={'value': 'intensity', 'feature': 'annotation'})

        # fix annotation name for MS1
        out.loc[out['annotation'] == 'ms1', 'annotation'] = 'prec'

        return out

    def getTransitionData(self, pep_id, charge) -> Union[None, TransitionGroup]:
        '''
        Get the chromatogram data for a given peptide ID and charge

        Args:
            pep_id (str): Peptide ID
            charge (int): Charge

        Returns:
            TransitionGroup: TransitionGroup object OR None if pep_id, charge is not found
        '''

        # Precursors are keyed by the peptide ID immediately followed by the charge.
        precursor_id = pep_id + str(charge)

        if precursor_id not in self.index.index:
            print(f"Warning: {precursor_id} not found, returning None")
            return None

        out = self._getTransitionDataHelper(precursor_id)

        # One chromatogram per feature; the 'ms1' feature is the precursor trace,
        # every other feature is a transition trace.
        transitions = []
        precs = []
        for name, grp in out.groupby('feature'):
            if name == 'ms1':
                precs.append(Chromatogram(grp['rt'].values, grp['value'].values, 'prec'))
            else:
                transitions.append(Chromatogram(grp['rt'].values, grp['value'].values, name))

        return TransitionGroup(precs, transitions, pep_id, charge)

    def _getTransitionDataHelper(self, precursor_id) -> pd.DataFrame:
        '''
        Get the chromatogram data for a given precursor ID

        The caller must have checked that precursor_id is present in self.index.

        Returns:
            pd.DataFrame: Dataframe of chromatogram data with columns 'rt', 'value', 'feature'
        '''
        ## Get the index of the precursor
        idx = self.index.loc[precursor_id]['info']

        # Read the row group
        out = self.parquet.read_row_group(idx, columns=['rt', 'value', 'feature', 'pr'])

        # Keep only the rows of the requested precursor, then drop the
        # now-constant 'pr' column.
        out = out.filter(pc.equal(out["pr"], precursor_id))
        out = out.drop(['pr'])

        return out.to_pandas()








4 changes: 3 additions & 1 deletion massdash/loaders/access/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@
from .SqMassDataAccess import SqMassDataAccess
from .TransitionPQPDataAccess import TransitionPQPDataAccess
from .TransitionTSVDataAccess import TransitionTSVDataAccess
from .XICParquetDataAccess import XICParquetDataAccess

__all__ = [ "GenericResultsAccess",
"MzMLDataAccess",
"OSWDataAccess",
"ResultsTSVDataAccess",
"SqMassDataAccess",
"TransitionPQPDataAccess",
"TransitionTSVDataAccess"]
"TransitionTSVDataAccess",
"XICParquetDataAccess"]
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ requires-python = ">=3.10, <=3.12"
dependencies = [
"bokeh>3.0",
"click>=8.1",
"pyarrow >= 19.0.1",
"joblib",
"matplotlib",
"numpy>=1.9.0",
Expand Down
Loading