From e08bdcc5bb015f552aa135c782edcd7279ca51e6 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 17 Jan 2025 17:08:02 +0100 Subject: [PATCH 1/2] Fix dtypes in numpy arrays from PSMList accession (e.g. psm_list["is_decoy"]) --- psm_utils/psm.py | 24 +++++++++++++++++++++++- psm_utils/psm_list.py | 11 +++++++++-- tests/test_psm_list.py | 3 +++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/psm_utils/psm.py b/psm_utils/psm.py index 0ed97d9..2d01d08 100644 --- a/psm_utils/psm.py +++ b/psm_utils/psm.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Optional, Union -from pydantic import ConfigDict, BaseModel +from pydantic import BaseModel, ConfigDict from psm_utils.peptidoform import Peptidoform @@ -135,3 +135,25 @@ def get_usi(self, as_url=False) -> str: if as_url: usi = "http://proteomecentral.proteomexchange.org/usi/?usi=" + usi return usi + + +NUMPY_DTYPES = { + "peptidoform": Peptidoform, + "spectrum_id": object, + "run": object, + "collection": object, + "spectrum": object, + "is_decoy": bool, + "score": float, + "qvalue": float, + "pep": float, + "precursor_mz": float, + "retention_time": float, + "ion_mobility": float, + "protein_list": object, + "rank": int, + "source": object, + "provenance_data": object, + "metadata": object, + "rescoring_features": object, +} diff --git a/psm_utils/psm_list.py b/psm_utils/psm_list.py index c0339eb..0128c38 100644 --- a/psm_utils/psm_list.py +++ b/psm_utils/psm_list.py @@ -9,7 +9,7 @@ from pyteomics import auxiliary, proforma from rich.pretty import pretty_repr -from psm_utils.psm import PSM +from psm_utils.psm import NUMPY_DTYPES, PSM class PSMList(BaseModel): @@ -98,7 +98,14 @@ def __getitem__(self, item) -> PSM | list[PSM]: return PSMList(psm_list=self.psm_list[item]) elif isinstance(item, str): # Return PSM property as array across full PSMList - return np.fromiter([psm[item] for psm in self.psm_list], dtype=object, count=len(self)) + try: + return np.fromiter( + [psm[item] for psm in self.psm_list], dtype=NUMPY_DTYPES[item], count=len(self) + ) + except TypeError: + return np.fromiter( + [psm[item] for psm in self.psm_list], dtype=object, count=len(self) + ) elif _is_iterable_of_bools(item): # Return new PSMList with items that were True return PSMList(psm_list=[self.psm_list[i] for i in np.flatnonzero(item)]) diff --git a/tests/test_psm_list.py b/tests/test_psm_list.py index 01064df..3a31a4e 100644 --- a/tests/test_psm_list.py +++ b/tests/test_psm_list.py @@ -38,6 +38,9 @@ def test___get_item__(self): # PSM property as array np.testing.assert_equal(psm_list["spectrum_id"], np.array(["1", "2", "3"])) + np.testing.assert_equal(psm_list["score"], np.array([140.2, 132.9, 55.7])) + np.testing.assert_equal(psm_list["rank"], np.array([None, None, None])) + np.testing.assert_equal(psm_list["qvalue"], np.array([np.nan, np.nan, np.nan])) # Multiple PSM properties as 2D array np.testing.assert_equal( From 3961624c00879bbd2017f843d99987005f3c5fcc Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 17 Jan 2025 18:09:19 +0100 Subject: [PATCH 2/2] Implement requested changes --- psm_utils/psm_list.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/psm_utils/psm_list.py b/psm_utils/psm_list.py index 0128c38..c38ffdf 100644 --- a/psm_utils/psm_list.py +++ b/psm_utils/psm_list.py @@ -100,11 +100,11 @@ def __getitem__(self, item) -> PSM | list[PSM]: # Return PSM property as array across full PSMList try: return np.fromiter( - [psm[item] for psm in self.psm_list], dtype=NUMPY_DTYPES[item], count=len(self) + (psm[item] for psm in self.psm_list), dtype=NUMPY_DTYPES[item], count=len(self) ) except TypeError: return np.fromiter( - [psm[item] for psm in self.psm_list], dtype=object, count=len(self) + (psm[item] for psm in self.psm_list), dtype=object, count=len(self) ) elif _is_iterable_of_bools(item): # Return new PSMList with items that were True