From 0e0582edee22c5aa00f3b5997f92a4ed4438fd4d Mon Sep 17 00:00:00 2001 From: Douwe Schulte Date: Fri, 6 Sep 2024 04:03:44 +0200 Subject: [PATCH] feature: change `average_spectra` to take an iterator over `SpectrumLike` (#6) * Allowed average from an iterator and added more documentation --------- Co-authored-by: Joshua Klein --- Cargo.toml | 62 +++++++++++++----------- src/spectrum/group.rs | 107 +++++++++++++++++++++++++++--------------- 2 files changed, 103 insertions(+), 66 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d329456..cddeb28 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,27 +2,16 @@ name = "mzdata" version = "0.29.0" edition = "2018" -keywords = [ - 'mass-spectrometry', - 'mzml', - 'mgf' -] +keywords = ['mass-spectrometry', 'mzml', 'mgf'] -categories = [ - "science", - "parser-implementations", - "data-structures" -] +categories = ["science", "parser-implementations", "data-structures"] description = "A library to read mass spectrometry data formats" license = "Apache-2.0" repository = "https://github.com/mobiusklein/mzdata" documentation = "https://docs.rs/mzdata" -exclude = [ - "tmp/*", - "test/data/*" -] +exclude = ["tmp/*", "test/data/*"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [[bin]] @@ -54,8 +43,8 @@ lto = true debug = true [features] -# default = ["nalgebra", "parallelism", "mzsignal", "mzmlb", "zlib-ng-compat", ] -default = ["zlib-ng-compat", ] +# default = ["nalgebra", "parallelism", "mzsignal", "zlib-ng-compat"] +default = ["zlib-ng-compat"] openblas = ["mzsignal", "mzsignal/openblas"] netlib = ["mzsignal", "mzsignal/netlib"] @@ -80,7 +69,11 @@ mzmlb = ["hdf5", "ndarray", "hdf5-sys"] # but not flate2/zlib hdf5_static = ["mzmlb", "hdf5-sys/static", "hdf5-sys/zlib", "libz-sys"] -thermo = ["thermorawfilereader", "thermorawfilereader/net8_0", "thermorawfilereader/nethost-download"] +thermo = [ + "thermorawfilereader", + "thermorawfilereader/net8_0", + "thermorawfilereader/nethost-download", +] doc-only = ["thermorawfilereader/doc-only"] @@ -91,25 +84,32 @@ regex = "1" lazy_static = "1.4.0" serde = { version = "1.0.204", features = ["derive"] } serde_json = "1.0.120" -quick-xml = { version = "0.30", features = [ "serialize" ] } -flate2 = {version = "1.0.20"} +quick-xml = { version = "0.30", features = ["serialize"] } +flate2 = { version = "1.0.20" } num-traits = "0.2" -indexmap = { version = "2.0.0", features = [ "serde" ] } +indexmap = { version = "2.0.0", features = ["serde"] } log = "0.4.20" mzpeaks = { version = ">=0.20.0,<1.0.0" } rayon = { version = ">=1.8.0,<2.0", optional = true } mzsignal = { version = ">=0.22.0,<1.0.0", default-features = false, optional = true} md5 = "0.7.0" -tokio = {version = "1.32.0", optional = true, features = ["macros", "rt", "fs", "rt-multi-thread"]} - -hdf5 = {version = "0.8.1", optional = true, features = ["blosc", "lzf",]} +tokio = { version = "1.32.0", optional = true, features = [ + "macros", + "rt", + "fs", + "rt-multi-thread", +] } + +hdf5 = { version = "0.8.1", optional = true, features = ["blosc", "lzf"] } hdf5-sys = { version = "0.8.1", optional = true } -libz-sys = { version = "1.1", default-features = false, features = ["static", ], optional = true } +libz-sys = { version = "1.1", default-features = false, features = [ + "static", +], optional = true } ndarray = { version = "0.15.6", optional = true } filename = { version = "0.1.1", optional = true } numpress = { version = "1.1.0", optional = true } -bytemuck = { version = "1.15.0", features = ["extern_crate_alloc"]} +bytemuck = { version = "1.15.0", features = ["extern_crate_alloc"] } base64-simd = "0.8.0" thiserror = "1.0.50" @@ -120,7 +120,7 @@ chrono = "0.4.37" bitflags = "2.5.0" [dev-dependencies] -criterion = { version = "0.5.1", features = [ "html_reports" ] } +criterion = { version = "0.5.1", features = ["html_reports"] } test-log = "0.2.12 " env_logger = "0.10.0" tempfile = "3.10" @@ -132,5 +132,13 @@ harness = false [package.metadata.docs.rs] -features = ["parallelism", "mzsignal", "nalgebra", "mzmlb", "async", "thermorawfilereader", "doc-only"] +features = [ + "parallelism", + "mzsignal", + "nalgebra", + "mzmlb", + "async", + "thermorawfilereader", + "doc-only", +] no-default-features = true diff --git a/src/spectrum/group.rs b/src/spectrum/group.rs index ca642c2..4c515eb 100644 --- a/src/spectrum/group.rs +++ b/src/spectrum/group.rs @@ -660,7 +660,8 @@ mod mzsignal_impl { use crate::spectrum::bindata::{to_bytes, BuildArrayMapFrom, BuildFromArrayMap}; use crate::spectrum::{ - ArrayType, BinaryArrayMap, BinaryDataArrayType, DataArray, SignalContinuity, + ArrayType, BinaryArrayMap, BinaryDataArrayType, DataArray, RefPeakDataLevel, + SignalContinuity, }; use super::*; @@ -669,7 +670,7 @@ mod mzsignal_impl { use mzpeaks::{MZLocated, PeakCollection}; use mzsignal::average::{average_signal, SignalAverager}; use mzsignal::reprofile::{reprofile, PeakSetReprofiler, PeakShape, PeakShapeModel}; - use mzsignal::{ArrayPair, FittedPeak}; + use mzsignal::{ArrayPair, FittedPeak, MZGrid}; impl From> for BinaryArrayMap { fn from(value: ArrayPair<'_>) -> Self { @@ -692,47 +693,75 @@ mod mzsignal_impl { } } - /// Average a series of [`MultiLayerSpectrum`] together - pub fn average_spectra( - spectra: &[MultiLayerSpectrum], + /// Average a series of [`SpectrumLike`] together. The supplied dx will be used to [`reprofile`] + /// centroid spectra. The resulting [`ArrayPair`] can be made into a [`BinaryArrayMap`] using + /// `.into()`. Which in turn can be used to create a new [`MultiLayerSpectrum`] or + /// [`RawSpectrum`](crate::spectrum::RawSpectrum). Internally it uses [`SpectrumLike::peaks`] + /// to retrieve the spectrum data. Any deconvoluted spectra will be skipped. + /// + /// Note: only available with feature `mzsignal`. + pub fn average_spectra< + 'lifetime, + S: SpectrumLike + 'lifetime, + C: CentroidLike, + D: DeconvolutedCentroidLike, + >( + spectra: impl IntoIterator, dx: f64, - ) -> ArrayPair<'_> { - let arrays: Vec<_> = spectra - .iter() - .map(|scan| { - if scan.signal_continuity() == SignalContinuity::Profile { - if let Some(array_map) = scan.raw_arrays() { - let mz = array_map.mzs().unwrap().to_vec(); - let inten = array_map.intensities().unwrap().to_vec(); - Some(ArrayPair::from((mz, inten))) - } else { - warn!("{} did not have raw data arrays", scan.id()); - None - } - } else { - if let Some(peaks) = scan.peaks.as_ref() { - let fpeaks: Vec<_> = peaks + ) -> ArrayPair<'static> { + let mut to_be_reprofiled = Vec::new(); + let mut profiles = Vec::new(); + + for spectrum in spectra { + match (spectrum.signal_continuity(), spectrum.peaks()) { + (_, RefPeakDataLevel::Missing | RefPeakDataLevel::Deconvoluted(_)) => (), + (SignalContinuity::Centroid, RefPeakDataLevel::RawData(_)) + | (_, RefPeakDataLevel::Centroid(_)) => { + to_be_reprofiled.push( + spectrum + .peaks() .iter() .map(|p| FittedPeak::from(p.as_centroid())) - .collect(); - let signal = reprofile(fpeaks.iter(), dx); - Some(ArrayPair::from(( - signal.mz_array.to_vec(), - signal.intensity_array.to_vec(), - ))) - } else { - warn!( - "{} was not in profile mode but no centroids found", - scan.id() - ); - None - } + .collect::>(), + ); } - }) - .flatten() - .collect(); - let arrays = average_signal(&arrays, dx); - arrays + (_, RefPeakDataLevel::RawData(array_map)) => { + let mzs = array_map.mzs().unwrap(); + let intensities = array_map.intensities().unwrap(); + profiles.push(ArrayPair::from((mzs, intensities))); + } + } + } + + if !to_be_reprofiled.is_empty() { + let mz_start = to_be_reprofiled + .iter() + .map(|p| p.first().map_or(f64::MAX, |p| p.mz)) + .min_by(f64::total_cmp) + .unwrap_or_default(); + let mz_end = to_be_reprofiled + .iter() + .map(|p| p.last().map_or(f64::MIN, |p| p.mz)) + .max_by(f64::total_cmp) + .unwrap_or_default(); + + let reprofiler = PeakSetReprofiler::new(mz_start, mz_end, dx); + + for peaks in to_be_reprofiled { + let models: Vec> = peaks.iter().map(|p| p.into()).collect(); + if models.is_empty() { + profiles.push(ArrayPair::from((Vec::new(), Vec::new()))); + } else { + let arrays = reprofiler.reprofile_from_models(&models); + profiles.push(ArrayPair::from(( + reprofiler.copy_mz_array(), + arrays.intensity_array.into_owned(), + ))) + } + } + } + + average_signal(&profiles, dx) } #[derive(Debug, Clone)]