Clean up group iteration

mobiusklein · Jan 7, 2024 · c26db89 · c26db89
1 parent 94ac489
commit c26db89
Show file tree

Hide file tree

Showing 10 changed files with 504 additions and 177 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -41,6 +41,9 @@ required-features = ["parallelism"]
 name = "averaging_writer"
 required-features = ["parallelism", "mzsignal", "nalgebra"]
 
+[[example]]
+name = "random_access_iter"
+required-features = ["mzsignal", "nalgebra"]
 
 [lib]
 name = "mzdata"

diff --git a/examples/random_access_iter.rs b/examples/random_access_iter.rs
@@ -0,0 +1,55 @@
+use std::{env, io, path};
+
+use mzdata::io::mzml;
+use mzdata::prelude::*;
+
+/// Demonstrates random-access iteration: a plain spectrum iterator, a spectrum group
+/// iterator, and an averaging group iterator are each moved to the start time of the
+/// spectrum at the midpoint of the run, and what each one resumes at is printed to
+/// standard error.
+///
+/// The MS data file path is taken from the first command line argument.
+fn main() -> io::Result<()> {
+    let path = path::PathBuf::from(
+        env::args()
+            .nth(1)
+            .expect("Please pass an MS data file path"),
+        // e.g. "test/data/batching_test.mzML"
+    );
+
+    let mut reader = mzml::MzMLReader::open_path(path)?;
+
+    let n_spectra = reader.len();
+
+    // Find the spectrum at the midpoint of the run
+    let spec = reader
+        .get_spectrum_by_index(n_spectra / 2)
+        .expect("Failed to read the spectrum at the midpoint of the run");
+    eprintln!(
+        "Midpoint spectrum {} (level {}) at time {}",
+        spec.id(),
+        spec.ms_level(),
+        spec.start_time()
+    );
+
+    // Jump the iterator to that point in time
+    reader.start_from_time(spec.start_time())?;
+    let s = reader
+        .next()
+        .expect("No spectrum found after seeking the spectrum iterator");
+    eprintln!("Resuming at {} (level {}) at time {}", s.id(), s.ms_level(), s.start_time());
+
+    // Convert the iterator into a group iterator
+    let mut group_iter = reader.into_groups();
+    // Jump the group iterator to that point in time (If an MSn spectrum was found, the next MS1 may be shown instead)
+    group_iter.start_from_time(spec.start_time())?;
+    let g = group_iter
+        .next()
+        .expect("No spectrum group found after seeking the group iterator");
+    eprintln!(
+        "Resuming at group having {:?} at time {:?}",
+        g.earliest_spectrum().map(|s| s.id()),
+        g.earliest_spectrum().map(|s| s.start_time())
+    );
+
+    // Convert the group iterator into an averaging group iterator
+    let mut avg_iter = group_iter.averaging(1, 200.0, 2200.0, 0.005);
+    // Jump the averaging iterator to that point in time (If an MSn spectrum was found, the next MS1 may be shown instead)
+    avg_iter.start_from_time(spec.start_time())?;
+    let g = avg_iter
+        .next()
+        .expect("No spectrum group found after seeking the averaging iterator");
+    eprintln!(
+        "Resuming at group having {:?} at time {:?}",
+        g.earliest_spectrum().map(|s| s.id()),
+        g.earliest_spectrum().map(|s| s.start_time())
+    );
+
+    Ok(())
+}
diff --git a/src/io/mzmlb/reader.rs b/src/io/mzmlb/reader.rs
@@ -687,9 +687,9 @@ pub struct MzMLbReaderType<
 }
 
 impl<'a, 'b: 'a, C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPeakAdapting + BuildFromArrayMap> MzMLbReaderType<C, D> {
-    /// Create a new `[MzMLbReader]` with an internal cache size of `chunk_size` elements
+    /// Create a new [`MzMLbReader`] with an internal cache size of `chunk_size` elements
     /// per data array to reduce the number of disk reads needed to populate spectra, and
-    /// sets the `[DetailLevel]`.
+    /// sets the [`DetailLevel`].
     pub fn with_chunk_size_and_detail_level<P: AsRef<Path>>(
         path: &P,
         chunk_size: usize,
@@ -733,7 +733,7 @@ impl<'a, 'b: 'a, C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPea
         Ok(inst)
     }
 
-    /// Create a new `[MzMLbReader]` with an internal cache size of `chunk_size` elements
+    /// Create a new [`MzMLbReader`] with an internal cache size of `chunk_size` elements
     /// per data array to reduce the number of disk reads needed to populate spectra.
     ///
     /// The default chunk size is 2**20 elements, which can use as much as 8.4 MB for 64-bit
@@ -772,7 +772,7 @@ impl<'a, 'b: 'a, C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPea
         Ok(buf)
     }
 
-    /// Create a new `[MzMLbReader]` with the default caching behavior.
+    /// Create a new [`MzMLbReader`] with the default caching behavior.
     pub fn new<P: AsRef<Path>>(path: &P) -> io::Result<Self> {
         Self::with_chunk_size(path, ExternalDataRegistry::default_chunk_size())
     }
@@ -870,7 +870,9 @@ impl<'a, 'b: 'a, C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPea
     }
 }
 
-/// [`MzMLReaderType`] instances are [`Iterator`]s over [`MultiLayerSpectrum`]
+/// [`MzMLbReaderType`] instances are [`Iterator`]s over [`MultiLayerSpectrum`], like all
+/// file format readers. This involves advancing the position of the internal mzML file
+/// reader in-place without seeking.
 impl<C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPeakAdapting + BuildFromArrayMap> Iterator for MzMLbReaderType<C, D> {
     type Item = MultiLayerSpectrum<C, D>;
 
@@ -992,7 +994,7 @@ impl<C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPeakAdapting +
     #[allow(unused)]
     /// The underlying HDF5 library for Rust, [`hdf5`](https://docs.rs/hdf5/latest/hdf5/) doesn't
     /// support reading directly from Rust [`io::Read`]-implementing objects yet. Without a means
-    /// of retrieving a [`path::Path`]-like value from a file handle, with the [`filename`](https://docs.rs/filename/latest/filename/)
+    /// of retrieving a [`path::Path`](std::path::Path)-like value from a file handle, with the [`filename`](https://docs.rs/filename/latest/filename/)
     /// library, this method **panics**. Enable this extra feature if you would like this method to
     /// work, but it is reported to have obscure compilation errors.
     fn open_file(source: fs::File) -> Self {

diff --git a/src/io/traits.rs b/src/io/traits.rs
@@ -368,6 +368,27 @@ pub enum SpectrumAccessError {
     IOError(#[source] Option<io::Error>),
 }
 
+impl From<SpectrumAccessError> for io::Error {
+    fn from(value: SpectrumAccessError) -> Self {
+        let s = value.to_string();
+        match value {
+            SpectrumAccessError::SpectrumNotFound => io::Error::new(io::ErrorKind::NotFound, s),
+            SpectrumAccessError::SpectrumIdNotFound(_) => io::Error::new(io::ErrorKind::NotFound, s),
+            SpectrumAccessError::SpectrumIndexNotFound(_) => io::Error::new(io::ErrorKind::NotFound, s),
+            SpectrumAccessError::IOError(e) => {
+                match e {
+                    Some(e) => {
+                        e
+                    },
+                    None => {
+                        io::Error::new(io::ErrorKind::Other, s)
+                    },
+                }
+            },
+        }
+    }
+}
+
 /// An extension of [`ScanSource`] that supports relocatable iteration relative to a
 /// specific spectrum coordinate or identifier.
 pub trait RandomAccessSpectrumIterator<
@@ -667,6 +688,7 @@ pub trait SpectrumGrouping<
     }
 }
 
+/// Analogous to [`RandomAccessSpectrumIterator`], but for [`SpectrumGrouping`] implementations.
 pub trait RandomAccessSpectrumGroupingIterator<
     C: CentroidLike + Default = CentroidPeak,
     D: DeconvolutedCentroidLike + Default = DeconvolutedPeak,
@@ -677,30 +699,9 @@ pub trait RandomAccessSpectrumGroupingIterator<
     fn start_from_id(&mut self, id: &str) -> Result<&Self, SpectrumAccessError>;
     fn start_from_index(&mut self, index: usize) -> Result<&Self, SpectrumAccessError>;
     fn start_from_time(&mut self, time: f64) -> Result<&Self, SpectrumAccessError>;
+    fn reset_state(&mut self);
 }
 
-impl<
-        R: RandomAccessSpectrumIterator<C, D, S>,
-        C: CentroidLike + Default,
-        D: DeconvolutedCentroidLike + Default,
-        S: SpectrumLike<C, D>,
-        G: SpectrumGrouping<C, D, S>,
-    > RandomAccessSpectrumGroupingIterator<C, D, S, G> for SpectrumGroupingIterator<R, C, D, S, G>
-{
-    fn start_from_id(&mut self, id: &str) -> Result<&Self, SpectrumAccessError> {
-        self.start_from_id(id)
-    }
-
-    fn start_from_index(&mut self, index: usize) -> Result<&Self, SpectrumAccessError> {
-        self.start_from_index(index)
-    }
-
-    fn start_from_time(&mut self, time: f64) -> Result<&Self, SpectrumAccessError> {
-        self.start_from_time(time)
-    }
-}
-
-
 
 /// A collection of spectra held in memory but providing an interface
 /// identical to a data file. This structure owns its data, so in order
@@ -711,7 +712,7 @@ pub struct MemoryScanSource<
     D: DeconvolutedCentroidLike + Default = DeconvolutedPeak,
     S: SpectrumLike<C, D> = MultiLayerSpectrum<C, D>,
 > {
-    spectra: Vec<S>,
+    spectra: VecDeque<S>,
     position: usize,
     offsets: OffsetIndex,
     _c: PhantomData<C>,
@@ -724,7 +725,7 @@ impl<
         S: SpectrumLike<C, D> + Clone,
     > MemoryScanSource<C, D, S>
 {
-    pub fn new(spectra: Vec<S>) -> Self {
+    pub fn new(spectra: VecDeque<S>) -> Self {
         let mut offsets = OffsetIndex::new("spectrum".to_string());
         spectra.iter().enumerate().for_each(|(i, s)| {
             offsets.insert(s.id().to_string(), i as u64);
@@ -834,9 +835,9 @@ impl<
         C: CentroidLike + Default,
         D: DeconvolutedCentroidLike + Default,
         S: SpectrumLike<C, D> + Clone,
-    > From<Vec<S>> for MemoryScanSource<C, D, S>
+    > From<VecDeque<S>> for MemoryScanSource<C, D, S>
 {
-    fn from(value: Vec<S>) -> Self {
+    fn from(value: VecDeque<S>) -> Self {
         Self::new(value)
     }
 }

diff --git a/src/meta/file_description.rs b/src/meta/file_description.rs
@@ -1,5 +1,5 @@
 use crate::impl_param_described;
-use crate::params::{Param, ParamDescribed, ParamList};
+use crate::params::{Param, ParamDescribed, ParamList, CURIE, ControlledVocabulary};
 
 #[derive(Debug, Clone, Default, PartialEq, Eq)]
 pub struct SourceFile {
@@ -17,6 +17,35 @@ pub struct FileDescription {
     pub source_files: Vec<SourceFile>,
 }
 
+impl FileDescription {
+    /// Build a [`FileDescription`] from its content parameters and its source files.
+    pub fn new(contents: ParamList, source_files: Vec<SourceFile>) -> Self {
+        FileDescription { contents, source_files }
+    }
+
+    /// Reports whether the "MS1 spectrum" term (MS:1000579) is listed among this
+    /// description's parameters.
+    ///
+    /// **Note**: Only the metadata is consulted, not the spectra in the file, so the
+    /// answer may be wrong when the metadata is incorrect or missing.
+    pub fn has_ms1_spectra(&self) -> bool {
+        const MS1_SPECTRUM: CURIE = CURIE::new(ControlledVocabulary::MS, 1000579);
+        self.get_param_by_curie(&MS1_SPECTRUM).is_some()
+    }
+
+    /// Reports whether the "MSn spectrum" term (MS:1000580) is listed among this
+    /// description's parameters.
+    ///
+    /// **Note**: Only the metadata is consulted, not the spectra in the file, so the
+    /// answer may be wrong when the metadata is incorrect or missing.
+    pub fn has_msn_spectra(&self) -> bool {
+        const MSN_SPECTRUM: CURIE = CURIE::new(ControlledVocabulary::MS, 1000580);
+        self.get_param_by_curie(&MSN_SPECTRUM).is_some()
+    }
+
+    /// Reports whether at least one content parameter has been recorded.
+    pub fn has_contents(&self) -> bool {
+        let n_terms = self.contents.len();
+        n_terms > 0
+    }
+}
+
 impl_param_described!(SourceFile);
 
 impl ParamDescribed for FileDescription {

diff --git a/src/params.rs b/src/params.rs
@@ -4,8 +4,11 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fmt::Display;
+use std::num;
 use std::str::{self, FromStr};
 
+use thiserror::Error;
+
 #[doc(hidden)]
 #[derive(Debug, Clone, PartialEq, PartialOrd)]
 pub enum ValueType {
@@ -15,6 +18,61 @@ pub enum ValueType {
     Other(Box<Vec<u8>>)
 }
 
+/// A compact identifier made of a controlled vocabulary and a numeric accession
+/// within that vocabulary.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct CURIE {
+    /// The controlled vocabulary the accession belongs to
+    controlled_vocabulary: ControlledVocabulary,
+    /// The numeric accession within `controlled_vocabulary`
+    accession: u32
+}
+
+impl CURIE {
+    /// Create a [`CURIE`] from a controlled vocabulary and an accession number.
+    /// This is a `const fn`, so it can be used to define constants.
+    pub const fn new(cv_id: ControlledVocabulary, accession: u32) -> Self { Self { controlled_vocabulary: cv_id, accession } }
+}
+
+/// A [`CURIE`] equals a parameter-like value when that value reports the same
+/// controlled vocabulary and the same accession number. A value missing either
+/// one never compares equal.
+impl<T: ParamLike> PartialEq<T> for CURIE {
+    fn eq(&self, other: &T) -> bool {
+        // Compare the `Option`s directly: a missing controlled vocabulary or accession
+        // yields `false` instead of being unwrapped, and a controlled parameter is
+        // actually compared rather than rejected outright.
+        other.controlled_vocabulary() == Some(self.controlled_vocabulary)
+            && other.accession() == Some(self.accession)
+    }
+}
+
+/// The ways parsing a [`CURIE`] from a string can fail.
+#[derive(Debug, Error)]
+pub enum CURIEParsingError {
+    /// The text before the `:` could not be resolved to a [`ControlledVocabulary`].
+    // The wrapped error supplies a complete message of its own, so it is shown as-is
+    // rather than being embedded in a second "not recognized" sentence.
+    #[error("{0}")]
+    UnknownControlledVocabulary(#[from] #[source] ControlledVocabularyResolutionError),
+    /// The text after the `:` could not be parsed as an integer accession number.
+    #[error("Failed to parse accession number: {0}")]
+    AccessionParsingError(#[from] #[source] num::ParseIntError),
+    /// The input contained no `:` separating the vocabulary from the accession.
+    #[error("Did not detect a namespace separator ':' token")]
+    MissingNamespaceSeparator
+}
+
+/// Parses a [`CURIE`] from text of the form `<vocabulary>:<accession>`.
+///
+/// Only the first two `:`-separated tokens are read. A missing separator is reported
+/// before the vocabulary is resolved, and the vocabulary is resolved before the
+/// accession number is parsed.
+impl FromStr for CURIE {
+    type Err = CURIEParsingError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut parts = s.split(':');
+        // `split` always yields at least one item, even for an empty string
+        let prefix = parts.next().unwrap();
+        let number = parts
+            .next()
+            .ok_or(CURIEParsingError::MissingNamespaceSeparator)?;
+
+        let vocabulary = prefix.parse::<ControlledVocabulary>()?;
+        let accession = number.parse::<u32>()?;
+        Ok(CURIE::new(vocabulary, accession))
+    }
+}
+
 pub fn curie_to_num(curie: &str) -> (Option<ControlledVocabulary>, Option<u32>) {
     let mut parts = curie.split(':');
     let prefix = match parts.next() {
@@ -46,10 +104,6 @@ pub trait ParamLike {
         }
     }
 
-    // fn parse<T: str::FromStr>(&self) -> Result<T, T::Err> {
-    //     self.value().parse::<T>()
-    // }
-
     fn parse<T: str::FromStr>(&self) -> Result<T, T::Err> {
         self.value().parse::<T>()
     }
@@ -353,15 +407,13 @@ impl ControlledVocabulary {
 }
 
 #[doc(hidden)]
-#[derive(Debug, Clone)]
-pub enum ControlledVocabularyResolutionError {}
-
-impl Display for ControlledVocabularyResolutionError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.write_str(format!("{:?}", self).as_str())
-    }
+#[derive(Debug, Clone, Error)]
+pub enum ControlledVocabularyResolutionError {
+    /// A controlled vocabulary could not be recognized. The wrapped string is
+    /// interpolated into the error message; presumably it is the unrecognized
+    /// name (NOTE(review): confirm against the `FromStr` implementation).
+    #[error("Unrecognized controlled vocabulary {0}")]
+    UnknownControlledVocabulary(String)
+}
 
+
 impl FromStr for ControlledVocabulary {
     type Err = ControlledVocabularyResolutionError;
 
@@ -394,6 +446,10 @@ pub trait ParamDescribed {
         self.params().iter().find(|&param| param.name == name)
     }
 
+    /// Find the first parameter in [`params`](Self::params) that `curie` compares
+    /// equal to, or `None` when no parameter matches.
+    fn get_param_by_curie(&self, curie: &CURIE) -> Option<&Param> {
+        let mut candidates = self.params().iter();
+        candidates.find(|&candidate| curie == candidate)
+    }
+
     fn get_param_by_accession(&self, accession: &str) -> Option<&Param> {
         let (cv, acc_num) = curie_to_num(accession);
         return self

diff --git a/src/prelude.rs b/src/prelude.rs
@@ -1,14 +1,17 @@
 //! A set of foundational traits used throughout the library.
 pub use crate::io::traits::{
     MZFileReader, RandomAccessSpectrumIterator, SpectrumAccessError, ScanSource, ScanWriter, SeekRead,
-    SpectrumGrouping, SpectrumIterator,
+    SpectrumGrouping, SpectrumIterator, RandomAccessSpectrumGroupingIterator,
 };
 
 pub use crate::meta::MSDataFileMetadata;
 pub use crate::params::{ParamDescribed, ParamLike};
 pub use crate::spectrum::{IonProperties, PrecursorSelection, SpectrumLike};
 pub use crate::spectrum::bindata::{ByteArrayView, ByteArrayViewMut, BuildArrayMapFrom, BuildFromArrayMap};
 
+#[cfg(feature = "mzsignal")]
+pub use crate::spectrum::group::SpectrumGroupAveraging;
+
 #[doc(hidden)]
 pub use std::convert::TryInto;
 #[doc(hidden)]