Skip to content

Commit

Permalink
Clean up group iteration
Browse files Browse the repository at this point in the history
  • Loading branch information
mobiusklein committed Jan 7, 2024
1 parent 94ac489 commit c26db89
Show file tree
Hide file tree
Showing 10 changed files with 504 additions and 177 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ required-features = ["parallelism"]
name = "averaging_writer"
required-features = ["parallelism", "mzsignal", "nalgebra"]

[[example]]
name = "random_access_iter"
required-features = ["nalgebra"]

[lib]
name = "mzdata"
Expand Down
55 changes: 55 additions & 0 deletions examples/random_access_iter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
use std::{env, io, path};

use mzdata::io::mzml;
use mzdata::prelude::*;

/// Demonstrates relocating a spectrum iterator, a group iterator and an averaging
/// group iterator to the start time of the spectrum at the midpoint of the run.
///
/// The path of the MS data file to read is taken from the first command line argument.
fn main() -> io::Result<()> {
    let path = path::PathBuf::from(
        env::args()
            .nth(1)
            .expect("Please pass an MS data file path"),
        // e.g. "test/data/batching_test.mzML"
    );

    let mut reader = mzml::MzMLReader::open_path(path)?;

    let n_spectra = reader.len();

    // Find the spectrum at the midpoint of the run
    let spec = reader
        .get_spectrum_by_index(n_spectra / 2)
        .expect("Failed to read the midpoint spectrum, is the file empty?");
    eprintln!(
        "Midpoint spectrum {} (level {}) at time {}",
        spec.id(),
        spec.ms_level(),
        spec.start_time()
    );

    // Jump the iterator to that point in time
    reader.start_from_time(spec.start_time())?;
    let s = reader
        .next()
        .expect("No spectrum available after relocating the iterator");
    eprintln!(
        "Resuming at {} (level {}) at time {}",
        s.id(),
        s.ms_level(),
        s.start_time()
    );

    // Convert the iterator into a group iterator
    let mut group_iter = reader.into_groups();
    // Jump the group iterator to that point in time (If an MSn spectrum was found, the next MS1 may be shown instead)
    group_iter.start_from_time(spec.start_time())?;
    let g = group_iter
        .next()
        .expect("No spectrum group available after relocating the group iterator");
    eprintln!(
        "Resuming at group having {:?} at time {:?}",
        g.earliest_spectrum().map(|s| s.id()),
        g.earliest_spectrum().map(|s| s.start_time())
    );

    // Convert the group iterator into an averaging group iterator
    let mut avg_iter = group_iter.averaging(1, 200.0, 2200.0, 0.005);
    // Jump the averaging iterator to that point in time (If an MSn spectrum was found, the next MS1 may be shown instead)
    avg_iter.start_from_time(spec.start_time())?;
    let g = avg_iter
        .next()
        .expect("No spectrum group available after relocating the averaging iterator");
    eprintln!(
        "Resuming at group having {:?} at time {:?}",
        g.earliest_spectrum().map(|s| s.id()),
        g.earliest_spectrum().map(|s| s.start_time())
    );

    Ok(())
}
14 changes: 8 additions & 6 deletions src/io/mzmlb/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -687,9 +687,9 @@ pub struct MzMLbReaderType<
}

impl<'a, 'b: 'a, C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPeakAdapting + BuildFromArrayMap> MzMLbReaderType<C, D> {
/// Create a new `[MzMLbReader]` with an internal cache size of `chunk_size` elements
/// Create a new [`MzMLbReader`] with an internal cache size of `chunk_size` elements
/// per data array to reduce the number of disk reads needed to populate spectra, and
/// sets the `[DetailLevel]`.
/// sets the [`DetailLevel`].
pub fn with_chunk_size_and_detail_level<P: AsRef<Path>>(
path: &P,
chunk_size: usize,
Expand Down Expand Up @@ -733,7 +733,7 @@ impl<'a, 'b: 'a, C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPea
Ok(inst)
}

/// Create a new `[MzMLbReader]` with an internal cache size of `chunk_size` elements
/// Create a new [`MzMLbReader`] with an internal cache size of `chunk_size` elements
/// per data array to reduce the number of disk reads needed to populate spectra.
///
/// The default chunk size is 2**20 elements, which can use as much as 8.4 MB for 64-bit
Expand Down Expand Up @@ -772,7 +772,7 @@ impl<'a, 'b: 'a, C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPea
Ok(buf)
}

/// Create a new `[MzMLbReader]` with the default caching behavior.
/// Create a new [`MzMLbReader`] with the default caching behavior.
pub fn new<P: AsRef<Path>>(path: &P) -> io::Result<Self> {
    // Use the registry's default chunk size for the per-data-array cache.
    let chunk_size = ExternalDataRegistry::default_chunk_size();
    Self::with_chunk_size(path, chunk_size)
}
Expand Down Expand Up @@ -870,7 +870,9 @@ impl<'a, 'b: 'a, C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPea
}
}

/// [`MzMLReaderType`] instances are [`Iterator`]s over [`MultiLayerSpectrum`]
/// [`MzMLbReaderType`] instances are [`Iterator`]s over [`MultiLayerSpectrum`], like all
/// file format readers. This involves advancing the position of the internal mzML file
/// reader in-place without seeking.
impl<C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPeakAdapting + BuildFromArrayMap> Iterator for MzMLbReaderType<C, D> {
type Item = MultiLayerSpectrum<C, D>;

Expand Down Expand Up @@ -992,7 +994,7 @@ impl<C: CentroidPeakAdapting + BuildFromArrayMap, D: DeconvolutedPeakAdapting +
#[allow(unused)]
/// The underlying HDF5 library for Rust, [`hdf5`](https://docs.rs/hdf5/latest/hdf5/) doesn't
/// support reading directly from Rust [`io::Read`]-implementing objects yet. Without a means
/// of retrieving a [`path::Path`]-like value from a file handle, with the [`filename`](https://docs.rs/filename/latest/filename/)
/// of retrieving a [`path::Path`](std::path::Path)-like value from a file handle, with the [`filename`](https://docs.rs/filename/latest/filename/)
/// library, this method **panics**. Enable this extra feature if you would like this method to
/// work, but it is reported to have obscure compilation errors.
fn open_file(source: fs::File) -> Self {
Expand Down
53 changes: 27 additions & 26 deletions src/io/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,27 @@ pub enum SpectrumAccessError {
IOError(#[source] Option<io::Error>),
}

impl From<SpectrumAccessError> for io::Error {
fn from(value: SpectrumAccessError) -> Self {
let s = value.to_string();
match value {
SpectrumAccessError::SpectrumNotFound => io::Error::new(io::ErrorKind::NotFound, s),
SpectrumAccessError::SpectrumIdNotFound(_) => io::Error::new(io::ErrorKind::NotFound, s),
SpectrumAccessError::SpectrumIndexNotFound(_) => io::Error::new(io::ErrorKind::NotFound, s),
SpectrumAccessError::IOError(e) => {
match e {
Some(e) => {
e
},
None => {
io::Error::new(io::ErrorKind::Other, s)
},
}
},
}
}
}

/// An extension of [`ScanSource`] that supports relocatable iteration relative to a
/// specific spectrum coordinate or identifier.
pub trait RandomAccessSpectrumIterator<
Expand Down Expand Up @@ -667,6 +688,7 @@ pub trait SpectrumGrouping<
}
}

/// Analogous to [`RandomAccessSpectrumIterator`], but for [`SpectrumGrouping`] implementations.
pub trait RandomAccessSpectrumGroupingIterator<
C: CentroidLike + Default = CentroidPeak,
D: DeconvolutedCentroidLike + Default = DeconvolutedPeak,
Expand All @@ -677,30 +699,9 @@ pub trait RandomAccessSpectrumGroupingIterator<
fn start_from_id(&mut self, id: &str) -> Result<&Self, SpectrumAccessError>;
fn start_from_index(&mut self, index: usize) -> Result<&Self, SpectrumAccessError>;
fn start_from_time(&mut self, time: f64) -> Result<&Self, SpectrumAccessError>;
fn reset_state(&mut self);
}

/// Implements [`RandomAccessSpectrumGroupingIterator`] for [`SpectrumGroupingIterator`] by
/// forwarding each call to a method of the same name on the iterator.
///
/// NOTE(review): these calls only terminate if `SpectrumGroupingIterator` has inherent
/// `start_from_id`/`start_from_index`/`start_from_time` methods, which take precedence over
/// trait methods during method resolution; otherwise each call would re-enter this
/// impl. Confirm against the type's inherent `impl` block.
impl<
R: RandomAccessSpectrumIterator<C, D, S>,
C: CentroidLike + Default,
D: DeconvolutedCentroidLike + Default,
S: SpectrumLike<C, D>,
G: SpectrumGrouping<C, D, S>,
> RandomAccessSpectrumGroupingIterator<C, D, S, G> for SpectrumGroupingIterator<R, C, D, S, G>
{
fn start_from_id(&mut self, id: &str) -> Result<&Self, SpectrumAccessError> {
self.start_from_id(id)
}

fn start_from_index(&mut self, index: usize) -> Result<&Self, SpectrumAccessError> {
self.start_from_index(index)
}

fn start_from_time(&mut self, time: f64) -> Result<&Self, SpectrumAccessError> {
self.start_from_time(time)
}
}



/// A collection of spectra held in memory but providing an interface
/// identical to a data file. This structure owns its data, so in order
Expand All @@ -711,7 +712,7 @@ pub struct MemoryScanSource<
D: DeconvolutedCentroidLike + Default = DeconvolutedPeak,
S: SpectrumLike<C, D> = MultiLayerSpectrum<C, D>,
> {
spectra: Vec<S>,
spectra: VecDeque<S>,
position: usize,
offsets: OffsetIndex,
_c: PhantomData<C>,
Expand All @@ -724,7 +725,7 @@ impl<
S: SpectrumLike<C, D> + Clone,
> MemoryScanSource<C, D, S>
{
pub fn new(spectra: Vec<S>) -> Self {
pub fn new(spectra: VecDeque<S>) -> Self {
let mut offsets = OffsetIndex::new("spectrum".to_string());
spectra.iter().enumerate().for_each(|(i, s)| {
offsets.insert(s.id().to_string(), i as u64);
Expand Down Expand Up @@ -834,9 +835,9 @@ impl<
C: CentroidLike + Default,
D: DeconvolutedCentroidLike + Default,
S: SpectrumLike<C, D> + Clone,
> From<Vec<S>> for MemoryScanSource<C, D, S>
> From<VecDeque<S>> for MemoryScanSource<C, D, S>
{
fn from(value: Vec<S>) -> Self {
fn from(value: VecDeque<S>) -> Self {
Self::new(value)
}
}
Expand Down
31 changes: 30 additions & 1 deletion src/meta/file_description.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::impl_param_described;
use crate::params::{Param, ParamDescribed, ParamList};
use crate::params::{Param, ParamDescribed, ParamList, CURIE, ControlledVocabulary};

#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SourceFile {
Expand All @@ -17,6 +17,35 @@ pub struct FileDescription {
pub source_files: Vec<SourceFile>,
}

impl FileDescription {
pub fn new(contents: ParamList, source_files: Vec<SourceFile>) -> Self {
Self {
contents,
source_files,
}
}

/// Checks to see if the "MS1 spectrum" term is present in the file contents
///
/// **Note**: This does not actually inspect the spectra in the file, only the metadata,
/// which may be incorrect/missing.
pub fn has_ms1_spectra(&self) -> bool {
self.get_param_by_curie(&CURIE::new(ControlledVocabulary::MS, 1000579)).is_some()
}

/// Checks to see if the "MSn spectrum" term is present in the file contents.
///
/// **Note**: This does not actually inspect the spectra in the file, only the metadata,
/// which may be incorrect/missing.
pub fn has_msn_spectra(&self) -> bool {
self.get_param_by_curie(&CURIE::new(ControlledVocabulary::MS, 1000580)).is_some()
}

pub fn has_contents(&self) -> bool {
self.contents.len() > 0
}
}

impl_param_described!(SourceFile);

impl ParamDescribed for FileDescription {
Expand Down
78 changes: 67 additions & 11 deletions src/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::fmt::Display;
use std::num;
use std::str::{self, FromStr};

use thiserror::Error;

#[doc(hidden)]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum ValueType {
Expand All @@ -15,6 +18,61 @@ pub enum ValueType {
Other(Box<Vec<u8>>)
}

/// A compact identifier made of a controlled vocabulary and a numeric accession
/// within that vocabulary.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct CURIE {
/// The controlled vocabulary the accession belongs to
controlled_vocabulary: ControlledVocabulary,
/// The numeric accession within the controlled vocabulary
accession: u32
}

impl CURIE {
pub const fn new(cv_id: ControlledVocabulary, accession: u32) -> Self { Self { controlled_vocabulary: cv_id, accession } }
}

/// Compare a [`CURIE`] with any [`ParamLike`] value.
///
/// The two are equal only when the parameter carries both a controlled vocabulary and an
/// accession, and both match the ones stored in this [`CURIE`].
impl<T: ParamLike> PartialEq<T> for CURIE {
    fn eq(&self, other: &T) -> bool {
        // Compare through the `Option`s rather than unwrapping, so a parameter that lacks
        // a vocabulary or an accession compares unequal instead of panicking.
        other.controlled_vocabulary() == Some(self.controlled_vocabulary)
            && other.accession() == Some(self.accession)
    }
}

/// Errors that can occur while parsing a [`CURIE`] from a string.
#[derive(Debug, Error)]
pub enum CURIEParsingError {
/// The controlled vocabulary prefix could not be resolved
#[error("{0} is not a recognized controlled vocabulary")]
UnknownControlledVocabulary(#[from] #[source] ControlledVocabularyResolutionError),
/// The accession part could not be parsed as an integer
#[error("Failed to parse accession number {0}")]
AccessionParsingError(#[from] #[source] num::ParseIntError),
/// The input did not contain a `:` separating the vocabulary from the accession
#[error("Did not detect a namespace separator ':' token")]
MissingNamespaceSeparator
}

/// Parse a [`CURIE`] from a string of the form `<vocabulary>:<accession>`.
///
/// # Errors
/// - [`CURIEParsingError::MissingNamespaceSeparator`] if the input contains no `:`
/// - [`CURIEParsingError::UnknownControlledVocabulary`] if the part before the first `:`
///   does not parse as a [`ControlledVocabulary`]
/// - [`CURIEParsingError::AccessionParsingError`] if the part after the first `:` does not
///   parse as an unsigned integer
///
/// Anything following a second `:` is ignored.
impl FromStr for CURIE {
    type Err = CURIEParsingError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mut tokens = s.split(':');
        // `split` always yields a first token, so only the second can be missing,
        // which happens exactly when the input has no ':' in it.
        let (cv, accession) = match (tokens.next(), tokens.next()) {
            (Some(cv), Some(accession)) => (cv, accession),
            _ => return Err(CURIEParsingError::MissingNamespaceSeparator),
        };

        let cv: ControlledVocabulary = cv.parse()?;
        let accession: u32 = accession.parse()?;
        Ok(CURIE::new(cv, accession))
    }
}

pub fn curie_to_num(curie: &str) -> (Option<ControlledVocabulary>, Option<u32>) {
let mut parts = curie.split(':');
let prefix = match parts.next() {
Expand Down Expand Up @@ -46,10 +104,6 @@ pub trait ParamLike {
}
}

// fn parse<T: str::FromStr>(&self) -> Result<T, T::Err> {
// self.value().parse::<T>()
// }

/// Parse this parameter's value into any type implementing [`str::FromStr`],
/// returning that type's own parse error on failure.
fn parse<T: str::FromStr>(&self) -> Result<T, T::Err> {
    // The target type is inferred from the return type.
    self.value().parse()
}
Expand Down Expand Up @@ -353,15 +407,13 @@ impl ControlledVocabulary {
}

#[doc(hidden)]
#[derive(Debug, Clone)]
pub enum ControlledVocabularyResolutionError {}

impl Display for ControlledVocabularyResolutionError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(format!("{:?}", self).as_str())
}
/// Error produced when a string cannot be resolved to a [`ControlledVocabulary`].
#[derive(Debug, Clone, Error)]
pub enum ControlledVocabularyResolutionError {
/// The vocabulary was not recognized; the payload is included in the error message
/// (presumably the offending input, confirm against the `FromStr` implementation)
#[error("Unrecognized controlled vocabulary {0}")]
UnknownControlledVocabulary(String)
}


impl FromStr for ControlledVocabulary {
type Err = ControlledVocabularyResolutionError;

Expand Down Expand Up @@ -394,6 +446,10 @@ pub trait ParamDescribed {
self.params().iter().find(|&param| param.name == name)
}

/// Find the first parameter that compares equal to `curie`, if any.
fn get_param_by_curie(&self, curie: &CURIE) -> Option<&Param> {
    self.params().iter().find(|param| *curie == **param)
}

fn get_param_by_accession(&self, accession: &str) -> Option<&Param> {
let (cv, acc_num) = curie_to_num(accession);
return self
Expand Down
5 changes: 4 additions & 1 deletion src/prelude.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
//! A set of foundational traits used throughout the library.
pub use crate::io::traits::{
MZFileReader, RandomAccessSpectrumIterator, SpectrumAccessError, ScanSource, ScanWriter, SeekRead,
SpectrumGrouping, SpectrumIterator,
SpectrumGrouping, SpectrumIterator, RandomAccessSpectrumGroupingIterator,
};

pub use crate::meta::MSDataFileMetadata;
pub use crate::params::{ParamDescribed, ParamLike};
pub use crate::spectrum::{IonProperties, PrecursorSelection, SpectrumLike};
pub use crate::spectrum::bindata::{ByteArrayView, ByteArrayViewMut, BuildArrayMapFrom, BuildFromArrayMap};

#[cfg(feature = "mzsignal")]
pub use crate::spectrum::group::SpectrumGroupAveraging;

#[doc(hidden)]
pub use std::convert::TryInto;
#[doc(hidden)]
Expand Down
Loading

0 comments on commit c26db89

Please sign in to comment.