From 8ecadd6021a14f0b7cb690b4d56d8c470c345583 Mon Sep 17 00:00:00 2001 From: J Wokaty Date: Tue, 30 Apr 2024 11:26:36 -0400 Subject: [PATCH 01/41] bump x.y.z version to even y prior to creation of RELEASE_3_19 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5e2617d8..da8033b9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.13.8 +Version: 1.14.0 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different From 051567eeff94281b916c381156faaf6278518f12 Mon Sep 17 00:00:00 2001 From: J Wokaty Date: Tue, 30 Apr 2024 11:26:36 -0400 Subject: [PATCH 02/41] bump x.y.z version to odd y following creation of RELEASE_3_19 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index da8033b9..26055771 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.14.0 +Version: 1.15.0 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different From 24bdf519e993279014d928ad2ec57b9006c80dbf Mon Sep 17 00:00:00 2001 From: jorainer Date: Thu, 16 May 2024 07:44:53 +0200 Subject: [PATCH 03/41] feat: add dataStorageBasePath method --- NAMESPACE | 3 +++ R/AllGenerics.R | 4 ++++ R/MsBackend.R | 31 ++++++++++++++++++++++++++++++ R/MsBackendMzR.R | 16 +++++++++++++++ R/Spectra.R | 9 +++++++++ man/MsBackend.Rd | 12 ++++++++++++ tests/testthat/test_MsBackend.R | 6 ++++++ tests/testthat/test_MsBackendMzR.R | 14 ++++++++++++++ 8 files changed, 95 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 668b6c50..2d56f253 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,6 +49,7 @@ exportMethods("centroided<-") exportMethods("collisionEnergy<-") exportMethods("dataOrigin<-") exportMethods("dataStorage<-") +exportMethods("dataStorageBasePath<-") exportMethods("intensity<-") exportMethods("isolationWindowLowerMz<-") exportMethods("isolationWindowTargetMz<-") @@ -77,6 +78,7 @@ exportMethods(containsMz) exportMethods(containsNeutralLoss) exportMethods(dataOrigin) exportMethods(dataStorage) +exportMethods(dataStorageBasePath) exportMethods(dropNaSpectraVariables) exportMethods(entropy) exportMethods(export) @@ -156,6 +158,7 @@ importFrom(MsCoreUtils,coefMA) importFrom(MsCoreUtils,coefSG) importFrom(MsCoreUtils,coefWMA) importFrom(MsCoreUtils,common) +importFrom(MsCoreUtils,common_path) importFrom(MsCoreUtils,entropy) importFrom(MsCoreUtils,group) importFrom(MsCoreUtils,i2index) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index f68500ad..0b69bdaf 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -11,6 +11,10 @@ setGeneric("containsMz", function(object, ...) #' @rdname hidden_aliases setGeneric("containsNeutralLoss", function(object, ...) standardGeneric("containsNeutralLoss")) +setGeneric("dataStorageBasePath", function(object, ...) + standardGeneric("dataStorageBasePath")) +setGeneric("dataStorageBasePath<-", function(object, ..., value) + standardGeneric("dataStorageBasePath<-")) #' @rdname hidden_aliases setGeneric("dropNaSpectraVariables", function(object, ...) 
standardGeneric("dropNaSpectraVariables")) diff --git a/R/MsBackend.R b/R/MsBackend.R index 9528c628..dc73ee5f 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -11,6 +11,10 @@ #' @aliases backendInitialize #' @aliases backendParallelFactor,MsBackendMzR-method #' @aliases backendParallelFactor,MsBackendHdf5Peaks-method +#' @aliases dataStorageBasePath +#' @aliases dataStorageBasePath,MsBackendMzR-method +#' @aliases dataStorageBasePath<- +#' @aliases dataStorageBasePath<-,MsBackendMzR-method #' #' @description #' @@ -280,6 +284,16 @@ #' spectra in `object` with the data storage of each spectrum. Note that #' missing values (`NA_character_`) are not supported for `dataStorage`. #' +#' - `dataStorageBasePath()`, `dataStorageBasePath<-: gets or sets the common +#' *base* path of the directory containing all data files. If supported, +#' the function is expected to return (or accept) a `character` of length 1. +#' Most backends (such as for example the `MsBackendMemory` will not support +#' this function and `dataStorageBasePath()` will return `NA_character_`. +#' For `MsBackendMzR`, this function allows to get or change the path to the +#' directory containing the original data files, which is required if e.g. +#' a serialized `MsBackendMzR` instance gets copied to another computer or +#' file system. +#' #' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the #' object's `spectraData` that contain only missing values (`NA`). Note that #' while columns with only `NA`s are removed, a `spectraData()` call after @@ -1711,3 +1725,20 @@ setReplaceMethod("[[", "MsBackend", function(x, i, j, ..., value) { setMethod("uniqueMsLevels", "MsBackend", function(object, ...) { unique(msLevel(object)) }) + +#' @exportMethod dataStorageBasePath +#' +#' @rdname MsBackend +setMethod("dataStorageBasePath", "MsBackend", function(object) { + NA_character_ +}) + +#' @exportMethod dataStorageBasePath<- +#' +#' @rdname MsBackend +setReplaceMethod( + "dataStorageBasePath", "MsBackend", function(object, value) { + warning(class(object)[1L], " does not support changing", + " 'dataStorageBasePath'.") + object + }) diff --git a/R/MsBackendMzR.R b/R/MsBackendMzR.R index 74b00308..2e8cafd1 100644 --- a/R/MsBackendMzR.R +++ b/R/MsBackendMzR.R @@ -214,3 +214,19 @@ setMethod("export", "MsBackendMzR", function(object, x, file = tempfile(), setMethod("backendParallelFactor", "MsBackendMzR", function(object) { factor(dataStorage(object), levels = unique(dataStorage(object))) }) + +#' @importFrom MsCoreUtils common_path +setMethod("dataStorageBasePath", "MsBackendMzR", function(object) { + common_path(dataStorage(object)) +}) + +setReplaceMethod( + "dataStorageBasePath", "MsBackendMzR", function(object, value) { + ds <- dataStorage(object) + cp <- common_path(ds) + ds <- sub(cp, value, ds, fixed = TRUE) + if (!all(file.exists(unique(ds)))) + stop("Provided path does not contain all data files.") + dataStorage(object) <- ds + object + }) diff --git a/R/Spectra.R b/R/Spectra.R index 78a74464..9b6dfb5f 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -136,6 +136,15 @@ NULL #' - `...`: additional parameters specific for the `MsBackend` passed with #' parameter `backend`. #' +#' The `dataStorageBasePath()` and `dataStoragePath<-` functions allow, for +#' backend classes that support this operation, to get or change the *base* +#' path to the directory where the backend stores the data. 
In-memory backends +#' such as [MsBackendMemory] or [MsBackendDataFrame] keeping all MS data in +#' memory neither support nor need this function, but for [MsBackendMzR] this +#' function can be used to update/adapt the path to the directory containing +#' the original data files. Thus, for `Spectra` objects (using this backend) +#' that were moved to another file system or computer, these functions allow to +#' adjust/adapt the base file path. #' #' @section Accessing spectra data: #' diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index 1ee1d331..2e9292e9 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -17,6 +17,10 @@ \alias{backendInitialize} \alias{backendParallelFactor,MsBackendMzR-method} \alias{backendParallelFactor,MsBackendHdf5Peaks-method} +\alias{dataStorageBasePath} +\alias{dataStorageBasePath,MsBackendMzR-method} +\alias{dataStorageBasePath<-} +\alias{dataStorageBasePath<-,MsBackendMzR-method} \alias{backendBpparam,MsBackend-method} \alias{backendInitialize,MsBackend-method} \alias{backendMerge,list-method} @@ -93,6 +97,8 @@ \alias{$<-,MsBackend-method} \alias{[[,MsBackend-method} \alias{[[<-,MsBackend-method} +\alias{dataStorageBasePath,MsBackend-method} +\alias{dataStorageBasePath<-,MsBackend-method} \alias{MsBackendDataFrame} \alias{backendInitialize,MsBackendDataFrame-method} \alias{MsBackendHdf5Peaks} @@ -269,6 +275,10 @@ \S4method{uniqueMsLevels}{MsBackend}(object, ...) +\S4method{dataStorageBasePath}{MsBackend}(object) + +\S4method{dataStorageBasePath}{MsBackend}(object) <- value + MsBackendDataFrame() \S4method{backendInitialize}{MsBackendDataFrame}(object, data, peaksVariables = c("mz", "intensity"), ...) @@ -559,6 +569,8 @@ e.g. be the mzML file from which the data was read. \item \code{dataStorage()}: gets a \code{character} of length equal to the number of spectra in \code{object} with the data storage of each spectrum. Note that missing values (\code{NA_character_}) are not supported for \code{dataStorage}. +\item \code{dataStorageBasePath()}, \code{dataStorageBasePath<-}: gets or sets the common \emph{base} path of the directory containing all data files. If supported, the function is expected to return (or accept) a \code{character} of length 1. Most backends (such as for example the \code{MsBackendMemory}) will not support this function and \code{dataStorageBasePath()} will return \code{NA_character_}. For \code{MsBackendMzR}, this function allows to get or change the path to the directory containing the original data files, which is required if e.g. a serialized \code{MsBackendMzR} instance gets copied to another computer or +file system. \item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the object's \code{spectraData} that contain only missing values (\code{NA}).
Note that while columns with only \code{NA}s are removed, a \code{spectraData()} call after diff --git a/tests/testthat/test_MsBackend.R b/tests/testthat/test_MsBackend.R index 3d3f7e28..d80bd757 100644 --- a/tests/testthat/test_MsBackend.R +++ b/tests/testthat/test_MsBackend.R @@ -75,3 +75,9 @@ test_that("backendBpparam,MsBackend works", { test_that("backendParallelFactor,MsBackend works", { expect_equal(backendParallelFactor(MsBackendMemory()), factor()) }) + +test_that("dataStorageBasePath,MsExperiment works", { + expect_identical(dataStorageBasePath(MsBackendMemory()), NA_character_) + tmp <- MsBackendMemory() + expect_warning(dataStorageBasePath(tmp) <- "/", "not support") +}) diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index ff891738..318a73ca 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -570,3 +570,17 @@ test_that("backendParallelFactor,MsBackendMzR", { factor(dataStorage(sciex_mzr), levels = unique(dataStorage(sciex_mzr)))) }) + +test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { + tmpd <- tempdir() + file.copy(sciex_file, tmpd) + + expect_equal(dataStorageBasePath(sciex_mzr), + MsCoreUtils::common_path(sciex_file)) + tmp <- sciex_mzr + dataStorageBasePath(tmp) <- tmpd + expect_equal(dataStorageBasePath(tmp), tmpd) + + #' errors + expect_error(dataStorageBasePath(tmp) <- "some path", "Provided path") +}) From 4c7d03acab6ed77391ba13de9997f5809fcc0972 Mon Sep 17 00:00:00 2001 From: jorainer Date: Thu, 16 May 2024 11:42:05 +0200 Subject: [PATCH 04/41] docs: bump version --- DESCRIPTION | 2 +- NEWS.md | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 26055771..40e8d371 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.0 +Version: 1.15.1 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index 0f67f490..b4a949e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# Spectra 1.15 + +## Changes in 1.15.2 + + + # Spectra 1.13 ## Changes in 1.13.8 From 6ae16cbf53663620867bc5c91b46032994aa09f6 Mon Sep 17 00:00:00 2001 From: jorainer Date: Thu, 16 May 2024 11:57:37 +0200 Subject: [PATCH 05/41] docs: expand documentation on combineSpectra - Expand the documentation for `combineSpectra()` and `combinePeaks()` (issue #320). 
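As a quick standalone illustration of the distinction the expanded documentation draws (a minimal sketch, not part of the diff; the toy object, compound name and values below are invented for the example):

```r
library(Spectra)

## Two MS2 spectra of the same compound; the first one contains two mass
## peaks with nearly identical m/z values (~10 ppm apart).
d <- data.frame(msLevel = c(2L, 2L), name = c("Compound A", "Compound A"))
d$mz <- list(c(100.001, 100.002, 200.01), c(100.0015, 200.01))
d$intensity <- list(c(10, 20, 30), c(15, 25))
sps <- Spectra(d)

## combinePeaks(): aggregates mass peaks *within* each spectrum; the result
## still has 2 spectra, but the two ~100.00 peaks of the first spectrum are
## merged into a single peak.
peaksData(combinePeaks(sps, ppm = 20))

## combineSpectra(): aggregates *sets of spectra* into one spectrum per set;
## both spectra share the same `name`, so a single spectrum is returned.
combineSpectra(sps, f = sps$name)
```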
--- .github/workflows/check-bioc.yml | 6 +++--- DESCRIPTION | 2 +- NEWS.md | 7 ++++++ R/Spectra.R | 18 +++++++++------- man/Spectra.Rd | 18 +++++++++------- vignettes/Spectra.Rmd | 37 +++++++++++++++++++++----------- 6 files changed, 55 insertions(+), 33 deletions(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index c7c036d7..296f3674 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -52,9 +52,9 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: 'next', bioc: '3.19'} - - { os: windows-latest, r: 'next', bioc: '3.19'} + - { os: ubuntu-latest, r: '4.4', bioc: '3.20', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: macOS-latest, r: '4.4', bioc: '3.20'} + - { os: windows-latest, r: '4.4', bioc: '3.20'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} diff --git a/DESCRIPTION b/DESCRIPTION index 26055771..40e8d371 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.0 +Version: 1.15.1 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index 0f67f490..f72e9681 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# Spectra 1.15 + +## Changes in 1.15.1 + +- Improve documentation for `combineSpectra()` and `combinePeaks()` [issue + #320](https://github.com/rformassspectrometry/Spectra/issues/320). + # Spectra 1.13 ## Changes in 1.13.8 diff --git a/R/Spectra.R b/R/Spectra.R index a1e58193..690a8fd5 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -594,7 +594,7 @@ NULL #' access and it is possible to *revert* the operation with the `reset()` #' function (see description of `reset()` above). #' -#' - `combinePeaks()`: combines mass peaks within each spectrum with a +#' - `combinePeaks()`: combines mass peaks **within each spectrum** with a #' difference in their m/z values that is smaller than the maximal #' acceptable difference defined by `ppm` and `tolerance`. Parameters #' `intensityFun` and `mzFun` allow to define functions to aggregate the @@ -612,13 +612,15 @@ NULL #' `reduceSpectra()` for a function to select a single *representative* #' mass peak for each peak group. #' -#' - `combineSpectra()`: combines sets of spectra into a single spectrum per -#' set. For each spectrum group (set), spectra variables from the first -#' spectrum are used and the peak matrices are combined using the function -#' specified with `FUN`, which defaults to [combinePeaksData()]. Please -#' refer to the [combinePeaksData()] help page for details and options of -#' the actual combination of peaks across the sets of spectra and to the -#' package vignette for examples and alternative ways to aggregate spectra. +#' - `combineSpectra()`: combines MS data from **sets of spectra into a +#' single spectrum per set** (in contrast to `combinePeaks()` or +#' `reduceSpectra()` that combine mass peaks **within each spectrum**). 
+#' For each spectrum group (set), spectra variables from the first spectrum +#' are used and the peak matrices are combined using the function specified +#' with `FUN`, which defaults to [combinePeaksData()]. Please refer to the +#' [combinePeaksData()] help page for details and options of the actual +#' combination of peaks across the sets of spectra and to the package +#' vignette for examples and alternative ways to aggregate spectra. #' The sets of spectra can be specified with parameter `f`. #' In addition it is possible to define, with parameter `p` if and how to #' split the input data for parallel processing. diff --git a/man/Spectra.Rd b/man/Spectra.Rd index c90ad700..4ff4f5ed 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -1348,7 +1348,7 @@ same bin are aggregated using the function provided with parameter \code{FUN} the binning operation is applied to the peak data on-the-fly upon data access and it is possible to \emph{revert} the operation with the \code{reset()} function (see description of \code{reset()} above). -\item \code{combinePeaks()}: combines mass peaks within each spectrum with a +\item \code{combinePeaks()}: combines mass peaks \strong{within each spectrum} with a difference in their m/z values that is smaller than the maximal acceptable difference defined by \code{ppm} and \code{tolerance}. Parameters \code{intensityFun} and \code{mzFun} allow to define functions to aggregate the @@ -1364,13 +1364,15 @@ dropped (i.e. their values are replaced with \code{NA}) for combined peaks unless they are constant across the combined peaks. See also \code{reduceSpectra()} for a function to select a single \emph{representative} mass peak for each peak group. -\item \code{combineSpectra()}: combines sets of spectra into a single spectrum per -set. For each spectrum group (set), spectra variables from the first -spectrum are used and the peak matrices are combined using the function -specified with \code{FUN}, which defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}. Please -refer to the \code{\link[=combinePeaksData]{combinePeaksData()}} help page for details and options of -the actual combination of peaks across the sets of spectra and to the -package vignette for examples and alternative ways to aggregate spectra. +\item \code{combineSpectra()}: combines MS data from \strong{sets of spectra into a +single spectrum per set} (in contrast to \code{combinePeaks()} or +\code{reduceSpectra()} that combine mass peaks \strong{within each spectrum}). +For each spectrum group (set), spectra variables from the first spectrum +are used and the peak matrices are combined using the function specified +with \code{FUN}, which defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}. Please refer to the +\code{\link[=combinePeaksData]{combinePeaksData()}} help page for details and options of the actual +combination of peaks across the sets of spectra and to the package +vignette for examples and alternative ways to aggregate spectra. The sets of spectra can be specified with parameter \code{f}. In addition it is possible to define, with parameter \code{p} if and how to split the input data for parallel processing. 
diff --git a/vignettes/Spectra.Rmd b/vignettes/Spectra.Rmd index 74bcb1fb..753d2bb9 100644 --- a/vignettes/Spectra.Rmd +++ b/vignettes/Spectra.Rmd @@ -412,11 +412,14 @@ affecting the total number of spectra are (in alphabetic order): to specify whether all conditions (`match = "all"`; the default) or if any of the conditions must match (`match = "any"`; all spectra for which values are within any of the provided ranges are retained). +- `combineSpectra()`: allows to combine the MS data from sets of spectra into a + single spectrum per set. Thus, instead of filtering the data, this function + aggregates it. Filter functions that return the same number of spectra, but affect/subset the peaks data (m/z and intensity values) within each spectrum are: -- `combinePeaks()`: groups peaks within each spectrum based on similarity of +- `combinePeaks()`: groups peaks **within each spectrum** based on similarity of their m/z values and combines these into a single peak per peak group. - `deisotopeSpectra()`: deisotopes each individual spectrum keeping only the monoisotopic peak for peaks groups of potential isotopologues. @@ -871,18 +874,26 @@ See also `?plotSpectra` for more plotting options and examples. The `Spectra` package provides the `combineSpectra()` function that allows to *aggregate* multiple spectra into a single one. The main parameters of this -function are `f`, which defines the grouping of the spectra, and `FUN` which -allows to define the function that performs the actual aggregation. The default -aggregation function is `combinePeaksData()` (see `?combinePeaksData` for -details) that combines multiple spectra into a single spectrum with all peaks -from all input spectra (with additional paramter `peaks = "union"`), or peaks -that are present in a certain proportion of input spectra (with parameter -`peaks = "intersect"`; parameter `minProp` allows to define the minimum -required proportion of spectra in which a peak needs to be present. Below we -use this function to combine the spectra for 1-methylhistidine and caffeine -into a single spectrum for each compound. We use the spectra variable `$name`, -that contains the names of the compounds, to define which spectra should be -grouped together. +function are `f`, which defines the sets of spectra that should be combined, and +`FUN`, which allows to define the function that performs the actual +aggregation. The default aggregation function is `combinePeaksData()` (see +`?combinePeaksData` for details) that combines multiple spectra into a single +spectrum with all peaks from all input spectra (with additional parameter `peaks += "union"`), or peaks that are present in a certain proportion of input spectra +(with parameter `peaks = "intersect"`; parameter `minProp` allows to define the +minimum required proportion of spectra in which a peak needs to be present). It +is important to mention that, by default, the function combines all mass peaks +from all spectra with a similar m/z value into a single, representative mass +peak aggregating all their intensities into one. To avoid the resulting +intensity being affected by potential noise peaks, it might be advisable to first +*clean* the individual mass spectra using e.g. the `combinePeaks()` or +`reduceSpectra()` functions that first aggregate mass peaks **within** each +individual spectrum. + +In the example below we use `combineSpectra()` to combine the spectra for +1-methylhistidine and caffeine into a single spectrum for each compound.
We use +the spectra variable `$name`, that contains the names of the compounds, to +define which spectra should be grouped together. ```{r} sps_agg <- combineSpectra(sps, f = sps$name) From 8527d64d754a1dc4e5974d5795761dd05cd68f31 Mon Sep 17 00:00:00 2001 From: jorainer Date: Fri, 17 May 2024 08:15:49 +0200 Subject: [PATCH 06/41] fix: paths for Windows --- R/MsBackendMzR.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/MsBackendMzR.R b/R/MsBackendMzR.R index 2e8cafd1..7cadc0d5 100644 --- a/R/MsBackendMzR.R +++ b/R/MsBackendMzR.R @@ -223,10 +223,12 @@ setMethod("dataStorageBasePath", "MsBackendMzR", function(object) { setReplaceMethod( "dataStorageBasePath", "MsBackendMzR", function(object, value) { ds <- dataStorage(object) + ds <- gsub("\\", "/", ds, fixed = TRUE) + value <- gsub("\\", "/", value, fixed = TRUE) cp <- common_path(ds) ds <- sub(cp, value, ds, fixed = TRUE) if (!all(file.exists(unique(ds)))) stop("Provided path does not contain all data files.") - dataStorage(object) <- ds + dataStorage(object) <- normalizePath(ds) object }) From 44050a0c31bd1048e0db0dff894f097e4db0ddb0 Mon Sep 17 00:00:00 2001 From: jorainer Date: Fri, 17 May 2024 11:05:59 +0200 Subject: [PATCH 07/41] fix: adapt unit test to Windows OS --- tests/testthat/test_MsBackendMzR.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index 318a73ca..0a4e7954 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -579,7 +579,7 @@ test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { MsCoreUtils::common_path(sciex_file)) tmp <- sciex_mzr dataStorageBasePath(tmp) <- tmpd - expect_equal(dataStorageBasePath(tmp), tmpd) + expect_equal(dataStorageBasePath(tmp), gsub("\\", "/", tmpd, fixed = TRUE)) #' errors expect_error(dataStorageBasePath(tmp) <- "some path", "Provided path") From 05f5e8b48cb307564fdfd1c54c62b42b14ded351 Mon Sep 17 00:00:00 2001 From: jorainer Date: Fri, 17 May 2024 11:56:54 +0200 Subject: [PATCH 08/41] fix: use normalzePath for path comparisons --- tests/testthat/test_MsBackendMzR.R | 4 +++- tests/testthat/test_Spectra.R | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index 0a4e7954..dcbf5ec0 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -579,7 +579,9 @@ test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { MsCoreUtils::common_path(sciex_file)) tmp <- sciex_mzr dataStorageBasePath(tmp) <- tmpd - expect_equal(dataStorageBasePath(tmp), gsub("\\", "/", tmpd, fixed = TRUE)) + expect_true(validObject(tmp)) + bp <- normalizePath(dataStorageBasePath(tmp)) + expect_equal(bp, tmpd) #' errors expect_error(dataStorageBasePath(tmp) <- "some path", "Provided path") diff --git a/tests/testthat/test_Spectra.R b/tests/testthat/test_Spectra.R index 9bdf7bfc..d3db066b 100644 --- a/tests/testthat/test_Spectra.R +++ b/tests/testthat/test_Spectra.R @@ -1901,7 +1901,9 @@ test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { tmp <- sciex_mzr tmp <- Spectra(tmp) dataStorageBasePath(tmp) <- tmpd - expect_equal(dataStorageBasePath(tmp), tmpd) + expect_true(validObject(tmp@backend)) + bp <- normalizePath(dataStorageBasePath(tmp)) + expect_equal(bp, tmpd) #' errors expect_error(dataStorageBasePath(tmp) <- "some path", "Provided path") From 6bafa7d097438f621a18470ac2a78657788019f3 Mon 
Sep 17 00:00:00 2001 From: jorainer Date: Fri, 17 May 2024 12:31:12 +0200 Subject: [PATCH 09/41] fix: use normalizePath also for tempdir() --- tests/testthat/test_MsBackendMzR.R | 2 +- tests/testthat/test_Spectra.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index dcbf5ec0..dee66253 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -572,7 +572,7 @@ test_that("backendParallelFactor,MsBackendMzR", { }) test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { - tmpd <- tempdir() + tmpd <- normalizePath(tempdir()) file.copy(sciex_file, tmpd) diff --git a/tests/testthat/test_Spectra.R b/tests/testthat/test_Spectra.R index d3db066b..b0cda2ca 100644 --- a/tests/testthat/test_Spectra.R +++ b/tests/testthat/test_Spectra.R @@ -1892,7 +1892,7 @@ test_that("entropy,Spectra works", { }) test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { - tmpd <- tempdir() + tmpd <- normalizePath(tempdir()) file.copy(sciex_file, tmpd) tmp <- Spectra(sciex_mzr) From 4a7522e713a7913d6981ce665ac7acf37a21fb71 Mon Sep 17 00:00:00 2001 From: jorainer Date: Wed, 5 Jun 2024 14:00:53 +0200 Subject: [PATCH 10/41] refactor: call functions for the processingQueue using namespace :: - Use direct namespace calls for functions used in the `Spectra`'s processing queue (i.e. using `MsCoreUtils::`) to ensure functions are called/available if run in parallel on Windows or called on a re-loaded object. --- DESCRIPTION | 2 +- NEWS.md | 7 +++++++ R/peaks-functions.R | 28 +++++++++++++++------------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 44b8e8a3..f61a77b5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.2 +Version: 1.15.3 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index 2babe28f..73614759 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ # Spectra 1.15 +## Changes in 1.15.3 + +- For evaluation of the `Spectra`'s processing queue: call functions from the + *MsCoreUtils* and *MetaboCoreUtils* packages directly through their namespace + (e.g. `MsCoreUtils::`) to avoid errors if performed in parallel on Windows + machines or if called on a re-loaded object. + ## Changes in 1.15.2 - Add `dataStorageDataPath()` and `dataStorageDataPath<-` methods to allow diff --git a/R/peaks-functions.R b/R/peaks-functions.R index 7419dd44..4c8ae8cf 100644 --- a/R/peaks-functions.R +++ b/R/peaks-functions.R @@ -87,7 +87,7 @@ NULL msLevel = spectrumMsLevel, ...) { if (!spectrumMsLevel %in% msLevel || !length(x)) return(x) - x[which(between(x[, "intensity"], intensity)), , drop = FALSE] + x[which(MsCoreUtils::between(x[, "intensity"], intensity)), , drop = FALSE] } #' @description @@ -146,8 +146,9 @@ NULL keep = TRUE, ...)
{ if (!spectrumMsLevel %in% msLevel || !length(x)) return(x) - no_match <- is.na(closest(x[, "mz"], mz, tolerance = tolerance, ppm = ppm, - duplicates = "keep", .check = FALSE)) + no_match <- is.na(MsCoreUtils::closest(x[, "mz"], mz, tolerance = tolerance, + ppm = ppm, duplicates = "keep", + .check = FALSE)) if (keep) x[!no_match, , drop = FALSE] else x[no_match, , drop = FALSE] } @@ -170,8 +171,8 @@ NULL if (!spectrumMsLevel %in% msLevel || !length(x)) return(x) if (keep) - x[between(x[, "mz"], mz), , drop = FALSE] - else x[!between(x[, "mz"], mz), , drop = FALSE] + x[MsCoreUtils::between(x[, "mz"], mz), , drop = FALSE] + else x[!MsCoreUtils::between(x[, "mz"], mz), , drop = FALSE] } #' @description @@ -415,14 +416,14 @@ joinPeaksNone <- function(x, y, ...) { return(x) } - n <- noise(x[, 1L], x[, 2L], method = method, ...) + n <- MsCoreUtils::noise(x[, 1L], x[, 2L], method = method, ...) - l <- localMaxima(x[, 2L], hws = halfWindowSize) + l <- MsCoreUtils::localMaxima(x[, 2L], hws = halfWindowSize) p <- which(l & x[, 2L] > (snr * n)) if (k > 0L) { - cbind(mz = refineCentroids(x = x[, 1L], y = x[, 2L], p = p, + cbind(mz = MsCoreUtils::refineCentroids(x = x[, 1L], y = x[, 2L], p = p, k = k, threshold = threshold, descending = descending), intensity = x[p, 2L]) @@ -552,9 +553,10 @@ joinPeaksNone <- function(x, y, ...) { .peaks_deisotope <- function(x, substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"), tolerance = 0, ppm = 10, charge = 1, ...) { - iso_grps <- isotopologues(x, substDefinition = substDefinition, - tolerance = tolerance, ppm = ppm, - charge = charge) + iso_grps <- MetaboCoreUtils::isotopologues( + x, substDefinition = substDefinition, + tolerance = tolerance, ppm = ppm, + charge = charge) if (length(iso_grps)) { rem <- unique(unlist(lapply(iso_grps, `[`, -1), use.names = FALSE)) x[-rem, , drop = FALSE] @@ -614,7 +616,7 @@ joinPeaksNone <- function(x, y, ...) { msLevel = spectrumMsLevel, ...) { if (!spectrumMsLevel %in% msLevel || !length(x)) return(x) - grps <- group(x[, "mz"], tolerance = tolerance, ppm = ppm) + grps <- MsCoreUtils::group(x[, "mz"], tolerance = tolerance, ppm = ppm) lg <- length(grps) if (grps[lg] == lg) return(x) @@ -649,7 +651,7 @@ joinPeaksNone <- function(x, y, ...) { msLevel = spectrumMsLevel, ...) { if (!spectrumMsLevel %in% msLevel || !nrow(x)) return(x) - keep <- is.na(closest(x[, "mz"], precursorMz, ppm = ppm, + keep <- is.na(MsCoreUtils::closest(x[, "mz"], precursorMz, ppm = ppm, tolerance = tolerance, duplicates = "keep", .check = FALSE)) x[keep, , drop = FALSE] From 15c2cf6451793c437fb4ccfc961d225e90b71f2f Mon Sep 17 00:00:00 2001 From: jorainer Date: Wed, 10 Jul 2024 16:05:05 +0200 Subject: [PATCH 11/41] feat: add new `filterPeaksRanges` function - Add a new `filterPeaksRanges` function that allows to filter mass peaks in a `Spectra` by any (combination of) numeric spectra and peaks variables. - Re-organize documentation of filter and subset operations (related to issue #288). 
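A minimal usage sketch of the new filter (not part of the diff; the toy data and values below are invented), combining a range on a spectra variable (`rtime`) with a range on a peaks variable (`mz`) as described in the documentation added by this patch:

```r
library(Spectra)

## Three MS1 spectra at different retention times.
d <- data.frame(rtime = c(12.5, 45.1, 130.4), msLevel = c(1L, 1L, 1L))
d$mz <- list(c(110.1, 250.2, 310.3), c(120.4, 260.5), c(130.6, 270.7))
d$intensity <- list(c(1, 2, 3), c(4, 5), c(6, 7))
s <- Spectra(d)

## Keep, for spectra eluting between 40 and 140 seconds, only mass peaks
## with an m/z between 200 and 300. Spectra outside the retention time
## range get an empty peaks matrix; the number of spectra is unchanged.
res <- filterPeaksRanges(s, rtime = c(40, 140), mz = c(200, 300))

## The filter is added to the processing queue and only applied when the
## peaks data is extracted:
peaksData(res)
```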
--- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 5 + R/Spectra-functions.R | 181 ++++++++++++++ R/Spectra.R | 309 ++++++++++++------------ R/peaks-functions.R | 59 +++++ man/Spectra.Rd | 277 +++++++++++---------- man/filterPeaksRanges.Rd | 137 +++++++++++ tests/testthat/test_Spectra-functions.R | 72 ++++++ tests/testthat/test_peaks-functions.R | 232 ++++++++++++++++++ vignettes/Spectra.Rmd | 96 +++++--- 11 files changed, 1047 insertions(+), 326 deletions(-) create mode 100644 man/filterPeaksRanges.Rd diff --git a/DESCRIPTION b/DESCRIPTION index f61a77b5..082e156a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.3 +Version: 1.15.4 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different @@ -73,7 +73,7 @@ BugReports: https://github.com/RforMassSpectrometry/Spectra/issues URL: https://github.com/RforMassSpectrometry/Spectra biocViews: Infrastructure, Proteomics, MassSpectrometry, Metabolomics Encoding: UTF-8 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Roxygen: list(markdown=TRUE) Collate: 'hidden_aliases.R' diff --git a/NAMESPACE b/NAMESPACE index 269d2e35..9d84e477 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ export(countIdentifications) export(deisotopeSpectra) export(estimatePrecursorIntensity) export(estimatePrecursorMz) +export(filterPeaksRanges) export(filterPrecursorIsotopes) export(filterPrecursorMaxIntensity) export(filterPrecursorPeaks) diff --git a/NEWS.md b/NEWS.md index 73614759..134689bd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # Spectra 1.15 +## Changes in 1.15.4 + +- Add new `filterPeaksRanges()` function to filter mass peaks by ranges on + numeric spectra or peak variables. + ## Changes in 1.15.3 - For evaluation of the `Spectra`'s processing queue: call functions from the diff --git a/R/Spectra-functions.R b/R/Spectra-functions.R index 12a82aea..67d47517 100644 --- a/R/Spectra-functions.R +++ b/R/Spectra-functions.R @@ -1182,3 +1182,184 @@ processingChunkFactor <- function(x) { stop("'x' is supposed to be a 'Spectra' object") .parallel_processing_factor(x) } + +#' @title Filter peaks based on spectra and peaks variable ranges +#' +#' @description +#' +#' The `filterPeaksRanges()` function allows to filter the peaks matrices of a +#' [Spectra] object using any set of range-based filters on numeric spectra +#' variables or peaks variables. These ranges can be passed to the function +#' using the `...` as `<variable name> = <range>` pairs. `<variable name>` +#' has to be an available spectra or peaks variable. `<range>` can be a +#' `numeric` of length 2 defining the lower and upper boundary, or a `numeric` +#' two-column matrix (multi-row matrices are also supported, see further +#' below). `filterPeaksRanges(s, mz = c(200, 300))` would for example reduce +#' the peaks matrices of the `Spectra` object `s` to mass peaks with an m/z +#' value between 200 and 300. `filterPeaksRanges()` returns the original +#' `Spectra` object with the filter operation added to the processing queue. +#' Thus, the filter gets only applied when the peaks data gets extracted +#' with `mz()`, `intensity()` or `peaksData()`.
If ranges for both spectra +#' **and** peaks variables are defined, the function evaluates first whether +#' the spectra variable value for a spectrum is within the provided range and, +#' if so, applies also the peaks variable-based filter (otherwise an empty +#' peaks matrix is returned). +#' +#' If more than one spectra variable and/or peaks variable are defined, their +#' filter results are combined with a logical AND: a peak matrix is only +#' returned for a spectrum if all values of spectra variables are within the +#' provided (respective) ranges for spectra variables, and this matrix is +#' further filtered to contain only those peaks whose values are within the +#' provided peaks variable ranges. +#' +#' **Filtering with multiple ranges** per spectra and peaks variables is also +#' supported: ranges can also be provided as multi-row numeric (two-column) +#' matrices. In this case, the above described procedure is applied for each +#' row separately and their results are combined with a logical OR, i.e. +#' peaks matrices are returned that match any of the conditions/filters +#' of a row. The number of rows of the provided ranges (be it for spectra +#' or peaks variables) has to match. +#' +#' **Missing value handling**: any comparison which involves a missing value +#' (be it a spectra variable value, a peaks variable value or a value +#' in one of the provided ranges) is treated as a logical `FALSE`. For +#' example, if the retention time of a spectrum is `NA` and the data is +#' filtered using a retention time range, an empty peaks matrix is returned +#' (for `keep = TRUE`; for `keep = FALSE` the full peaks matrix is returned). +#' +#' @note +#' +#' In contrast to other *filter* functions, this function does not provide a +#' `msLevel.` parameter to apply the filter only on spectra of the specified +#' MS levels. Instead, to apply no, or different, filters to spectra from +#' different MS levels, multi-row range matrices can be used (see examples +#' below). +#' +#' @param object A [Spectra] object. +#' +#' @param ... the ranges for the spectra and/or peaks variables. Has to be +#' provided as `<variable name> = <range>` pairs with `<variable name>` being the name of a +#' spectra or peaks variable (of numeric data type) and `<range>` being +#' either a `numeric` of length 2 or a `numeric` two column matrix (see +#' function description above for details), +#' +#' @param keep `logical(1)` whether to keep (default) or remove peaks that +#' match the provided range(s). +#' +#' @author Johannes Rainer +#' +#' @name filterPeaksRanges +#' +#' @export +#' +#' @examples +#' +#' ## Define a test Spectra +#' d <- data.frame(rtime = c(123.2, 134.2), msLevel = c(1L, 2L)) +#' d$mz <- list(c(100.1, 100.2, 100.3, 200.1, 200.2, 300.3), +#' c(100.3, 100.4, 200.2, 400.3, 400.4)) +#' ## Use the index of the mass peak within the spectrum as intensity for +#' ## better illustration of filtering results +#' d$intensity <- list(c(1:6), 1:5) +#' s <- Spectra(d) +#' s +#' +#' ## Filter peaks removing all mass peaks with an m/z between 200 and 300 +#' res <- filterPeaksRanges(s, mz = c(200, 300), keep = FALSE) +#' res +#' +#' ## The Spectra object has still the same length and spectra variables +#' length(res) +#' res$rtime +#' +#' ## The filter gets applied when mass peak data gets extracted, using either +#' ## `mz()`, `intensity()` or `peaksData()`.
The filtered peaks data does +#' ## not contain any mass peaks with m/z values between 200 and 300: +#' peaksData(res)[[1L]] +#' peaksData(res)[[2L]] +#' +#' ## We next combine spectra and filter variables. We want to keep only mass +#' ## peaks of MS2 spectra that have an m/z between 100 and 110. +#' res <- filterPeaksRanges(s, mz = c(100, 110), msLevel = c(2, 2)) +#' res +#' length(res) +#' +#' ## Only data for peaks are returned for which the spectra's MS level is +#' ## between 2 and 2 and with an m/z between 100 and 110. The peaks data for +#' ## the first spectrum, that has MS level 1, is thus empty: +#' peaksData(res)[[1L]] +#' +#' ## While the peaks matrix for the second spectrum (with MS level 2) contains +#' ## the mass peaks with m/z between 100 and 110. +#' peaksData(res)[[2L]] +#' +#' ## To keep also the peaks data for the first spectrum, we need to define +#' ## an additional set of ranges, which we define using a second row in each +#' ## ranges matrix. We use the same filter as above, i.e. keeping only mass +#' ## peaks with an m/z between 100 and 110 for spectra with MS level 2, but +#' ## add an additional row for MS level 1 spectra keeping mass peaks with an +#' ## m/z between 0 and 2000. Filter results of different rows are combined +#' ## using a logical OR, i.e. peaks matrices with mass peaks are returned +#' ## matching either the first, or the second row. +#' res <- filterPeaksRanges(s, mz = rbind(c(100, 110), c(0, 1000)), +#' msLevel = rbind(c(2, 2), c(1, 1))) +#' +#' ## The results for the MS level 2 spectrum are the same as before, but with +#' ## the additional row we keep the full peaks matrix of the MS1 spectrum: +#' peaksData(res)[[1L]] +#' peaksData(res)[[2L]] +#' +#' ## As a last example we define a filter that keeps all mass peaks with an +#' ## m/z either between 100 and 200, or between 300 and 400. +#' res <- filterPeaksRanges(s, mz = rbind(c(100, 200), c(300, 400))) +#' peaksData(res)[[1L]] +#' peaksData(res)[[2L]] +#' +#' ## Such filters could thus be defined to restrict/filter the MS data to +#' ## specific e.g. retention time and m/z ranges. +filterPeaksRanges <- function(object, ..., keep = TRUE) { + if (!inherits(object, "Spectra")) + stop("'object' is expected to be a 'Spectra' object.") + dots <- list(...) + variables <- names(dots) + if (!length(variables)) + return(object) + ## check that: + ## - variables are in spectraVariables + pvars <- peaksVariables(object) + svars <- spectraVariables(object) + if (!all(variables %in% c(svars, pvars))) + stop("Provided filter variable(s): ", + paste0("\"", variables[!variables %in% c(svars, pvars)], "\"", + collapse = ", "), " are not valid spectra variables. ", + "Use 'spectraVariables(object)' and 'peaksVariables()' to list ", + "available variables.") + ## - range parameters are defined correctly + err <- paste0("Range parameters have to be either a 'numeric' of length ", + "2 or a 'numeric' matrix with two columns.") + dots <- lapply(dots, function(z) { + if (is.null(nrow(z))) { + if (length(z) != 2) + stop(err) + z <- matrix(z, ncol = 2) + } + if (!is.matrix(z) | !is.numeric(z)) stop(err) + z + }) + ## - number for rows of matrices matches. + nr <- unlist(lapply(dots, nrow), use.names = FALSE) + if (any(nr != nr[1L])) + stop("Number of rows of the range matrices have to match.") + ## OK, now proceed to split by svar and pvar and pass to the peaks function. 
+ pvars <- intersect(variables, pvars) + svars <- intersect(variables, svars) + object <- addProcessing(object, .peaks_filter_ranges, ranges = dots, + svars = svars, pvars = pvars, + spectraVariables = c(svars, "msLevel"), keep = keep) + if (keep) keep_or_remove <- "select" + else keep_or_remove <- "remove" + object@processing <- .logging( + object@processing, "Filter: ", keep_or_remove, " peaks based on ", + "user-provided ranges for ", length(variables), " variables") + object +} diff --git a/R/Spectra.R b/R/Spectra.R index cf212594..6a32176a 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -331,31 +331,23 @@ NULL #' - `uniqueMsLevels()`: get the unique MS levels available in `object`. This #' function is supposed to be more efficient than `unique(msLevel(object))`. #' -#' @section Data subsetting, filtering and merging: +#' @section Filter spectra data: #' -#' Subsetting and filtering of `Spectra` objects can be performed with the below -#' listed methods. +#' Filter a `Spectra` object based on the spectra data. This includes subset +#' operations that reduce the number of spectra in the object as well as filters +#' that reduce the *content* of the `Spectra` object. See section +#' *Filter peaks data* below for functions that filter the peaks data of a +#' `Spectra`. #' #' - `[`: subsets the spectra keeping only selected elements (`i`). The method #' **always** returns a `Spectra` object. #' -#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the -#' monoisotopic peak for groups of isotopologues. Isotopologues are -#' estimated using the [isotopologues()] function from the -#' *MetaboCoreUtils* package. Note that -#' the default parameters for isotope prediction/detection have been -#' determined using data from the Human Metabolome Database (HMDB) and -#' isotopes for elements other than CHNOPS might not be detected. See -#' parameter `substDefinition` in the documentation of [isotopologues()] for -#' more information. The approach and code to define the parameters for -#' isotope prediction is described -#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). -#' #' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the #' object's `spectraData` that contain only missing values (`NA`). Note that #' while columns with only `NA`s are removed, a `spectraData()` call after #' `dropNaSpectraVariables()` might still show columns containing `NA` values -#' for *core* spectra variables. +#' for *core* spectra variables. The total number of spectra is not changed +#' by this function. #' #' - `filterAcquisitionNum()`: filters the object keeping only spectra matching #' the provided acquisition numbers (argument `n`). If `dataOrigin` or @@ -382,32 +374,6 @@ NULL #' Returns the filtered `Spectra` object (with spectra in their #' original order). #' -#' - `filterFourierTransformArtefacts()`: removes (Orbitrap) fast fourier -#' artefact peaks from spectra (see examples below). The function iterates -#' through all intensity ordered peaks in a spectrum and removes all peaks -#' with an m/z within +/- `halfWindowSize` of the current peak if their -#' intensity is lower than `threshold` times the current peak's intensity. -#' Additional parameters `keepIsotopes`, `maxCharge` and `isotopeTolerance` -#' allow to avoid removing of potential `[13]C` isotope peaks (`maxCharge` -#' being the maximum charge that should be considered and `isotopeTolerance` -#' the absolute acceptable tolerance for matching their m/z). 
-#' See [filterFourierTransformArtefacts()] for details and background and -#' `deisitopeSpectra()` for an alternative. -#' -#' - `filterIntensity()`: filters each spectrum keeping only peaks with -#' intensities that are within the provided range or match the criteria of -#' the provided function. For the former, parameter `intensity` has to be a -#' `numeric` defining the intensity range, for the latter a `function` that -#' takes the intensity values of the spectrum and returns a `logical` whether -#' the peak should be retained or not (see examples below for details) - -#' additional parameters to the function can be passed with `...`. To -#' remove only peaks with intensities below a certain threshold, say 100, use -#' `intensity = c(100, Inf)`. Note: also a single value can be passed with -#' the `intensity` parameter in which case an upper limit of `Inf` is used. -#' Note that this function removes also peaks with missing intensities -#' (i.e. an intensity of `NA`). Parameter `msLevel.` allows to restrict the -#' filtering to spectra of the specified MS level(s). -#' #' - `filterIsolationWindow()`: retains spectra that contain `mz` in their #' isolation window m/z range (i.e. with an `isolationWindowLowerMz` <= `mz` #' and `isolationWindowUpperMz` >= `mz`. Returns the filtered `Spectra` @@ -417,17 +383,6 @@ NULL #' the MS level specified with argument `msLevel`. Returns the filtered #' `Spectra` (with spectra in their original order). #' -#' - `filterMzRange()`: filters the object keeping or removing peaks in each -#' spectrum that are within the provided m/z range. Whether peaks are -#' retained or removed can be configured with parameter `keep` (default -#' `keep = TRUE`). -#' -#' - `filterMzValues()`: filters the object keeping **all** peaks in each -#' spectrum that match the provided m/z value(s) (for `keep = TRUE`, the -#' default) or removing **all** of them (for `keep = FALSE`). The m/z -#' matching considers also the absolute `tolerance` and m/z-relative -#' `ppm` values. `tolerance` and `ppm` have to be of length 1. -#' #' - `filterPolarity()`: filters the object keeping only spectra matching the #' provided polarity. Returns the filtered `Spectra` (with spectra in their #' original order). @@ -459,28 +414,12 @@ NULL #' any of the provided m/z values (given `ppm` and `tolerance`). Spectra with #' missing precursor m/z value (e.g. MS1 spectra) are dropped. #' -#' - `filterPrecursorPeaks()`: removes peaks from each spectrum in `object` with -#' an m/z equal or larger than the m/z of the precursor, depending on the -#' value of parameter `mz`: for `mz = ==" (the default) peaks with matching -#' m/z (considering an absolute and relative acceptable difference depending -#' on `tolerance` and `ppm`, respectively) are removed. For `mz = ">="` all -#' peaks with an m/z larger or equal to the precursor m/z (minus `tolerance` -#' and the `ppm` of the precursor m/z) are removed. Parameter `msLevel.` -#' allows to restrict the filter to certain MS levels (by default the filter -#' is applied to all MS levels). Note that no peaks are removed if the -#' precursor m/z is `NA` (e.g. typically for MS1 spectra). -#' #' - `filterPrecursorScan()`: retains parent (e.g. MS1) and children scans (e.g. #' MS2) of acquisition number `acquisitionNum`. Returns the filtered #' `Spectra` (with spectra in their original order). Parameter `f` allows to #' define which spectra belong to the same sample or original data file ( #' defaults to `f = dataOrigin(object)`). 
#' -#' - `filterRt()`: retains spectra of MS level `msLevel` with retention -#' times (in seconds) within (`>=`) `rt[1]` and (`<=`) -#' `rt[2]`. Returns the filtered `Spectra` (with spectra in their -#' original order). -#' #' - `filterRanges()`: allows filtering of the `Spectra` object based on user #' defined *numeric* ranges (parameter `ranges`) for one or more available #' spectra variables in object (spectra variable names can be specified with @@ -491,6 +430,11 @@ NULL #' any of the conditions must match (`match = "any"`; all spectra for which #' values are within any of the provided ranges are retained). #' +#' - `filterRt()`: retains spectra of MS level `msLevel` with retention +#' times (in seconds) within (`>=`) `rt[1]` and (`<=`) +#' `rt[2]`. Returns the filtered `Spectra` (with spectra in their +#' original order). +#' #' - `filterValues()`: allows filtering of the `Spectra` object based on #' similarities of *numeric* values of one or more `spectraVariables(object)` #' (parameter `spectraVariables`) to provided values (parameter `values`) @@ -500,20 +444,6 @@ NULL #' any of the conditions must match (`match = "any"`; all spectra for which #' values are within any of the provided ranges are retained). #' -#' - `reduceSpectra()`: for groups of peaks within highly similar m/z values -#' within each spectrum (given `ppm` and `tolerance`), this function keeps -#' only the peak with the highest intensity removing all other peaks hence -#' *reducing* each spectrum to the highest intensity peaks per *peak group*. -#' Peak groups are defined using the [group()] function from the -#' *MsCoreUtils* package. -#' -#' - `reset()`: restores the data to its original state (as much as possible): -#' removes any processing steps from the lazy processing queue and calls -#' `reset()` on the backend which, depending on the backend, can also undo -#' e.g. data filtering operations. Note that a `reset*(` call after -#' `applyProcessing()` will not have any effect. See examples below for more -#' information. -#' #' - `selectSpectraVariables()`: reduces the information within the object to #' the selected spectra variables: all data for variables not specified will #' be dropped. For mandatory columns (i.e., those listed by @@ -522,8 +452,141 @@ NULL #' user defined) spectra variables will be completely removed. #' Returns the filtered `Spectra`. #' -#' - `split()`: splits the `Spectra` object based on parameter `f` into a `list` -#' of `Spectra` objects. +#' +#' @section Filter or aggregate mass peak data: +#' +#' Operations that filter or aggregate the mass peak data from each spectrum +#' without changing the number of spectra in a `Spectra` object. +#' +#' - `combinePeaks()`: combines mass peaks **within each spectrum** with a +#' difference in their m/z values that is smaller than the maximal +#' acceptable difference defined by `ppm` and `tolerance`. Parameters +#' `intensityFun` and `mzFun` allow to define functions to aggregate the +#' intensity and m/z values for each such group of peaks. With +#' `weighted = TRUE` (the default), the m/z value of the combined peak is +#' calculated using an intensity-weighted mean and parameter `mzFun` is +#' ignored. The [MsCoreUtils::group()] function is used for the grouping of +#' mass peaks. Parameter `msLevel.` allows to define selected MS levels for +#' which peaks should be combined. This function returns a `Spectra` with +#' the same number of spectra than the input object, but with possibly +#' combined peaks within each spectrum. 
+# Additional peak variables (other than `"mz"` and `"intensity"`) are +#' dropped (i.e. their values are replaced with `NA`) for combined peaks +#' unless they are constant across the combined peaks. See also +#' `reduceSpectra()` for a function to select a single *representative* +#' mass peak for each peak group. +#' +#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the +#' monoisotopic peak for groups of isotopologues. Isotopologues are +#' estimated using the [isotopologues()] function from the +#' *MetaboCoreUtils* package. Note that +#' the default parameters for isotope prediction/detection have been +#' determined using data from the Human Metabolome Database (HMDB) and +#' isotopes for elements other than CHNOPS might not be detected. See +#' parameter `substDefinition` in the documentation of [isotopologues()] for +#' more information. The approach and code to define the parameters for +#' isotope prediction is described +#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). +#' +#' - `filterFourierTransformArtefacts()`: removes (Orbitrap) fast fourier +#' artefact peaks from spectra (see examples below). The function iterates +#' through all intensity ordered peaks in a spectrum and removes all peaks +#' with an m/z within +/- `halfWindowSize` of the current peak if their +#' intensity is lower than `threshold` times the current peak's intensity. +#' Additional parameters `keepIsotopes`, `maxCharge` and `isotopeTolerance` +#' allow to avoid removing of potential `[13]C` isotope peaks (`maxCharge` +#' being the maximum charge that should be considered and `isotopeTolerance` +#' the absolute acceptable tolerance for matching their m/z). +#' See [filterFourierTransformArtefacts()] for details and background and +#' `deisitopeSpectra()` for an alternative. +#' +#' - `filterIntensity()`: filters mass peaks in each spectrum keeping only +#' those with intensities that are within the provided range or match the +#' criteria of the provided function. For the former, parameter `intensity` +#' has to be a `numeric` defining the intensity range, for the latter a +#' `function` that takes the intensity values of the spectrum and returns +#' a `logical` whether the peak should be retained or not (see examples +#' below for details) - additional parameters to the function can be passed +#' with `...`. +#' To remove only peaks with intensities below a certain threshold, say +#' 100, use `intensity = c(100, Inf)`. Note: also a single value can be +#' passed with the `intensity` parameter in which case an upper limit of +#' `Inf` is used. +#' Note that this function removes also peaks with missing intensities +#' (i.e. an intensity of `NA`). Parameter `msLevel.` allows to restrict the +#' filtering to spectra of the specified MS level(s). +#' +#' - `filterMzRange()`: filters mass peaks in the object keeping or removing +#' those in each spectrum that are within the provided m/z range. Whether +#' peaks are retained or removed can be configured with parameter `keep` +#' (default `keep = TRUE`). +#' +#' - `filterMzValues()`: filters mass peaks in the object keeping all +#' peaks in each spectrum that match the provided m/z value(s) (for +#' `keep = TRUE`, the default) or removing all of them (for `keep = FALSE`). +#' The m/z matching considers also the absolute `tolerance` and m/z-relative +#' `ppm` values. `tolerance` and `ppm` have to be of length 1. 
+#' +#' - `filterPeaksRanges()`: filters mass peaks of a `Spectra` object using any +#' set of range-based filters on numeric spectra or peaks variables. See +#' [filterPeaksRanges()] for more information. +#' +#' - `filterPrecursorPeaks()`: removes peaks from each spectrum in `object` with +#' an m/z equal or larger than the m/z of the precursor, depending on the +#' value of parameter `mz`: for `mz = ==" (the default) peaks with matching +#' m/z (considering an absolute and relative acceptable difference depending +#' on `tolerance` and `ppm`, respectively) are removed. For `mz = ">="` all +#' peaks with an m/z larger or equal to the precursor m/z (minus `tolerance` +#' and the `ppm` of the precursor m/z) are removed. Parameter `msLevel.` +#' allows to restrict the filter to certain MS levels (by default the filter +#' is applied to all MS levels). Note that no peaks are removed if the +#' precursor m/z is `NA` (e.g. typically for MS1 spectra). +#' +#' - `reduceSpectra()`: keeps for groups of peaks with similar m/z values in +#' (given `ppm` and `tolerance`) in each spectrum only the peak with the +#' highest intensity removing all other peaks hence *reducing* each +#' spectrum to the highest intensity peaks per *peak group*. +#' Peak groups are defined using the [group()] function from the +#' *MsCoreUtils* package. See also the `combinePeaks()` function for an +#' alternative function to combine peaks within each spectrum. +#' +#' +#' @section Merging, aggregating and splitting: +#' +#' Several `Spectra` objects can be concatenated into a single object with the +#' `c()` or the `concatenateSpectra()` function. Concatenation will fail if the +#' processing queue of any of the `Spectra` objects is not empty or if +#' different backends are used in the `Spectra` objects. Thus, in these cases, +#' prior to merging `Spectra` object it is suggested to change the backend to +#' a `MsBackendMemory` using the `setBackend()` function, and to *apply* all +#' data processing steps using `applyProcessing()`. The spectra variables +#' of the resulting `Spectra` object is the union of the spectra variables of +#' the individual `Spectra` objects. +#' +#' - `combineSpectra()`: combines MS data (i.e. mass peaks) from sets of +#' spectra into a single spectrum per set (in contrast to `combinePeaks()` +#' or `reduceSpectra()` that combine mass peaks **within each spectrum**). +#' For each spectrum group (set), spectra variables from the first spectrum +#' are used and the peak matrices are combined using the function specified +#' with `FUN`, which defaults to [combinePeaksData()]. Please refer to the +#' [combinePeaksData()] help page for details and options of the actual +#' combination of peaks across the sets of spectra and to the package +#' vignette for examples and alternative ways to aggregate spectra. +#' The sets of spectra can be specified with parameter `f`. +#' In addition it is possible to define, with parameter `p` if and how to +#' split the input data for parallel processing. +#' This defaults to `p = x$dataStorage` and hence a per-file parallel +#' processing is applied for `Spectra` with file-based backends (such as the +#' [MsBackendMzR()]). +#' Prior combination of the spectra all processings queued in the lazy +#' evaluation queue are applied. Be aware that calling `combineSpectra()` on a +#' `Spectra` object with certain backends that allow modifications might +#' **overwrite** the original data. 
This does not happen with a +#' `MsBackendMemory` or `MsBackendDataFrame` backend, but with a +#' `MsBackendHdf5Peaks` backend the m/z and intensity values in the original +#' hdf5 file(s) will be overwritten. +#' The function returns a `Spectra` of length equal to the unique levels +#' of `f`. #' #' - `joinSpectraData()`: Individual spectra variables can be directly #' added with the `$<-` or `[[<-` syntax. The `joinSpectraData()` @@ -542,13 +605,8 @@ NULL #' should be explored and ideally be removed using for #' `QFeatures::reduceDataFrame()`, `PMS::reducePSMs()` or similar #' functions. -#' -#' Several `Spectra` objects can be concatenated into a single object with the -#' `c()` or the `concatenateSpectra()` function. Concatenation will fail if the -#' processing queue of any of the `Spectra` objects is not empty or if -#' different backends are used in the `Spectra` objects. The spectra variables -#' of the resulting `Spectra` object is the union of the spectra variables of -#' the individual `Spectra` objects. +#' - `split()`: splits the `Spectra` object based on parameter `f` into a `list` +#' of `Spectra` objects. #' #' #' @section Data manipulation and analysis methods: @@ -603,49 +661,6 @@ NULL #' access and it is possible to *revert* the operation with the `reset()` #' function (see description of `reset()` above). #' -#' - `combinePeaks()`: combines mass peaks **within each spectrum** with a -#' difference in their m/z values that is smaller than the maximal -#' acceptable difference defined by `ppm` and `tolerance`. Parameters -#' `intensityFun` and `mzFun` allow to define functions to aggregate the -#' intensity and m/z values for each such group of peaks. With -#' `weighted = TRUE` (the default), the m/z value of the combined peak is -#' calculated using an intensity-weighted mean and parameter `mzFun` is -#' ignored. The [MsCoreUtils::group()] function is used for the grouping of -#' mass peaks. Parameter `msLevel.` allows to define selected MS levels for -#' which peaks should be combined. This function returns a `Spectra` with -#' the same number of spectra than the input object, but with possibly -#' combined peaks within each spectrum. -# Additional peak variables (other than `"mz"` and `"intensity"`) are -#' dropped (i.e. their values are replaced with `NA`) for combined peaks -#' unless they are constant across the combined peaks. See also -#' `reduceSpectra()` for a function to select a single *representative* -#' mass peak for each peak group. -#' -#' - `combineSpectra()`: combines MS data from **sets of spectra into a -#' single spectrum per set** (in contrast to `combinePeaks()` or -#' `reduceSpectra()` that combine mass peaks **within each spectrum**). -#' For each spectrum group (set), spectra variables from the first spectrum -#' are used and the peak matrices are combined using the function specified -#' with `FUN`, which defaults to [combinePeaksData()]. Please refer to the -#' [combinePeaksData()] help page for details and options of the actual -#' combination of peaks across the sets of spectra and to the package -#' vignette for examples and alternative ways to aggregate spectra. -#' The sets of spectra can be specified with parameter `f`. -#' In addition it is possible to define, with parameter `p` if and how to -#' split the input data for parallel processing. -#' This defaults to `p = x$dataStorage` and hence a per-file parallel -#' processing is applied for `Spectra` with file-based backends (such as the -#' [MsBackendMzR()]). 
-#' Prior combination of the spectra all processings queued in the lazy -#' evaluation queue are applied. Be aware that calling `combineSpectra()` on a -#' `Spectra` object with certain backends that allow modifications might -#' **overwrite** the original data. This does not happen with a -#' `MsBackendMemory` or `MsBackendDataFrame` backend, but with a -#' `MsBackendHdf5Peaks` backend the m/z and intensity values in the original -#' hdf5 file(s) will be overwritten. -#' The function returns a `Spectra` of length equal to the unique levels -#' of `f`. -#' #' - `compareSpectra()`: compares each spectrum in `x` with each spectrum in `y` #' using the function provided with `FUN` (defaults to [ndotproduct()]). If #' `y` is missing, each spectrum in `x` is compared with each other spectrum @@ -681,17 +696,6 @@ NULL #' the vignette for additional examples, such as using spectral entropy #' similarity in the scoring. #' -#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the -#' monoisotopic peak for groups of isotopologues. Isotopologues are -#' estimated using the [isotopologues()] function from the *MetaboCoreUtils* -#' package. Note that the default parameters for isotope -#' prediction/detection have been determined using data from the Human -#' Metabolome Database (HMDB) and isotopes for elements other than CHNOPS -#' might not be detected. See parameter `substDefinition` in the -#' documentation of [isotopologues()] for more information. The approach -#' and code to define the parameters for isotope prediction is described -#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). -#' #' - `entropy()`: calculates the entropy of each spectra based on the metrics #' suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). #' See also [nentropy()] in the *MsCoreUtils* package for details. @@ -715,13 +719,12 @@ NULL #' - `processingLog()`: returns a `character` vector with the processing log #' messages. #' -#' - `reduceSpectra()`: keeps for groups of peaks with similar m/z values in -#' (given `ppm` and `tolerance`) in each spectrum only the peak with the -#' highest intensity removing all other peaks hence *reducing* each -#' spectrum to the highest intensity peaks per *peak group*. -#' Peak groups are defined using the [group()] function from the -#' *MsCoreUtils* package. See also the `combinePeaks()` function for an -#' alternative function to combine peaks within each spectrum. +#' - `reset()`: restores the data to its original state (as much as possible): +#' removes any processing steps from the lazy processing queue and calls +#' `reset()` on the backend which, depending on the backend, can also undo +#' e.g. data filtering operations. Note that a `reset*(` call after +#' `applyProcessing()` will not have any effect. See examples below for more +#' information. #' #' - `scalePeaks()`: scales intensities of peaks within each spectrum depending #' on parameter `by`. With `by = sum` (the default) peak intensities are diff --git a/R/peaks-functions.R b/R/peaks-functions.R index 4c8ae8cf..7639538a 100644 --- a/R/peaks-functions.R +++ b/R/peaks-functions.R @@ -672,3 +672,62 @@ joinPeaksNone <- function(x, y, ...) { pmz <- precursorMz - tolerance - ppm(precursorMz, ppm = ppm) x[x[, "mz"] < pmz, , drop = FALSE] } + +#' filter a peak matrix `x` by (arbitrary) numeric ranges for spectra and/or +#' peaks variables. ranges for spectra and peaks variables are combined using +#' a logical AND, rows in the provided range matrices with a logical OR. 
+#' +#' Used by `filterPeaksRanges()` function for `Spectra`. +#' +#' @param svars `character` with the spectra variables for which filter ranges +#' where provided. +#' +#' @param pvars `character` with the peaks variables for which filter ranges +#' where provided. +#' +#' @param ranges `list` with `numeric` two-column matrices with the +#' user-provided ranges. The number of rows of all matrices is expected +#' to match. +#' +#' @param spectrumMsLevel `integer(1)` with the MS level of the peak matrix' +#' spectrum. +#' +#' @param keep `logical(1)` whether mass peaks that match the filters should be +#' kept or removed. +#' +#' @param ... values for all spectra variables defined in `svars` are expected +#' to be passed through `...` as `name = value` pairs. +#' +#' @author Johannes Rainer +#' +#' @noRd +.peaks_filter_ranges <- function(x, svars = character(), + pvars = character(), + ranges, spectrumMsLevel, + keep = TRUE, ...) { + svalue <- list(..., msLevel = spectrumMsLevel) + nx <- nrow(x) + sel <- rep(FALSE, nx) + for (i in seq_len(nrow(ranges[[1L]]))) { + ## check ranges for spectra variables + svars_ok <- vapply(svars, function(z) + MsCoreUtils::between(svalue[[z]], ranges[[z]][i, ]), TRUE, + USE.NAMES = FALSE) + if (!anyNA(svars_ok) && all(svars_ok)) { + if (length(pvars)) { + ## check ranges for peaks variables + tmp <- rowSums(do.call(cbind, lapply(pvars, function(z) { + MsCoreUtils::between(x[, z], ranges[[z]][i, ]) + }))) == length(pvars) + tmp[is.na(tmp)] <- FALSE + sel <- sel | tmp + } else { + ## No need to check further, because we have a match + if (keep) return(x) + else return(x[logical(), , drop = FALSE]) + } + } + } + if (keep) x[sel, , drop = FALSE] + else x[!sel, , drop = FALSE] +} diff --git a/man/Spectra.Rd b/man/Spectra.Rd index 399b8262..1163bb44 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -1125,30 +1125,23 @@ function is supposed to be more efficient than \code{unique(msLevel(object))}. } } -\section{Data subsetting, filtering and merging}{ +\section{Filter spectra data}{ -Subsetting and filtering of \code{Spectra} objects can be performed with the below -listed methods. +Filter a \code{Spectra} object based on the spectra data. This includes subset +operations that reduce the number of spectra in the object as well as filters +that reduce the \emph{content} of the \code{Spectra} object. See section +\emph{Filter peaks data} below for functions that filter the peaks data of a +\code{Spectra}. \itemize{ \item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method \strong{always} returns a \code{Spectra} object. -\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the -monoisotopic peak for groups of isotopologues. Isotopologues are -estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the -\emph{MetaboCoreUtils} package. Note that -the default parameters for isotope prediction/detection have been -determined using data from the Human Metabolome Database (HMDB) and -isotopes for elements other than CHNOPS might not be detected. See -parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for -more information. The approach and code to define the parameters for -isotope prediction is described -\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. \item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the object's \code{spectraData} that contain only missing values (\code{NA}). 
Note that while columns with only \code{NA}s are removed, a \code{spectraData()} call after \code{dropNaSpectraVariables()} might still show columns containing \code{NA} values -for \emph{core} spectra variables. +for \emph{core} spectra variables. The total number of spectra is not changed +by this function. \item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or \code{dataStorage} is also provided, \code{object} is subsetted to the spectra with @@ -1170,30 +1163,6 @@ the provided \code{dataStorage} parameter). \item \code{filterEmptySpectra()}: removes empty spectra (i.e. spectra without peaks). Returns the filtered \code{Spectra} object (with spectra in their original order). -\item \code{filterFourierTransformArtefacts()}: removes (Orbitrap) fast fourier -artefact peaks from spectra (see examples below). The function iterates -through all intensity ordered peaks in a spectrum and removes all peaks -with an m/z within +/- \code{halfWindowSize} of the current peak if their -intensity is lower than \code{threshold} times the current peak's intensity. -Additional parameters \code{keepIsotopes}, \code{maxCharge} and \code{isotopeTolerance} -allow to avoid removing of potential \verb{[13]C} isotope peaks (\code{maxCharge} -being the maximum charge that should be considered and \code{isotopeTolerance} -the absolute acceptable tolerance for matching their m/z). -See \code{\link[=filterFourierTransformArtefacts]{filterFourierTransformArtefacts()}} for details and background and -\code{deisitopeSpectra()} for an alternative. -\item \code{filterIntensity()}: filters each spectrum keeping only peaks with -intensities that are within the provided range or match the criteria of -the provided function. For the former, parameter \code{intensity} has to be a -\code{numeric} defining the intensity range, for the latter a \code{function} that -takes the intensity values of the spectrum and returns a \code{logical} whether -the peak should be retained or not (see examples below for details) - -additional parameters to the function can be passed with \code{...}. To -remove only peaks with intensities below a certain threshold, say 100, use -\code{intensity = c(100, Inf)}. Note: also a single value can be passed with -the \code{intensity} parameter in which case an upper limit of \code{Inf} is used. -Note that this function removes also peaks with missing intensities -(i.e. an intensity of \code{NA}). Parameter \code{msLevel.} allows to restrict the -filtering to spectra of the specified MS level(s). \item \code{filterIsolationWindow()}: retains spectra that contain \code{mz} in their isolation window m/z range (i.e. with an \code{isolationWindowLowerMz} <= \code{mz} and \code{isolationWindowUpperMz} >= \code{mz}. Returns the filtered \code{Spectra} @@ -1201,15 +1170,6 @@ object (with spectra in their original order). \item \code{filterMsLevel()}: filters object by MS level keeping only spectra matching the MS level specified with argument \code{msLevel}. Returns the filtered \code{Spectra} (with spectra in their original order). -\item \code{filterMzRange()}: filters the object keeping or removing peaks in each -spectrum that are within the provided m/z range. Whether peaks are -retained or removed can be configured with parameter \code{keep} (default -\code{keep = TRUE}). 
-\item \code{filterMzValues()}: filters the object keeping \strong{all} peaks in each -spectrum that match the provided m/z value(s) (for \code{keep = TRUE}, the -default) or removing \strong{all} of them (for \code{keep = FALSE}). The m/z -matching considers also the absolute \code{tolerance} and m/z-relative -\code{ppm} values. \code{tolerance} and \code{ppm} have to be of length 1. \item \code{filterPolarity()}: filters the object keeping only spectra matching the provided polarity. Returns the filtered \code{Spectra} (with spectra in their original order). @@ -1235,18 +1195,11 @@ a precursor m/z for a target m/z accepting a small difference in \emph{ppm}. \item \code{filterPrecursorMzValues()}: retains spectra with precursor m/z matching any of the provided m/z values (given \code{ppm} and \code{tolerance}). Spectra with missing precursor m/z value (e.g. MS1 spectra) are dropped. -\item \code{filterPrecursorPeaks()}: removes peaks from each spectrum in \code{object} with -an m/z equal or larger than the m/z of the precursor, depending on the -value of parameter \code{mz}: for \verb{mz = ==" (the default) peaks with matching m/z (considering an absolute and relative acceptable difference depending on }tolerance\code{and}ppm\verb{, respectively) are removed. For }mz = ">="\verb{all peaks with an m/z larger or equal to the precursor m/z (minus}tolerance\verb{and the}ppm\verb{of the precursor m/z) are removed. Parameter}msLevel.\verb{allows to restrict the filter to certain MS levels (by default the filter is applied to all MS levels). Note that no peaks are removed if the precursor m/z is}NA` (e.g. typically for MS1 spectra). \item \code{filterPrecursorScan()}: retains parent (e.g. MS1) and children scans (e.g. MS2) of acquisition number \code{acquisitionNum}. Returns the filtered \code{Spectra} (with spectra in their original order). Parameter \code{f} allows to define which spectra belong to the same sample or original data file ( defaults to \code{f = dataOrigin(object)}). -\item \code{filterRt()}: retains spectra of MS level \code{msLevel} with retention -times (in seconds) within (\code{>=}) \code{rt[1]} and (\code{<=}) -\code{rt[2]}. Returns the filtered \code{Spectra} (with spectra in their -original order). \item \code{filterRanges()}: allows filtering of the \code{Spectra} object based on user defined \emph{numeric} ranges (parameter \code{ranges}) for one or more available spectra variables in object (spectra variable names can be specified with @@ -1256,6 +1209,10 @@ ranges/spectra variables are defined, the \code{match} parameter can be used to specify whether all conditions (\code{match = "all"}; the default) or if any of the conditions must match (\code{match = "any"}; all spectra for which values are within any of the provided ranges are retained). +\item \code{filterRt()}: retains spectra of MS level \code{msLevel} with retention +times (in seconds) within (\code{>=}) \code{rt[1]} and (\code{<=}) +\code{rt[2]}. Returns the filtered \code{Spectra} (with spectra in their +original order). 
 \item \code{filterValues()}: allows filtering of the \code{Spectra} object based on
 similarities of \emph{numeric} values of one or more \code{spectraVariables(object)}
 (parameter \code{spectraVariables}) to provided values (parameter \code{values})
 given acceptable differences (parameters tolerance and ppm). If multiple
 values/spectra variables are defined, the \code{match} parameter can be used
 to specify whether all conditions (\code{match = "all"}; the default) or if
 any of the conditions must match (\code{match = "any"}; all spectra for which
 values are within any of the provided ranges are retained).
-\item \code{reduceSpectra()}: for groups of peaks within highly similar m/z values
-within each spectrum (given \code{ppm} and \code{tolerance}), this function keeps
-only the peak with the highest intensity removing all other peaks hence
-\emph{reducing} each spectrum to the highest intensity peaks per \emph{peak group}.
-Peak groups are defined using the \code{\link[=group]{group()}} function from the
-\emph{MsCoreUtils} package.
-\item \code{reset()}: restores the data to its original state (as much as possible):
-removes any processing steps from the lazy processing queue and calls
-\code{reset()} on the backend which, depending on the backend, can also undo
-e.g. data filtering operations. Note that a \verb{reset*(} call after
-\code{applyProcessing()} will not have any effect. See examples below for more
-information.
 \item \code{selectSpectraVariables()}: reduces the information within the object to
 the selected spectra variables: all data for variables not specified will
 be dropped. For mandatory columns (i.e., those listed by
@@ -1283,8 +1228,130 @@ be dropped. For mandatory columns (i.e., those listed by
 the values will be dropped but not the variable itself. Additional
 (or user defined) spectra variables will be completely removed.
 Returns the filtered \code{Spectra}.
-\item \code{split()}: splits the \code{Spectra} object based on parameter \code{f} into a \code{list}
-of \code{Spectra} objects.
+}
+}
+
+\section{Filter or aggregate mass peak data}{
+
+
+Operations that filter or aggregate the mass peak data from each spectrum
+without changing the number of spectra in a \code{Spectra} object.
+\itemize{
+\item \code{combinePeaks()}: combines mass peaks \strong{within each spectrum} with a
+difference in their m/z values that is smaller than the maximal
+acceptable difference defined by \code{ppm} and \code{tolerance}. Parameters
+\code{intensityFun} and \code{mzFun} allow to define functions to aggregate the
+intensity and m/z values for each such group of peaks. With
+\code{weighted = TRUE} (the default), the m/z value of the combined peak is
+calculated using an intensity-weighted mean and parameter \code{mzFun} is
+ignored. The \code{\link[MsCoreUtils:group]{MsCoreUtils::group()}} function is used for the grouping of
+mass peaks. Parameter \code{msLevel.} allows to define selected MS levels for
+which peaks should be combined. This function returns a \code{Spectra} with
+the same number of spectra as the input object, but with possibly
+combined peaks within each spectrum. Additional peak variables (other than
+\code{"mz"} and \code{"intensity"}) are dropped (i.e. their values are replaced
+with \code{NA}) for combined peaks unless they are constant across the combined
+peaks. See also \code{reduceSpectra()} for a function to select a single
+\emph{representative} mass peak for each peak group.
+\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the
+monoisotopic peak for groups of isotopologues.
Isotopologues are +estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the +\emph{MetaboCoreUtils} package. Note that +the default parameters for isotope prediction/detection have been +determined using data from the Human Metabolome Database (HMDB) and +isotopes for elements other than CHNOPS might not be detected. See +parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for +more information. The approach and code to define the parameters for +isotope prediction is described +\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. +\item \code{filterFourierTransformArtefacts()}: removes (Orbitrap) fast fourier +artefact peaks from spectra (see examples below). The function iterates +through all intensity ordered peaks in a spectrum and removes all peaks +with an m/z within +/- \code{halfWindowSize} of the current peak if their +intensity is lower than \code{threshold} times the current peak's intensity. +Additional parameters \code{keepIsotopes}, \code{maxCharge} and \code{isotopeTolerance} +allow to avoid removing of potential \verb{[13]C} isotope peaks (\code{maxCharge} +being the maximum charge that should be considered and \code{isotopeTolerance} +the absolute acceptable tolerance for matching their m/z). +See \code{\link[=filterFourierTransformArtefacts]{filterFourierTransformArtefacts()}} for details and background and +\code{deisitopeSpectra()} for an alternative. +\item \code{filterIntensity()}: filters mass peaks in each spectrum keeping only +those with intensities that are within the provided range or match the +criteria of the provided function. For the former, parameter \code{intensity} +has to be a \code{numeric} defining the intensity range, for the latter a +\code{function} that takes the intensity values of the spectrum and returns +a \code{logical} whether the peak should be retained or not (see examples +below for details) - additional parameters to the function can be passed +with \code{...}. +To remove only peaks with intensities below a certain threshold, say +100, use \code{intensity = c(100, Inf)}. Note: also a single value can be +passed with the \code{intensity} parameter in which case an upper limit of +\code{Inf} is used. +Note that this function removes also peaks with missing intensities +(i.e. an intensity of \code{NA}). Parameter \code{msLevel.} allows to restrict the +filtering to spectra of the specified MS level(s). +\item \code{filterMzRange()}: filters mass peaks in the object keeping or removing +those in each spectrum that are within the provided m/z range. Whether +peaks are retained or removed can be configured with parameter \code{keep} +(default \code{keep = TRUE}). +\item \code{filterMzValues()}: filters mass peaks in the object keeping all +peaks in each spectrum that match the provided m/z value(s) (for +\code{keep = TRUE}, the default) or removing all of them (for \code{keep = FALSE}). +The m/z matching considers also the absolute \code{tolerance} and m/z-relative +\code{ppm} values. \code{tolerance} and \code{ppm} have to be of length 1. +\item \code{filterPeaksRanges()}: filters mass peaks of a \code{Spectra} object using any +set of range-based filters on numeric spectra or peaks variables. See +\code{\link[=filterPeaksRanges]{filterPeaksRanges()}} for more information. 
+\item \code{filterPrecursorPeaks()}: removes peaks from each spectrum in \code{object} with
+an m/z equal or larger than the m/z of the precursor, depending on the
+value of parameter \code{mz}: for \code{mz = "=="} (the default) peaks with matching m/z (considering an absolute and relative acceptable difference depending on \code{tolerance} and \code{ppm}, respectively) are removed. For \code{mz = ">="} all peaks with an m/z larger or equal to the precursor m/z (minus \code{tolerance} and the \code{ppm} of the precursor m/z) are removed. Parameter \code{msLevel.} allows to restrict the filter to certain MS levels (by default the filter is applied to all MS levels). Note that no peaks are removed if the precursor m/z is \code{NA} (e.g. typically for MS1 spectra).
+\item \code{reduceSpectra()}: keeps for groups of peaks with similar m/z values
+(given \code{ppm} and \code{tolerance}) in each spectrum only the peak with the
+highest intensity, removing all other peaks, hence \emph{reducing} each
+spectrum to the highest intensity peaks per \emph{peak group}.
+Peak groups are defined using the \code{\link[=group]{group()}} function from the
+\emph{MsCoreUtils} package. See also the \code{combinePeaks()} function for an
+alternative function to combine peaks within each spectrum.
+}
+}
+
+\section{Merging, aggregating and splitting}{
+
+
+Several \code{Spectra} objects can be concatenated into a single object with the
+\code{c()} or the \code{concatenateSpectra()} function. Concatenation will fail if the
+processing queue of any of the \code{Spectra} objects is not empty or if
+different backends are used in the \code{Spectra} objects. Thus, in these cases,
+prior to merging \code{Spectra} objects it is suggested to change the backend to
+a \code{MsBackendMemory} using the \code{setBackend()} function, and to \emph{apply} all
+data processing steps using \code{applyProcessing()}. The spectra variables
+of the resulting \code{Spectra} object are the union of the spectra variables of
+the individual \code{Spectra} objects.
+\itemize{
+\item \code{combineSpectra()}: combines MS data (i.e. mass peaks) from sets of
+spectra into a single spectrum per set (in contrast to \code{combinePeaks()}
+or \code{reduceSpectra()} that combine mass peaks \strong{within each spectrum}).
+For each spectrum group (set), spectra variables from the first spectrum
+are used and the peak matrices are combined using the function specified
+with \code{FUN}, which defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}. Please refer to the
+\code{\link[=combinePeaksData]{combinePeaksData()}} help page for details and options of the actual
+combination of peaks across the sets of spectra and to the package
+vignette for examples and alternative ways to aggregate spectra.
+The sets of spectra can be specified with parameter \code{f}.
+In addition it is possible to define, with parameter \code{p}, if and how to
+split the input data for parallel processing.
+This defaults to \code{p = x$dataStorage} and hence a per-file parallel
+processing is applied for \code{Spectra} with file-based backends (such as the
+\code{\link[=MsBackendMzR]{MsBackendMzR()}}).
+Prior to the combination of the spectra, all processing steps queued in the
+lazy evaluation queue are applied. Be aware that calling \code{combineSpectra()} on a
+\code{Spectra} object with certain backends that allow modifications might
+\strong{overwrite} the original data.
This does not happen with a +\code{MsBackendMemory} or \code{MsBackendDataFrame} backend, but with a +\code{MsBackendHdf5Peaks} backend the m/z and intensity values in the original +hdf5 file(s) will be overwritten. +The function returns a \code{Spectra} of length equal to the unique levels +of \code{f}. \item \code{joinSpectraData()}: Individual spectra variables can be directly added with the \verb{$<-} or \verb{[[<-} syntax. The \code{joinSpectraData()} function allows to merge a \code{DataFrame} to the existing spectra @@ -1304,14 +1371,9 @@ should be explored and ideally be removed using for \code{QFeatures::reduceDataFrame()}, \code{PMS::reducePSMs()} or similar functions. } +\item \code{split()}: splits the \code{Spectra} object based on parameter \code{f} into a \code{list} +of \code{Spectra} objects. } - -Several \code{Spectra} objects can be concatenated into a single object with the -\code{c()} or the \code{concatenateSpectra()} function. Concatenation will fail if the -processing queue of any of the \code{Spectra} objects is not empty or if -different backends are used in the \code{Spectra} objects. The spectra variables -of the resulting \code{Spectra} object is the union of the spectra variables of -the individual \code{Spectra} objects. } \section{Data manipulation and analysis methods}{ @@ -1364,46 +1426,6 @@ same bin are aggregated using the function provided with parameter \code{FUN} the binning operation is applied to the peak data on-the-fly upon data access and it is possible to \emph{revert} the operation with the \code{reset()} function (see description of \code{reset()} above). -\item \code{combinePeaks()}: combines mass peaks \strong{within each spectrum} with a -difference in their m/z values that is smaller than the maximal -acceptable difference defined by \code{ppm} and \code{tolerance}. Parameters -\code{intensityFun} and \code{mzFun} allow to define functions to aggregate the -intensity and m/z values for each such group of peaks. With -\code{weighted = TRUE} (the default), the m/z value of the combined peak is -calculated using an intensity-weighted mean and parameter \code{mzFun} is -ignored. The \code{\link[MsCoreUtils:group]{MsCoreUtils::group()}} function is used for the grouping of -mass peaks. Parameter \code{msLevel.} allows to define selected MS levels for -which peaks should be combined. This function returns a \code{Spectra} with -the same number of spectra than the input object, but with possibly -combined peaks within each spectrum. -dropped (i.e. their values are replaced with \code{NA}) for combined peaks -unless they are constant across the combined peaks. See also -\code{reduceSpectra()} for a function to select a single \emph{representative} -mass peak for each peak group. -\item \code{combineSpectra()}: combines MS data from \strong{sets of spectra into a -single spectrum per set} (in contrast to \code{combinePeaks()} or -\code{reduceSpectra()} that combine mass peaks \strong{within each spectrum}). -For each spectrum group (set), spectra variables from the first spectrum -are used and the peak matrices are combined using the function specified -with \code{FUN}, which defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}. Please refer to the -\code{\link[=combinePeaksData]{combinePeaksData()}} help page for details and options of the actual -combination of peaks across the sets of spectra and to the package -vignette for examples and alternative ways to aggregate spectra. 
-The sets of spectra can be specified with parameter \code{f}. -In addition it is possible to define, with parameter \code{p} if and how to -split the input data for parallel processing. -This defaults to \code{p = x$dataStorage} and hence a per-file parallel -processing is applied for \code{Spectra} with file-based backends (such as the -\code{\link[=MsBackendMzR]{MsBackendMzR()}}). -Prior combination of the spectra all processings queued in the lazy -evaluation queue are applied. Be aware that calling \code{combineSpectra()} on a -\code{Spectra} object with certain backends that allow modifications might -\strong{overwrite} the original data. This does not happen with a -\code{MsBackendMemory} or \code{MsBackendDataFrame} backend, but with a -\code{MsBackendHdf5Peaks} backend the m/z and intensity values in the original -hdf5 file(s) will be overwritten. -The function returns a \code{Spectra} of length equal to the unique levels -of \code{f}. \item \code{compareSpectra()}: compares each spectrum in \code{x} with each spectrum in \code{y} using the function provided with \code{FUN} (defaults to \code{\link[=ndotproduct]{ndotproduct()}}). If \code{y} is missing, each spectrum in \code{x} is compared with each other spectrum @@ -1438,16 +1460,6 @@ the comparison of \code{x[2]} with \code{y[3]}). If \code{SIMPLIFY = TRUE} the \ is \emph{simplified} to a \code{numeric} if length of \code{x} or \code{y} is one. See also the vignette for additional examples, such as using spectral entropy similarity in the scoring. -\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the -monoisotopic peak for groups of isotopologues. Isotopologues are -estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the \emph{MetaboCoreUtils} -package. Note that the default parameters for isotope -prediction/detection have been determined using data from the Human -Metabolome Database (HMDB) and isotopes for elements other than CHNOPS -might not be detected. See parameter \code{substDefinition} in the -documentation of \code{\link[=isotopologues]{isotopologues()}} for more information. The approach -and code to define the parameters for isotope prediction is described -\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. \item \code{entropy()}: calculates the entropy of each spectra based on the metrics suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). See also \code{\link[=nentropy]{nentropy()}} in the \emph{MsCoreUtils} package for details. @@ -1466,13 +1478,12 @@ from the previous MS1 spectrum. See \code{\link[=estimatePrecursorMz]{estimatePr \code{\link[=neutralLoss]{neutralLoss()}} for detailed documentation. \item \code{processingLog()}: returns a \code{character} vector with the processing log messages. -\item \code{reduceSpectra()}: keeps for groups of peaks with similar m/z values in -(given \code{ppm} and \code{tolerance}) in each spectrum only the peak with the -highest intensity removing all other peaks hence \emph{reducing} each -spectrum to the highest intensity peaks per \emph{peak group}. -Peak groups are defined using the \code{\link[=group]{group()}} function from the -\emph{MsCoreUtils} package. See also the \code{combinePeaks()} function for an -alternative function to combine peaks within each spectrum. 
+\item \code{reset()}: restores the data to its original state (as much as possible):
+removes any processing steps from the lazy processing queue and calls
+\code{reset()} on the backend which, depending on the backend, can also undo
+e.g. data filtering operations. Note that a \code{reset()} call after
+\code{applyProcessing()} will not have any effect. See examples below for more
+information.
 \item \code{scalePeaks()}: scales intensities of peaks within each spectrum depending
 on parameter \code{by}. With \code{by = sum} (the default) peak intensities are
 divided by the sum of peak intensities within each spectrum. The sum of
diff --git a/man/filterPeaksRanges.Rd b/man/filterPeaksRanges.Rd
new file mode 100644
index 00000000..2cfa140d
--- /dev/null
+++ b/man/filterPeaksRanges.Rd
@@ -0,0 +1,137 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Spectra-functions.R
+\name{filterPeaksRanges}
+\alias{filterPeaksRanges}
+\title{Filter peaks based on spectra and peaks variable ranges}
+\usage{
+filterPeaksRanges(object, ..., keep = TRUE)
+}
+\arguments{
+\item{object}{A \link{Spectra} object.}
+
+\item{...}{the ranges for the spectra and/or peaks variables. Has to be
+provided as \verb{<variable name> = <range>} pairs with \verb{<variable name>} being the name of a
+spectra or peaks variable (of numeric data type) and \verb{<range>} being
+either a \code{numeric} of length 2 or a \code{numeric} two-column matrix (see
+function description above for details),}
+
+\item{keep}{\code{logical(1)} whether to keep (default) or remove peaks that
+match the provided range(s).}
+}
+\description{
+The \code{filterPeaksRanges()} function allows to filter the peaks matrices of a
+\link{Spectra} object using any set of range-based filters on numeric spectra
+variables or peaks variables. These ranges can be passed to the function
+using the \code{...} as \verb{<variable name> = <range>} pairs. \verb{<variable name>}
+has to be an available spectra or peaks variable. \verb{<range>} can be a
+\code{numeric} of length 2 defining the lower and upper boundary, or a \code{numeric}
+two-column matrix (multi-row matrices are also supported, see further
+below). \code{filterPeaksRanges(s, mz = c(200, 300))} would for example reduce
+the peaks matrices of the \code{Spectra} object \code{s} to mass peaks with an m/z
+value between 200 and 300. \code{filterPeaksRanges()} returns the original
+\code{Spectra} object with the filter operation added to the processing queue.
+Thus, the filter gets only applied when the peaks data gets extracted
+with \code{mz()}, \code{intensity()} or \code{peaksData()}. If ranges for both spectra
+\strong{and} peaks variables are defined, the function evaluates first whether
+the spectra variable value for a spectrum is within the provided range and,
+if so, applies also the peaks variable-based filter (otherwise an empty
+peaks matrix is returned).
+
+If more than one spectra variable and/or peaks variable are defined, their
+filter results are combined with a logical AND: a peak matrix is only
+returned for a spectrum if all values of spectra variables are within the
+provided (respective) ranges for spectra variables, and this matrix is
+further filtered to contain only those peaks whose values are within the
+provided peaks variable ranges.
+
+\strong{Filtering with multiple ranges} per spectra and peaks variables is also
+supported: ranges can also be provided as multi-row numeric (two-column)
+matrices. In this case, the above described procedure is applied for each
+row separately and their results are combined with a logical OR, i.e.
+peaks matrices are returned that match any of the conditions/filters
+of a row. The number of rows of the provided ranges (be it for spectra
+or peaks variables) has to match.
+
+\strong{Missing value handling}: any comparison which involves a missing value
+(be it a spectra variable value, a peaks variable value or a value
+in one of the provided ranges) is treated as a logical \code{FALSE}. For
+example, if the retention time of a spectrum is \code{NA} and the data is
+filtered using a retention time range, an empty peaks matrix is returned
+(for \code{keep = TRUE}; for \code{keep = FALSE} the full peaks matrix is returned).
+}
+\note{
+In contrast to other \emph{filter} functions, this function does not provide a
+\code{msLevel.} parameter to apply the filter only on spectra of the specified
+MS levels. Instead, to apply no, or different, filters to spectra from
+different MS levels, multi-row range matrices can be used (see examples
+below).
+}
+\examples{
+
+## Define a test Spectra
+d <- data.frame(rtime = c(123.2, 134.2), msLevel = c(1L, 2L))
+d$mz <- list(c(100.1, 100.2, 100.3, 200.1, 200.2, 300.3),
+             c(100.3, 100.4, 200.2, 400.3, 400.4))
+## Use the index of the mass peak within the spectrum as index for
+## better illustration of filtering results
+d$intensity <- list(c(1:6), 1:5)
+s <- Spectra(d)
+s
+
+## Filter peaks removing all mass peaks with an m/z between 200 and 300
+res <- filterPeaksRanges(s, mz = c(200, 300), keep = FALSE)
+res
+
+## The Spectra object still has the same length and spectra variables
+length(res)
+res$rtime
+
+## The filter gets applied when mass peak data gets extracted, using either
+## `mz()`, `intensity()` or `peaksData()`. The filtered peaks data does
+## not contain any mass peaks with m/z values between 200 and 300:
+peaksData(res)[[1L]]
+peaksData(res)[[2L]]
+
+## We next combine spectra and filter variables. We want to keep only mass
+## peaks of MS2 spectra that have an m/z between 100 and 110.
+res <- filterPeaksRanges(s, mz = c(100, 110), msLevel = c(2, 2))
+res
+length(res)
+
+## Only data for peaks are returned for which the spectra's MS level is
+## between 2 and 2 and with an m/z between 100 and 110. The peaks data for
+## the first spectrum, that has MS level 1, is thus empty:
+peaksData(res)[[1L]]
+
+## While the peaks matrix for the second spectrum (with MS level 2) contains
+## the mass peaks with m/z between 100 and 110.
+peaksData(res)[[2L]]
+
+## To also keep the peaks data for the first spectrum, we need to define
+## an additional set of ranges, which we define using a second row in each
+## ranges matrix. We use the same filter as above, i.e. keeping only mass
+## peaks with an m/z between 100 and 110 for spectra with MS level 2, but
+## add an additional row for MS level 1 spectra keeping mass peaks with an
+## m/z between 0 and 1000. Filter results of different rows are combined
+## using a logical OR, i.e. peaks matrices with mass peaks are returned
+## matching either the first, or the second row.
+res <- filterPeaksRanges(s, mz = rbind(c(100, 110), c(0, 1000)),
+                         msLevel = rbind(c(2, 2), c(1, 1)))
+
+## The results for the MS level 2 spectrum are the same as before, but with
+## the additional row we keep the full peaks matrix of the MS1 spectrum:
+peaksData(res)[[1L]]
+peaksData(res)[[2L]]
+
+## As a last example we define a filter that keeps all mass peaks with an
+## m/z either between 100 and 200, or between 300 and 400.
+res <- filterPeaksRanges(s, mz = rbind(c(100, 200), c(300, 400))) +peaksData(res)[[1L]] +peaksData(res)[[2L]] + +## Such filters could thus be defined to restrict/filter the MS data to +## specific e.g. retention time and m/z ranges. +} +\author{ +Johannes Rainer +} diff --git a/tests/testthat/test_Spectra-functions.R b/tests/testthat/test_Spectra-functions.R index ec73a72f..7fabfbb5 100644 --- a/tests/testthat/test_Spectra-functions.R +++ b/tests/testthat/test_Spectra-functions.R @@ -861,3 +861,75 @@ test_that("processingChunkFactor works", { expect_error(processingChunkFactor("a"), "Spectra") }) + +test_that("filterPeaksRanges,Spectra works", { + df <- data.frame(rtime = 123.3, new_var = 4, msLevel = 2L) + df$mz <- list(c(100.1, 100.2, 100.3, 100.4, 200.1, 200.2, 200.3, + 300.1, 300.3, 300.4, 300.5)) + df$intensity <- list(1:11) + s <- Spectra(df) + ## Check errors + expect_error(filterPeaksRanges(3), "'Spectra' object") + expect_error(filterPeaksRanges(s, rtime = c(1, 2), not_exist = c(1, 2)), + "valid spectra variables") + expect_error(filterPeaksRanges(s, rtime = 2, mz = c(1, 2)), + "'numeric' of length 2") + expect_error(filterPeaksRanges( + s, rtime = rbind(c(1, 2), c(2, 3)), mz = c(1, 2)), + "Number of rows of the range matrices") + + ## Single range per variable + res <- filterPeaksRanges(s, rtime = c(100, 200), mz = cbind(200, 300)) + expect_true(inherits(res, "Spectra")) + expect_true(length(res@processingQueue) > 0L) + expect_equal(res@processingQueueVariables, c("rtime", "msLevel")) + expect_equal(length(res@processing), 1L) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], c(5:7)) + res <- filterPeaksRanges(s, rtime = c(100, 200), mz = cbind(200, 300), + keep = FALSE) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], c(1:4, 8:11)) + + ## Multiple ranges per variable + res <- filterPeaksRanges( + s, new_var = rbind(c(1, 8), c(1, 4), c(1, 5)), + rtime = rbind(c(100, 200), c(400, 500), c(100, 200)), + mz = rbind(c(100, 100.3), c(0, 500), c(300.3, 310))) + expect_true(inherits(res, "Spectra")) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], c(1:3, 9:11)) + res <- filterPeaksRanges( + s, new_var = rbind(c(1, 8), c(1, 4), c(1, 5)), + rtime = rbind(c(100, 200), c(400, 500), c(100, 200)), + mz = rbind(c(100, 100.3), c(0, 500), c(300.3, 310)), keep = FALSE) + expect_true(inherits(res, "Spectra")) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], c(4:8)) + + ## Filter also with msLevel; to have the same behaviour as with other + ## filters we would need to add a second filter for e.g. 
MS level 2 + s <- c(s, s) + s$msLevel <- c(1L, 2L) + res <- filterPeaksRanges(s, rtime = c(100, 200), msLevel = c(1, 1), + mz = c(100, 200)) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], 1:4) + a <- peaksData(res)[[2L]] + expect_true(nrow(a) == 0L) + res <- filterPeaksRanges(s, rtime = rbind(c(100, 200), c(100, 200)), + msLevel = rbind(c(1, 1), c(2, 2)), + mz = rbind(c(100, 200), c(0, 400))) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], 1:4) + a <- peaksData(res)[[2L]] + expect_equal(a[, 2L], 1:11) + res <- filterPeaksRanges(s, rtime = rbind(c(100, 200), c(100, 200)), + msLevel = rbind(c(1, 1), c(2, 2)), + mz = rbind(c(100, 200), c(0, 400)), + keep = FALSE) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], 5:11) + a <- peaksData(res)[[2L]] + expect_true(nrow(a) == 0) +}) diff --git a/tests/testthat/test_peaks-functions.R b/tests/testthat/test_peaks-functions.R index b7204b46..f28452dd 100644 --- a/tests/testthat/test_peaks-functions.R +++ b/tests/testthat/test_peaks-functions.R @@ -490,3 +490,235 @@ test_that(".peaks_filter_precursor_keep_below works", { precursorMz = 14.2, tolerance = 0.1) expect_equal(unname(res[, "intensity"]), 1) }) + +test_that(".peaks_filter_ranges works", { + ## Testing all possible combinations, with/without spectra and/or peaks + ## variables, single/multiple variables, single/multiple rows, NA handling + x <- cbind(mz = c(100.1, 100.2, 100.3, 100.4, + 104.1, 104.2, + 200.3, 200.4, 200.5, + 300.1, 300.2), + intensity = 1:11) + ## res <- .peaks_filter_ranges(x, spectrumMsLevel = 1L, msLevel = 2L) + ## expect_equal(res, x) + + ## Single filters. + ranges <- list(rtime = cbind(1, 2), new_var = cbind(3, 4), + mz = cbind(200, 201), intensity = cbind(8, 9)) + + ## * No peaks variables. + pvars <- character() + svars <- c("rtime", "new_var") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + + ## * No spectra variables. 
+ pvars <- c("mz", "intensity") + svars <- character() + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, "intensity"], 8:9) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, "intensity"], c(1:7, 10:11)) + ranges$mz <- cbind(100, 106) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + ranges$mz <- cbind(200, 201) + + ## * Spectra and peaks variables. + svars <- c("rtime") + pvars <- c("mz") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, "intensity"], 7:9) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + ranges = ranges, spectrumMsLevel = 1L, + keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, "intensity"], c(1:6, 10:11)) + + ## Multiple filters. + ranges <- list(rtime = rbind(c(1, 2), c(0, 4), c(2, 3)), + new_var = rbind(c(3, 4), c(1, 9), c(3, 5)), + mz = rbind(c(200, 201), c(100, 101), c(200, 201)), + intensity = rbind(c(8, 9), c(1, 20), c(3, 12))) + + ## * No peaks variables. 
+ svars <- c("rtime", "new_var") + pvars <- character() + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 20, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0L) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 20, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + + ## * No spectra variables. + svars <- character() + pvars <- c("mz", "intensity") + res <- .peaks_filter_ranges(x, pvars = pvars, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, 2L], sort(c(8, 9, 1, 2, 3, 4, 7))) + res <- .peaks_filter_ranges(x, pvars = pvars, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, 2L], c(5:6, 10:11)) + res <- .peaks_filter_ranges(x, pvars = c("intensity"), ranges = ranges, + spectrumMsLevel = 1L) + expect_equal(res, x) + res <- .peaks_filter_ranges(x, pvars = c("intensity"), ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_equal(res, x[logical(), , drop = FALSE]) + + ## * Spectra and peaks variables. 
+ svars <- c("rtime") + pvars <- c("mz") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 2, + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res[, 2L], c(1:4, 7:9)) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 2, + ranges = ranges, spectrumMsLevel = 1L, + keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res[, 2L], c(5:6, 10:11)) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res[, 2L], 1:4) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + ranges = ranges, spectrumMsLevel = 1L, + keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res[, 2L], 5:11) + + ## Handling NA + ## * spectra variable value is NA + ranges <- lapply(ranges, function(z) z[1, , drop = FALSE]) + svars <- "rtime" + pvars <- c("mz", "intensity") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = NA, + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x[logical(), , drop = FALSE]) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = NA, + ranges = ranges, spectrumMsLevel = 1L, + keep = FALSE) + expect_equal(res, x) + + svars <- c("rtime", "new_var") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = NA, + ranges = ranges, spectrumMsLevel = 1L, + new_var = 3) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x[logical(), , drop = FALSE]) + + ## * peaks variable value is NA + x[8, 2L] <- NA_real_ + res <- .peaks_filter_ranges(x, pvars = c("mz", "intensity"), + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(unname(res[, 2L]), 9L) + + ## * range value is NA + ranges$rtime <- cbind(NA, 2) + res <- .peaks_filter_ranges(x, svars = c("rtime", "new_var"), rtime = 2, + new_var = 3, spectrumMsLevel = 1L, + ranges = ranges) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x[logical(), , drop = FALSE]) + res <- .peaks_filter_ranges(x, svars = c("rtime", "new_var"), rtime = 2, + new_var = 3, spectrumMsLevel = 1L, + ranges = ranges, keep = FALSE) + expect_equal(res, x) +}) diff --git a/vignettes/Spectra.Rmd b/vignettes/Spectra.Rmd index 4fbbb95d..8d383700 100644 --- a/vignettes/Spectra.Rmd +++ b/vignettes/Spectra.Rmd @@ -360,25 +360,39 @@ Similar to spectra variables it is also possible to replace values for **existing** peaks variables using the `$<-` function. -## Filtering, subsetting and merging +## Filtering, aggregating and merging spectra data -Apart from *classical* subsetting operations such as `[` and `split()`, a set of -filter functions are defined for `Spectra` objects (for detailed help please see -the `?Spectra` help). Filter and subset functions either reduce the number of -spectra within a `Spectra` object, or affect the number of -peaks (by either aggregating or subset) within each spectrum. Filter functions -affecting the total number of spectra are (in alphabetic order): +Various functions are available to filter, subset and merge `Spectra` +objects. 
These can be generally subdivided into functions that subset or filter
+*spectra data* and operations that filter *mass peak data*. A third category of
+functions allows to aggregate data within a `Spectra` or to merge and combine
+multiple `Spectra` objects into one. Functions of the various categories are
+described in the following subsections. Please refer to the functions'
+documentation for more details and information.
+
+### Filter spectra data
+
+These functions comprise subset operations that reduce the total number of
+spectra in a `Spectra` object as well as filter functions that reduce the
+content of the `Spectra`'s spectra data (i.e. the content of its
+`spectraVariables()`). These functions thus don't change or affect the mass
+peaks data of the `Spectra`'s individual spectra.
+
+- `[`: operation to reduce a `Spectra` object to selected elements.
+- `dropNaSpectraVariables()`: drops `spectraVariables()` that contain only
+  missing values. The function returns a `Spectra` object with the same number
+  of elements, but with possibly fewer spectra variables.
 - `filterAcquisitionNum()`: retains spectra with certain acquisition numbers.
 - `filterDataOrigin()`: subsets to spectra from specific origins.
 - `filterDataStorage()`: subsets to spectra from certain data storage files.
 - `filterEmptySpectra()`: removes spectra without mass peaks.
-- `filterMzRange()`: subsets spectra keeping only peaks with an m/z within the
-  provided m/z range.
 - `filterIsolationWindow()`: keeps spectra with the provided `mz` in their
   isolation window (m/z range).
 - `filterMsLevel()`: filters by MS level.
 - `filterPolarity()`: filters by polarity.
+- `filterPrecursorCharge()`: retains (MSn) spectra with specified
+  precursor charge(s).
 - `filterPrecursorIsotopes()`: identifies precursor ions (from fragment
   spectra) that could represent isotopes of the same molecule. For each of
   these spectra groups only the spectrum of the monoisotopic precursor ion is
   returned. MS1
@@ -390,53 +404,59 @@ affecting the total number of spectra are (in alphabetic order):
   the provided m/z range.
 - `filterPrecursorMzValues(()`: retains (MSn) spectra with precursor m/z value
   matching the provided value(s) considering also a `tolerance` and `ppm`.
-- `filterPrecursorCharge()`: retains (MSn) spectra with specified
-  precursor charge(s).
 - `filterPrecursorScan()`: retains (parent and children) scans of an
   acquisition number.
-- `filterRanges()`: allows filtering of the `Spectra` object based on user
-  defined *numeric* ranges (parameter `ranges`) for one or more available
-  spectra variables in object (spectra variable names can be specified with
-  parameter `spectraVariables`). Spectra for which the value of a spectra
-  variable is within it's defined range are retained. If multiple
-  ranges/spectra variables are defined, the `match` parameter can be used
-  to specify whether all conditions (`match = "all"`; the default) or if
-  any of the conditions must match (`match = "any"`; all spectra for which
-  values are within any of the provided ranges are retained).
+- `filterRanges()`: filters a `Spectra` object based on (multiple) user
+  defined *numeric* ranges for one or more available (numeric) spectra
+  variables.
 - `filterRt()`: filters based on retention time range.
-- `filterValues()`: allows filtering of the `Spectra` object based on
-  similarities of *numeric* values of one or more `spectraVariables(object)`
-  (parameter `spectraVariables`) to provided values (parameter `values`)
-  given acceptable differences (parameters tolerance and ppm). If multiple
-  values/spectra variables are defined, the `match` parameter can be used
-  to specify whether all conditions (`match = "all"`; the default) or if
-  any of the conditions must match (`match = "any"`; all spectra for which
-  values are within any of the provided ranges are retained).
-- `combineSpectra()`: allows to combine the MS data from sets of spectra into a
-  single spectrum per set. Thus, instead of filtering the data, this function
-  aggregates it.
+- `filterValues()`: filters a `Spectra` object based on similarities of
+  *numeric* values of one or more available spectra variables.
+- `selectSpectraVariables()`: reduces the (spectra) data within the object to
+  the selected spectra variables.
+
 
-Filter functions that return the same number of spectra, but affect/subset the
-peaks data (m/z and intensity values) within each spectrum are:
+### Filter or aggregate mass peak data
+
+These functions filter or aggregate the mass peak data (`peaksData()`) of each
+spectrum in a `Spectra` without changing the total number of spectra.
 
 - `combinePeaks()`: groups peaks **within each spectrum** based on similarity
   of their m/z values and combines these into a single peak per peak group.
 - `deisotopeSpectra()`: deisotopes each individual spectrum keeping only the
   monoisotopic peak for peaks groups of potential isotopologues.
+- `filterFourierTransformArtefacts()`: removes (Orbitrap) fast Fourier transform
+  artefact peaks from spectra.
 - `filterIntensity()`: filter each spectrum keeping only peaks with intensities
   meeting certain criteria.
-- `filterMzRange()`: subsets peaks data within each spectrum keeping only peaks
-  with their m/z values within the specified m/z range.
+- `filterMzRange()`: filters mass peaks keeping (or removing) those with an
+  m/z within the provided m/z range.
+- `filterMzValues()`: filters mass peaks within each spectrum keeping (or
+  removing) those with an m/z matching the provided value(s).
+- `filterPeaksRanges()`: filters mass peaks using any set of range-based filters
+  on numeric spectra or peaks variables.
 - `filterPrecursorPeaks()`: removes peaks with either an m/z value matching the
   precursor m/z of the respective spectrum (with parameter `mz = "=="`) or
   peaks with an m/z value larger or equal to the precursor m/z (with parameter
   `mz = ">="`).
-- `filterMzValues()`: subsets peaks within each spectrum keeping or removing
-  (all) peaks matching provided m/z value(s) (given parameters `ppm` and
-  `tolerance`).
 - `reduceSpectra()`: filters individual spectra keeping only the largest peak
   for groups of peaks with similar m/z values.
+
+### Merging, aggregating and splitting
+
+
+- `c()`: combines several `Spectra` into a single `Spectra` object.
+- `combineSpectra()`: combines the MS data from sets of spectra into a
+  single spectrum per set. Thus, instead of filtering the data, this function
+  aggregates it.
+- `joinSpectraData()`: merges a `DataFrame` with the existing spectra data.
+- `split()`: splits the `Spectra` object based on a provided grouping factor.
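+
+Below is a minimal, not evaluated sketch combining some of the functions listed
+above. It assumes `sps` is a `Spectra` object, for example one of the objects
+created in the previous sections:
+
+```{r filter-overview-sketch, eval = FALSE}
+## Restrict to MS2 spectra acquired between 200 and 300 seconds
+ms2 <- filterMsLevel(sps, 2L)
+ms2 <- filterRt(ms2, c(200, 300))
+
+## Keep only mass peaks with an m/z between 100 and 500; like all mass peak
+## filters, this operation is added to the processing queue and gets applied
+## only when peaks data is accessed (or `applyProcessing()` is called)
+ms2 <- filterMzRange(ms2, c(100, 500))
+
+## Split the result into a list of Spectra, one per original data file
+ms2_list <- split(ms2, factor(dataOrigin(ms2)))
+```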
+ + + +### Examples and use cases for filter operations + In this example, we use the `filterValues()` function to retain spectra with a base peak m/z close to 100 (+/- 30 ppm) and a retention time around 230 (+/- 5 s). From 4ad59d11ea40da7c5f2371e1c8d3cd0094c9d96f Mon Sep 17 00:00:00 2001 From: Laurent Gatto Date: Tue, 16 Jul 2024 18:32:03 +0200 Subject: [PATCH 12/41] Adds asDataFrame() function (close #325) --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 2 ++ R/Spectra.R | 25 ++++++++++++++++++++++++- man/Spectra.Rd | 18 +++++++++++++++++- tests/testthat/test_Spectra.R | 21 +++++++++++++++++++++ 6 files changed, 66 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f61a77b5..73d5926c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -73,7 +73,7 @@ BugReports: https://github.com/RforMassSpectrometry/Spectra/issues URL: https://github.com/RforMassSpectrometry/Spectra biocViews: Infrastructure, Proteomics, MassSpectrometry, Metabolomics Encoding: UTF-8 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Roxygen: list(markdown=TRUE) Collate: 'hidden_aliases.R' diff --git a/NAMESPACE b/NAMESPACE index 269d2e35..80588f99 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(MsBackendMemory) export(MsBackendMzR) export(PrecursorMzParam) export(applyProcessing) +export(asDataFrame) export(chunkapply) export(combinePeaksData) export(combineSpectra) diff --git a/NEWS.md b/NEWS.md index 73614759..32664ccb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ *MetaboCoreUtils* directly through their namespace (`MsCoreUtils::`) to avoid errors if performed in parallel on Windows machines or if called on a re-loaded object. +- New `asDataFrame()` function to convert a (small) `Spectra` object + into a long `DataFrame`. ## Changes in 1.15.2 diff --git a/R/Spectra.R b/R/Spectra.R index cf212594..397c5993 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -898,7 +898,9 @@ NULL #' window left and right of a peak where to remove fourier transform #' artefacts. #' -#' @param i For `[`: `integer`, `logical` or `character` to subset the object. +#' @param i For `[`: `integer`, `logical` or `character` to subset the +#' object. For `asDataFrame()` an `numeric` indicating which scans to coerce +#' to a `DataFrame` (default is `seq_along(object)`). #' #' @param j For `[`: not supported. #' @@ -2831,3 +2833,24 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { dataStorageBasePath(object@backend) <- value object }) + +#' @export +#' @rdname Spectra +#' +#' @param spectraVars `character()` indicating what spectra variables to add to +#' the `DataFrame`. Default is `spectraVariables(object)`, i.e. all +#' available variables. +#' +#' @examples +#' +#' ## Convert a subset of the Spectra object to a long DataFrame. 
+#' asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) +asDataFrame <- function(object, i = seq_along(object), + spectraVars = spectraVariables(object)) { + stopifnot(inherits(object, "Spectra")) + object <- object[i] + n <- sapply(peaksData(object), nrow) + v <- spectraData(object)[rep(seq_along(object), n), spectraVars] + p <- do.call(rbind, as.list(peaksData(object))) + cbind(p, v) +} \ No newline at end of file diff --git a/man/Spectra.Rd b/man/Spectra.Rd index 399b8262..f87cddbe 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -110,6 +110,7 @@ \alias{entropy,ANY-method} \alias{dataStorageBasePath,Spectra-method} \alias{dataStorageBasePath<-,Spectra-method} +\alias{asDataFrame} \title{The Spectra class to manage and access MS data} \usage{ applyProcessing( @@ -510,6 +511,12 @@ coreSpectraVariables() \S4method{dataStorageBasePath}{Spectra}(object) \S4method{dataStorageBasePath}{Spectra}(object) <- value + +asDataFrame( + object, + i = seq_along(object), + spectraVars = spectraVariables(object) +) } \arguments{ \item{object}{For \code{Spectra()}: either a \code{DataFrame} or \code{missing}. See @@ -688,7 +695,9 @@ total ion current should be (re)calculated on the actual data \item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return or set.} -\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the object.} +\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the +object. For \code{asDataFrame()} an \code{numeric} indicating which scans to coerce +to a \code{DataFrame} (default is \code{seq_along(object)}).} \item{j}{For \code{[}: not supported.} @@ -838,6 +847,10 @@ value using an intensity-weighted mean. Defaults to \code{weighted = TRUE}.} \item{normalized}{for \code{entropy()}: \code{logical(1)} whether the normalized entropy should be calculated (default). See also \code{\link[=nentropy]{nentropy()}} for details.} + +\item{spectraVars}{\code{character()} indicating what spectra variables to add to +the \code{DataFrame}. Default is \code{spectraVariables(object)}, i.e. all +available variables.} } \value{ See individual method description for the return value. @@ -1961,6 +1974,9 @@ peaksData(sps, columns = peaksVariables(sps))[[1L]] ## Access just the pk_ann variable sps$pk_ann + +## Convert a subset of the Spectra object to a long DataFrame. 
+asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) } \author{ Nir Shahaf, Johannes Rainer diff --git a/tests/testthat/test_Spectra.R b/tests/testthat/test_Spectra.R index b0cda2ca..5169b2fe 100644 --- a/tests/testthat/test_Spectra.R +++ b/tests/testthat/test_Spectra.R @@ -1908,3 +1908,24 @@ test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { #' errors expect_error(dataStorageBasePath(tmp) <- "some path", "Provided path") }) + + +test_that("asDataFrame works", { + sciex_file <- normalizePath( + dir(system.file("sciex", package = "msdata"), full.names = TRUE)) + sp <- Spectra(sciex_file) + ## Full dataframe + df <- asDataFrame(sp) + expect_identical(nrow(df), sum(sapply(peaksData(sp), nrow))) + expect_identical(ncol(df), length(spectraVariables(sp)) + 2L) + expect_identical(names(df), c("mz", "intensity", spectraVariables(sp))) + ## Three first scans and 2 spectra variables + df <- asDataFrame(sp, i = 1:3, spectraVars = c("msLevel", "rtime")) + expect_identical(nrow(df), sum(sapply(peaksData(sp[1:3]), nrow))) + expect_identical(ncol(df), 2L + 2L) + ## Three first scans and no spectra variables + df <- asDataFrame(sp, i = 1:3, spectraVars = NULL) + expect_identical(nrow(df), sum(sapply(peaksData(sp[1:3]), nrow))) + expect_identical(ncol(df), 2L) + expect_identical(names(df), c("mz", "intensity")) +}) \ No newline at end of file From ba7c8ba1e14b61c056effb788398ee9f39ebd4c1 Mon Sep 17 00:00:00 2001 From: Laurent Gatto Date: Tue, 16 Jul 2024 18:35:24 +0200 Subject: [PATCH 13/41] bump version --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 73d5926c..082e156a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.3 +Version: 1.15.4 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index 32664ccb..6fc3f7c0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # Spectra 1.15 +## Changes in 1.15.4 + +- Nothing yet. + ## Changes in 1.15.3 - For evaluation of the `Spectra`'s processing queue: call functions from the From bd1784b5735c924fb95e79aa368219e5f7654e0f Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Thu, 18 Jul 2024 17:01:30 +0200 Subject: [PATCH 14/41] docs: fix Phili's comments --- DESCRIPTION | 2 +- R/MsBackend.R | 2 +- R/Spectra-functions.R | 23 ++++++++++++++--------- R/Spectra.R | 15 +++++++++------ man/MsBackend.Rd | 2 +- man/Spectra.Rd | 15 +++++++++------ man/filterPeaksRanges.Rd | 23 ++++++++++++++--------- 7 files changed, 49 insertions(+), 33 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 082e156a..47b81ac5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.4 +Version: 1.15.5 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/R/MsBackend.R b/R/MsBackend.R index dc73ee5f..eae122b1 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -774,7 +774,7 @@ #' #' @return See documentation of respective function. 
#' -#' @author Johannes Rainer, Sebastian Gibb, Laurent Gatto +#' @author Johannes Rainer, Sebastian Gibb, Laurent Gatto, Philippine Louail #' #' @md #' diff --git a/R/Spectra-functions.R b/R/Spectra-functions.R index 67d47517..8bb9042d 100644 --- a/R/Spectra-functions.R +++ b/R/Spectra-functions.R @@ -1194,11 +1194,11 @@ processingChunkFactor <- function(x) { #' has to be an available spectra or peaks variable. `` can be a #' `numeric` of length 2 defining the lower and upper boundary, or a `numeric` #' two-column matrix (multi-row matrices are also supported, see further -#' below). `filterPeaksRanges(s, mz = c(200, 300))` woudl for example reduce +#' below). `filterPeaksRanges(s, mz = c(200, 300))` would for example reduce #' the peaks matrices of the `Spectra` object `s` to mass peaks with an m/z #' value between 200 and 300. `filterPeaksRanges()` returns the original #' `Spectra` object with the filter operation added to the processing queue. -#' Thus, the filter gets only applied when the peaks data gets extracted +#' Thus, the filter gets **only** applied when the peaks data gets extracted #' with `mz()`, `intensity()` or `peaksData()`. If ranges for both spectra #' **and** peaks variables are defined, the function evaluates first whether #' the spectra variable value for a spectrum is within the provided range and, @@ -1221,19 +1221,24 @@ processingChunkFactor <- function(x) { #' or peaks variables) have to match. #' #' **Missing value handling**: any comparison which involves a missing value -#' (beingvit a spectra variable value, a peaks variable value or a value -#' in one of thevprovided ranges) is treated as a logical `FALSE`. For +#' (being it a spectra variable value, a peaks variable value or a value +#' in one of the provided ranges) is treated as a logical `FALSE`. For #' example, if the retention time of a spectrum is `NA` and the data is #' filtered using a retention time range, an empty peaks matrix is returned #' (for `keep = TRUE`, for `keep = FALSE` the full peaks matrix is returned). #' #' @note #' -#' In contrast to other *filter* functions, this function does not provide a -#' `msLevel.` parameter to apply the filter only on spectra of the specified -#' MS levels. In contrast, to apply no, or different, filters to spectra from -#' different MS levels, multi-row range matrices can be used (see examples -#' below). +#' In contrast to some other *filter* functions, this function does not provide +#' a `msLevel` parameter that allows to define the MS level of spectra on which +#' the filter should be applied. The filter(s) will always be applied to +#' **all** spectra (irrespectively of their MS level). Through combination of +#' multiple filter ranges it is however possible to apply MS level-dependent +#' filters (see examples below for details). +#' +#' The filter will not be applied immediately to the data but only executed when +#' the mass peak data is accessed (through `peaksData()`, `mz()` or +#' `intensity()`) or by calling `applyProcessing()`. #' #' @param object A [Spectra] object. #' diff --git a/R/Spectra.R b/R/Spectra.R index 24b5f8ae..a20a44ba 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -334,10 +334,10 @@ NULL #' @section Filter spectra data: #' #' Filter a `Spectra` object based on the spectra data. This includes subset -#' operations that reduce the number of spectra in the object as well as filters -#' that reduce the *content* of the `Spectra` object. 
See section -#' *Filter peaks data* below for functions that filter the peaks data of a -#' `Spectra`. +#' operations that immediately reduce the number of spectra in the object as +#' well as filters that reduce the *content* of the `Spectra` object. +#' See section *Filter peaks data* below for functions that filter the peaks +#' data of a `Spectra`. #' #' - `[`: subsets the spectra keeping only selected elements (`i`). The method #' **always** returns a `Spectra` object. @@ -456,7 +456,10 @@ NULL #' @section Filter or aggregate mass peak data: #' #' Operations that filter or aggregate the mass peak data from each spectrum -#' without changing the number of spectra in a `Spectra` object. +#' without changing the number of spectra in a `Spectra` object. Also, the +#' actual subsetting/aggregation operation is only executed once peaks data is +#' accessed (through `peaksData()`, `mz()` or `intensity()`) or +#' `applyProcessing()` is called. #' #' - `combinePeaks()`: combines mass peaks **within each spectrum** with a #' difference in their m/z values that is smaller than the maximal @@ -1115,7 +1118,7 @@ NULL #' #' @param ... Additional arguments. #' -#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail #' #' @md #' diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index 2e9292e9..19bd8b7c 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -1090,5 +1090,5 @@ be$peak_ann <- NULL peaksVariables(be) } \author{ -Johannes Rainer, Sebastian Gibb, Laurent Gatto +Johannes Rainer, Sebastian Gibb, Laurent Gatto, Philippine Louail } diff --git a/man/Spectra.Rd b/man/Spectra.Rd index 53c36e5a..b4f87b54 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -1142,10 +1142,10 @@ function is supposed to be more efficient than \code{unique(msLevel(object))}. Filter a \code{Spectra} object based on the spectra data. This includes subset -operations that reduce the number of spectra in the object as well as filters -that reduce the \emph{content} of the \code{Spectra} object. See section -\emph{Filter peaks data} below for functions that filter the peaks data of a -\code{Spectra}. +operations that immediately reduce the number of spectra in the object as +well as filters that reduce the \emph{content} of the \code{Spectra} object. +See section \emph{Filter peaks data} below for functions that filter the peaks +data of a \code{Spectra}. \itemize{ \item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method \strong{always} returns a \code{Spectra} object. @@ -1248,7 +1248,10 @@ Returns the filtered \code{Spectra}. Operations that filter or aggregate the mass peak data from each spectrum -without changing the number of spectra in a \code{Spectra} object. +without changing the number of spectra in a \code{Spectra} object. Also, the +actual subsetting/aggregation operation is only executed once peaks data is +accessed (through \code{peaksData()}, \code{mz()} or \code{intensity()}) or +\code{applyProcessing()} is called. 
\itemize{ \item \code{combinePeaks()}: combines mass peaks \strong{within each spectrum} with a difference in their m/z values that is smaller than the maximal @@ -1996,5 +1999,5 @@ Nir Shahaf Johannes Rainer -Sebastian Gibb, Johannes Rainer, Laurent Gatto +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail } diff --git a/man/filterPeaksRanges.Rd b/man/filterPeaksRanges.Rd index 2cfa140d..db713c3b 100644 --- a/man/filterPeaksRanges.Rd +++ b/man/filterPeaksRanges.Rd @@ -26,11 +26,11 @@ using the \code{...} as \verb{ = } pairs. \verb{} can be a \code{numeric} of length 2 defining the lower and upper boundary, or a \code{numeric} two-column matrix (multi-row matrices are also supported, see further -below). \code{filterPeaksRanges(s, mz = c(200, 300))} woudl for example reduce +below). \code{filterPeaksRanges(s, mz = c(200, 300))} would for example reduce the peaks matrices of the \code{Spectra} object \code{s} to mass peaks with an m/z value between 200 and 300. \code{filterPeaksRanges()} returns the original \code{Spectra} object with the filter operation added to the processing queue. -Thus, the filter gets only applied when the peaks data gets extracted +Thus, the filter gets \strong{only} applied when the peaks data gets extracted with \code{mz()}, \code{intensity()} or \code{peaksData()}. If ranges for both spectra \strong{and} peaks variables are defined, the function evaluates first whether the spectra variable value for a spectrum is within the provided range and, @@ -53,18 +53,23 @@ of a row. The number of rows of the provided ranges (being it for spectra or peaks variables) have to match. \strong{Missing value handling}: any comparison which involves a missing value -(beingvit a spectra variable value, a peaks variable value or a value -in one of thevprovided ranges) is treated as a logical \code{FALSE}. For +(being it a spectra variable value, a peaks variable value or a value +in one of the provided ranges) is treated as a logical \code{FALSE}. For example, if the retention time of a spectrum is \code{NA} and the data is filtered using a retention time range, an empty peaks matrix is returned (for \code{keep = TRUE}, for \code{keep = FALSE} the full peaks matrix is returned). } \note{ -In contrast to other \emph{filter} functions, this function does not provide a -\code{msLevel.} parameter to apply the filter only on spectra of the specified -MS levels. In contrast, to apply no, or different, filters to spectra from -different MS levels, multi-row range matrices can be used (see examples -below). +In contrast to some other \emph{filter} functions, this function does not provide +a \code{msLevel} parameter that allows to define the MS level of spectra on which +the filter should be applied. The filter(s) will always be applied to +\strong{all} spectra (irrespectively of their MS level). Through combination of +multiple filter ranges it is however possible to apply MS level-dependent +filters (see examples below for details). + +The filter will not be applied immediately to the data but only executed when +the mass peak data is accessed (through \code{peaksData()}, \code{mz()} or +\code{intensity()}) or by calling \code{applyProcessing()}. 
} \examples{ From d0ea0b1d5cac32c7220c1380a933d9b79d466b0a Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Tue, 23 Jul 2024 15:29:15 +0200 Subject: [PATCH 15/41] fix: selectSpectraVariables to not drop peaks variables - Ensure `selectSpectraVariables()` for `MsBackendMzR` is not removing peaks variables `"mz"` and `"intensity"` by default. --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ R/MsBackendDataFrame.R | 4 ++-- tests/testthat/test_MsBackendDataFrame.R | 17 +++++++++++++++++ tests/testthat/test_MsBackendMzR.R | 9 +++++++++ 5 files changed, 34 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 47b81ac5..7006652c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.5 +Version: 1.15.6 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index 383deb8b..953f9468 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # Spectra 1.15 +## Changes in 1.15.6 + +- Fix in `selectSpectraVariables()` for `MsBackendMzR`: ensure peaks variables + `"mz"` and `"intensity"` are not by default removed. + ## Changes in 1.15.5 - Add new `filterPeaksRanges()` function to filter mass peaks by ranges on diff --git a/R/MsBackendDataFrame.R b/R/MsBackendDataFrame.R index 424e64da..c13052b7 100644 --- a/R/MsBackendDataFrame.R +++ b/R/MsBackendDataFrame.R @@ -406,7 +406,7 @@ setMethod("selectSpectraVariables", "MsBackendDataFrame", spectraVariables(object))], collapse = ", "), " not available") keep <- spectraVariables[spectraVariables %in% - colnames(object@spectraData)] + colnames(object@spectraData)] if (length(keep)) object@spectraData <- object@spectraData[, keep, drop = FALSE] @@ -414,7 +414,7 @@ setMethod("selectSpectraVariables", "MsBackendDataFrame", if (length(msg)) stop(msg) object@peaksVariables <- intersect(object@peaksVariables, - colnames(object@spectraData)) + spectraVariables) validObject(object) object }) diff --git a/tests/testthat/test_MsBackendDataFrame.R b/tests/testthat/test_MsBackendDataFrame.R index 3b8fe64c..4b41d6d0 100644 --- a/tests/testthat/test_MsBackendDataFrame.R +++ b/tests/testthat/test_MsBackendDataFrame.R @@ -622,15 +622,32 @@ test_that("selectSpectraVariables,MsBackendDataFrame works", { be <- backendInitialize(MsBackendDataFrame(), df) res <- selectSpectraVariables(be, c("dataStorage", "other_col")) + + expect_equal(res@peaksVariables, be@peaksVariables) expect_equal(colnames(res@spectraData), c("dataStorage", "other_col")) expect_equal(msLevel(res), c(NA_integer_, NA_integer_)) res <- selectSpectraVariables(be, c("dataStorage", "rtime")) expect_equal(colnames(res@spectraData), c("dataStorage", "rtime")) + expect_equal(res@peaksVariables, be@peaksVariables) expect_error(selectSpectraVariables(be, "rtime"), "dataStorage is/are missing") expect_error(selectSpectraVariables(be, "something"), "something not available") + + df$mz <- list(c(1.2, 1.4), c(5.3, 34.5, 52.1)) + df$intensity <- list(c(123, 121.1), c(1231.1, 343.1, 21.1)) + be <- backendInitialize(MsBackendDataFrame(), df) + res <- selectSpectraVariables(be, c("dataStorage", "other_col")) + expect_equal(colnames(res@spectraData), c("dataStorage", "other_col")) + expect_equal(msLevel(res), c(NA_integer_, NA_integer_)) + expect_equal(res@peaksVariables, character()) + + be <- backendInitialize(MsBackendDataFrame(), df) 
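+    ## Keeping "mz" and "intensity" in the selection should retain them as
+    ## peaks variables (in contrast to the selection above).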
+ res <- selectSpectraVariables(be, c("dataStorage", "mz", "intensity")) + expect_equal(colnames(res@spectraData), c("dataStorage", "mz", "intensity")) + expect_equal(msLevel(res), c(NA_integer_, NA_integer_)) + expect_equal(res@peaksVariables, c("mz", "intensity")) }) test_that("$,$<-,MsBackendDataFrame works", { diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index dee66253..d051b8e6 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -493,6 +493,14 @@ test_that("selectSpectraVariables,MsBackendMzR works", { "scanIndex")) expect_equal(colnames(res@spectraData), c("dataStorage", "msLevel", "rtime", "scanIndex")) + expect_equal(res@peaksVariables, character()) + + res <- selectSpectraVariables(be, c("dataStorage", "msLevel", "rtime", + "scanIndex", "mz", "intensity")) + expect_equal(colnames(res@spectraData), c("dataStorage", "msLevel", "rtime", + "scanIndex")) + expect_equal(res@peaksVariables, c("mz", "intensity")) + expect_error(selectSpectraVariables(be, c("dataStorage", "msLevel")), "scanIndex is/are missing") }) @@ -559,6 +567,7 @@ test_that("dropNaSpectraVariables works with MsBackendMzR", { expect_equal(mz(res[1]), mz(sciex_mzr[1])) expect_true(length(spectraVariables(res)) < length(spectraVariables(sciex_mzr))) + expect_equal(res@peaksVariables, sciex_mzr@peaksVariables) }) test_that("supportsSetBackend,MsBackendMzR", { From e93fa632a542c99562feebf8a6ea5a792bed1778 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 31 Jul 2024 13:04:09 +0200 Subject: [PATCH 16/41] refactor: change estimatePrecursorIntensity to a method - Change `estimatePrecursorIntensity()` to a method to avoid overrides by the implementation in *xcms*. --- DESCRIPTION | 4 +- NAMESPACE | 3 +- NEWS.md | 5 ++ R/Spectra-functions.R | 76 ---------------------- R/Spectra.R | 83 ++++++++++++++++++++++++- man/estimatePrecursorIntensity.Rd | 14 ++--- tests/testthat/test_Spectra-functions.R | 10 --- tests/testthat/test_Spectra.R | 12 +++- 8 files changed, 109 insertions(+), 98 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7006652c..c233fc3c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.6 +Version: 1.15.7 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. 
It provides different @@ -41,7 +41,7 @@ Depends: R (>= 4.0.0), S4Vectors, BiocParallel, - ProtGenerics (>= 1.35.4) + ProtGenerics (>= 1.37.1) Imports: methods, IRanges, diff --git a/NAMESPACE b/NAMESPACE index c2a95ca3..0a9fbcf9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,7 +17,6 @@ export(concatenateSpectra) export(coreSpectraVariables) export(countIdentifications) export(deisotopeSpectra) -export(estimatePrecursorIntensity) export(estimatePrecursorMz) export(filterPeaksRanges) export(filterPrecursorIsotopes) @@ -84,6 +83,7 @@ exportMethods(dataStorage) exportMethods(dataStorageBasePath) exportMethods(dropNaSpectraVariables) exportMethods(entropy) +exportMethods(estimatePrecursorIntensity) exportMethods(export) exportMethods(filterAcquisitionNum) exportMethods(filterDataOrigin) @@ -248,6 +248,7 @@ importMethodsFrom(ProtGenerics,collisionEnergy) importMethodsFrom(ProtGenerics,compareSpectra) importMethodsFrom(ProtGenerics,dataOrigin) importMethodsFrom(ProtGenerics,dataStorage) +importMethodsFrom(ProtGenerics,estimatePrecursorIntensity) importMethodsFrom(ProtGenerics,filterAcquisitionNum) importMethodsFrom(ProtGenerics,filterDataOrigin) importMethodsFrom(ProtGenerics,filterDataStorage) diff --git a/NEWS.md b/NEWS.md index 953f9468..49e8e289 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # Spectra 1.15 +## Changes in 1.15.7 + +- Change `estimatePrecursorIntensity()` to a method to avoid overrides/clashes + with the same-named implementation in *xcms*. + ## Changes in 1.15.6 - Fix in `selectSpectraVariables()` for `MsBackendMzR`: ensure peaks variables diff --git a/R/Spectra-functions.R b/R/Spectra-functions.R index 8bb9042d..517452c4 100644 --- a/R/Spectra-functions.R +++ b/R/Spectra-functions.R @@ -690,82 +690,6 @@ processingLog <- function(x) { x@processing } -#' @title Estimate Precursor Intensities -#' -#' @description -#' -#' Some MS instrument manufacturers don't provide precursor intensities for -#' fragment spectra. These can however be estimated, given that also MS1 -#' spectra are available. The `estimatePrecursorIntensity()` funtion defines the -#' precursor intensities for MS2 spectra using the intensity of the matching -#' MS1 peak from the closest MS1 spectrum (i.e. the last MS1 spectrum measured -#' before the respective MS2 spectrum). With `method = "interpolation"` it is -#' also possible to calculate the precursor intensity based on an interpolation -#' of intensity values (and retention times) of the matching MS1 peaks from the -#' previous and next MS1 spectrum. See below for an example. -#' -#' @param x `Spectra` with MS1 and MS2 spectra. -#' -#' @param ppm `numeric(1)` with the maximal allowed relative difference of m/z -#' values between the precursor m/z of a spectrum and the m/z of the -#' respective ion on the MS1 scan. -#' -#' @param tolerance `numeric(1)` with the maximal allowed difference of m/z -#' values between the precursor m/z of a spectrum and the m/z of the -#' respective ion on the MS1 scan. -#' -#' @param method `character(1)` defining whether the precursor intensity -#' should be estimated on the previous MS1 spectrum (`method = "previous"`, -#' the default) or based on an interpolation on the previous and next -#' MS1 spectrum (`method = "interpolation"`). -#' -#' @param msLevel. `integer(1)` the MS level for which precursor intensities -#' should be estimated. Defaults to `2L`. -#' -#' @param f `factor` (or vector to be coerced to `factor`) defining which -#' spectra belong to the same original data file (sample). 
-#' Defaults to `f = dataOrigin(x)`. -#' -#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more -#' information. This is passed directly to the [backendInitialize()] method -#' of the [MsBackend-class]. -#' -#' @author Johannes Rainer with feedback and suggestions from Corey Broeckling -#' -#' @export -#' -#' @rdname estimatePrecursorIntensity -#' -#' @examples -#' -#' #' ## Calculating the precursor intensity for MS2 spectra: -#' ## -#' ## Some MS instrument manufacturer don't report the precursor intensities -#' ## for MS2 spectra. The `estimatePrecursorIntensity` function can be used -#' ## in these cases to calculate the precursor intensity on MS1 data. Below -#' ## we load an mzML file from a vendor providing precursor intensities and -#' ## compare the estimated and reported precursor intensities. -#' tmt <- Spectra(msdata::proteomics(full.names = TRUE)[5], -#' backend = MsBackendMzR()) -#' pmi <- estimatePrecursorIntensity(tmt) -#' plot(pmi, precursorIntensity(tmt)) -#' -#' ## We can also replace the original precursor intensity values with the -#' ## newly calculated ones -#' tmt$precursorIntensity <- pmi -estimatePrecursorIntensity <- function(x, ppm = 20, tolerance = 0, - method = c("previous", "interpolation"), - msLevel. = 2L, f = dataOrigin(x), - BPPARAM = bpparam()) { - if (is.factor(f)) - f <- as.character(f) - f <- factor(f, levels = unique(f)) - BPPARAM <- backendBpparam(x@backend, BPPARAM) - unlist(bplapply(split(x, f), FUN = .estimate_precursor_intensity, ppm = ppm, - tolerance = tolerance, method = method, msLevel = msLevel., - BPPARAM = BPPARAM), use.names = FALSE) -} - #' estimate precursor intensities based on MS1 peak intensity. This function #' assumes that `x` is a `Spectra` with data **from a single file/sample**. #' diff --git a/R/Spectra.R b/R/Spectra.R index a20a44ba..76a467c5 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -2859,4 +2859,85 @@ asDataFrame <- function(object, i = seq_along(object), v <- spectraData(object)[rep(seq_along(object), n), spectraVars] p <- do.call(rbind, as.list(peaksData(object))) cbind(p, v) -} \ No newline at end of file +} + +#' @title Estimate Precursor Intensities +#' +#' @description +#' +#' Some MS instrument manufacturers don't provide precursor intensities for +#' fragment spectra. These can however be estimated, given that also MS1 +#' spectra are available. The `estimatePrecursorIntensity()` funtion defines the +#' precursor intensities for MS2 spectra using the intensity of the matching +#' MS1 peak from the closest MS1 spectrum (i.e. the last MS1 spectrum measured +#' before the respective MS2 spectrum). With `method = "interpolation"` it is +#' also possible to calculate the precursor intensity based on an interpolation +#' of intensity values (and retention times) of the matching MS1 peaks from the +#' previous and next MS1 spectrum. See below for an example. +#' +#' @param object `Spectra` with MS1 and MS2 spectra. +#' +#' @param ppm `numeric(1)` with the maximal allowed relative difference of m/z +#' values between the precursor m/z of a spectrum and the m/z of the +#' respective ion on the MS1 scan. +#' +#' @param tolerance `numeric(1)` with the maximal allowed difference of m/z +#' values between the precursor m/z of a spectrum and the m/z of the +#' respective ion on the MS1 scan. 
+#' +#' @param method `character(1)` defining whether the precursor intensity +#' should be estimated on the previous MS1 spectrum (`method = "previous"`, +#' the default) or based on an interpolation on the previous and next +#' MS1 spectrum (`method = "interpolation"`). +#' +#' @param msLevel. `integer(1)` the MS level for which precursor intensities +#' should be estimated. Defaults to `2L`. +#' +#' @param f `factor` (or vector to be coerced to `factor`) defining which +#' spectra belong to the same original data file (sample). +#' Defaults to `f = dataOrigin(x)`. +#' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. +#' +#' @author Johannes Rainer with feedback and suggestions from Corey Broeckling +#' +#' @importMethodsFrom ProtGenerics estimatePrecursorIntensity +#' +#' @exportMethod estimatePrecursorIntensity +#' +#' @rdname estimatePrecursorIntensity +#' +#' @examples +#' +#' #' ## Calculating the precursor intensity for MS2 spectra: +#' ## +#' ## Some MS instrument manufacturer don't report the precursor intensities +#' ## for MS2 spectra. The `estimatePrecursorIntensity` function can be used +#' ## in these cases to calculate the precursor intensity on MS1 data. Below +#' ## we load an mzML file from a vendor providing precursor intensities and +#' ## compare the estimated and reported precursor intensities. +#' tmt <- Spectra(msdata::proteomics(full.names = TRUE)[5], +#' backend = MsBackendMzR()) +#' pmi <- estimatePrecursorIntensity(tmt) +#' plot(pmi, precursorIntensity(tmt)) +#' +#' ## We can also replace the original precursor intensity values with the +#' ## newly calculated ones +#' tmt$precursorIntensity <- pmi +setMethod( + "estimatePrecursorIntensity", "Spectra", + function(object, ppm = 20, tolerance = 0, + method = c("previous", "interpolation"), + msLevel. = 2L, f = dataOrigin(object), BPPARAM = bpparam()) { + if (is.factor(f)) + f <- as.character(f) + f <- factor(f, levels = unique(f)) + BPPARAM <- backendBpparam(object@backend, BPPARAM) + unlist(bplapply(split(object, f), + FUN = .estimate_precursor_intensity, ppm = ppm, + tolerance = tolerance, method = method, + msLevel = msLevel., BPPARAM = BPPARAM), + use.names = FALSE) + }) diff --git a/man/estimatePrecursorIntensity.Rd b/man/estimatePrecursorIntensity.Rd index e4a7efd9..97a2cde2 100644 --- a/man/estimatePrecursorIntensity.Rd +++ b/man/estimatePrecursorIntensity.Rd @@ -1,21 +1,21 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Spectra-functions.R -\name{estimatePrecursorIntensity} -\alias{estimatePrecursorIntensity} +% Please edit documentation in R/Spectra.R +\name{estimatePrecursorIntensity,Spectra-method} +\alias{estimatePrecursorIntensity,Spectra-method} \title{Estimate Precursor Intensities} \usage{ -estimatePrecursorIntensity( - x, +\S4method{estimatePrecursorIntensity}{Spectra}( + object, ppm = 20, tolerance = 0, method = c("previous", "interpolation"), msLevel. 
= 2L, - f = dataOrigin(x), + f = dataOrigin(object), BPPARAM = bpparam() ) } \arguments{ -\item{x}{\code{Spectra} with MS1 and MS2 spectra.} +\item{object}{\code{Spectra} with MS1 and MS2 spectra.} \item{ppm}{\code{numeric(1)} with the maximal allowed relative difference of m/z values between the precursor m/z of a spectrum and the m/z of the diff --git a/tests/testthat/test_Spectra-functions.R b/tests/testthat/test_Spectra-functions.R index 7fabfbb5..8df50d71 100644 --- a/tests/testthat/test_Spectra-functions.R +++ b/tests/testthat/test_Spectra-functions.R @@ -690,16 +690,6 @@ test_that(".estimate_precursor_intensity works", { expect_true(all(is.na(res))) }) -test_that("estimatePrecursorIntensity works", { - fls <- msdata::proteomics(full.names = TRUE)[c(5, 3)] - second <- Spectra(fls[2], backend = MsBackendMzR()) - both <- Spectra(fls, backend = MsBackendMzR()) - - res_second <- estimatePrecursorIntensity(second) - res_both <- estimatePrecursorIntensity(both) - expect_equal(res_second, res_both[510:length(res_both)]) -}) - test_that(".chunk_factor works", { res <- .chunk_factor(10, chunkSize = 3) expect_equal(res, as.factor(c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4))) diff --git a/tests/testthat/test_Spectra.R b/tests/testthat/test_Spectra.R index 5169b2fe..e81efdcb 100644 --- a/tests/testthat/test_Spectra.R +++ b/tests/testthat/test_Spectra.R @@ -1928,4 +1928,14 @@ test_that("asDataFrame works", { expect_identical(nrow(df), sum(sapply(peaksData(sp[1:3]), nrow))) expect_identical(ncol(df), 2L) expect_identical(names(df), c("mz", "intensity")) -}) \ No newline at end of file +}) + +test_that("estimatePrecursorIntensity works", { + fls <- msdata::proteomics(full.names = TRUE)[c(5, 3)] + second <- Spectra(fls[2], backend = MsBackendMzR()) + both <- Spectra(fls, backend = MsBackendMzR()) + + res_second <- estimatePrecursorIntensity(second) + res_both <- estimatePrecursorIntensity(both) + expect_equal(res_second, res_both[510:length(res_both)]) +}) From 27565369872bc6db9c2e86ae62571b56c4704de5 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Thu, 1 Aug 2024 13:42:56 +0200 Subject: [PATCH 17/41] ci: remove all custom library installations for macOS --- .github/workflows/check-bioc.yml | 50 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 296f3674..a558612a 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -126,31 +126,31 @@ jobs: if: matrix.config.os == 'macOS-latest' run: | ## Enable installing XML from source if needed - brew install libxml2 - echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV - - ## Required to install magick as noted at - ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 - brew install imagemagick@6 - - ## For textshaping, required by ragg, and required by pkgdown - brew install harfbuzz fribidi - - brew install libgit2 - ## Helps compile RCurl from source - ## brew uninstall curl - - ## required for ncdf4 - can not use the homebrew one because that uses GCC - ## Use pre-compiled libraries from https://mac.r-project.org/libs-4/ - curl -O https://mac.r-project.org/libs-4/netcdf-4.7.4-darwin.17-x86_64.tar.gz - tar fvxzm netcdf-4.7.4-darwin.17-x86_64.tar.gz -C / - rm netcdf-4.7.4-darwin.17-x86_64.tar.gz - curl -O https://mac.r-project.org/libs-4/hdf5-1.12.0-darwin.17-x86_64.tar.gz - tar fvxzm hdf5-1.12.0-darwin.17-x86_64.tar.gz -C / - 
rm hdf5-1.12.0-darwin.17-x86_64.tar.gz - curl -O https://mac.r-project.org/libs-4/szip-2.1.1-darwin.17-x86_64.tar.gz - tar fvxzm szip-2.1.1-darwin.17-x86_64.tar.gz -C / - rm szip-2.1.1-darwin.17-x86_64.tar.gz + # brew install libxml2 + # echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV + + # ## Required to install magick as noted at + # ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 + # brew install imagemagick@6 + + # ## For textshaping, required by ragg, and required by pkgdown + # brew install harfbuzz fribidi + + # brew install libgit2 + # ## Helps compile RCurl from source + # ## brew uninstall curl + + # ## required for ncdf4 - can not use the homebrew one because that uses GCC + # ## Use pre-compiled libraries from https://mac.r-project.org/libs-4/ + # curl -O https://mac.r-project.org/libs-4/netcdf-4.7.4-darwin.17-x86_64.tar.gz + # tar fvxzm netcdf-4.7.4-darwin.17-x86_64.tar.gz -C / + # rm netcdf-4.7.4-darwin.17-x86_64.tar.gz + # curl -O https://mac.r-project.org/libs-4/hdf5-1.12.0-darwin.17-x86_64.tar.gz + # tar fvxzm hdf5-1.12.0-darwin.17-x86_64.tar.gz -C / + # rm hdf5-1.12.0-darwin.17-x86_64.tar.gz + # curl -O https://mac.r-project.org/libs-4/szip-2.1.1-darwin.17-x86_64.tar.gz + # tar fvxzm szip-2.1.1-darwin.17-x86_64.tar.gz -C / + # rm szip-2.1.1-darwin.17-x86_64.tar.gz - name: Install Windows system dependencies if: runner.os == 'Windows' From 36006b55a1c656c713c1140f47ebf620b7193d5e Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Thu, 1 Aug 2024 14:04:48 +0200 Subject: [PATCH 18/41] ci: remove comments --- .github/workflows/check-bioc.yml | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index a558612a..b0c1e8df 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -125,32 +125,7 @@ jobs: - name: Install macOS system dependencies if: matrix.config.os == 'macOS-latest' run: | - ## Enable installing XML from source if needed - # brew install libxml2 - # echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV - - # ## Required to install magick as noted at - # ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 - # brew install imagemagick@6 - - # ## For textshaping, required by ragg, and required by pkgdown - # brew install harfbuzz fribidi - - # brew install libgit2 - # ## Helps compile RCurl from source - # ## brew uninstall curl - - # ## required for ncdf4 - can not use the homebrew one because that uses GCC - # ## Use pre-compiled libraries from https://mac.r-project.org/libs-4/ - # curl -O https://mac.r-project.org/libs-4/netcdf-4.7.4-darwin.17-x86_64.tar.gz - # tar fvxzm netcdf-4.7.4-darwin.17-x86_64.tar.gz -C / - # rm netcdf-4.7.4-darwin.17-x86_64.tar.gz - # curl -O https://mac.r-project.org/libs-4/hdf5-1.12.0-darwin.17-x86_64.tar.gz - # tar fvxzm hdf5-1.12.0-darwin.17-x86_64.tar.gz -C / - # rm hdf5-1.12.0-darwin.17-x86_64.tar.gz - # curl -O https://mac.r-project.org/libs-4/szip-2.1.1-darwin.17-x86_64.tar.gz - # tar fvxzm szip-2.1.1-darwin.17-x86_64.tar.gz -C / - # rm szip-2.1.1-darwin.17-x86_64.tar.gz + shell: Rscript {0} - name: Install Windows system dependencies if: runner.os == 'Windows' From fe145b84e1b749a0bbd6c000183d018f8f4cbac4 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 11 Sep 2024 07:02:22 +0200 Subject: [PATCH 
19/41] docs: start reorganizing the documentation --- R/Spectra.R | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/R/Spectra.R b/R/Spectra.R index 76a467c5..eaed13ac 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -3,20 +3,22 @@ NULL #' @title The Spectra class to manage and access MS data #' -#' @aliases Spectra-class [,Spectra-method -#' @aliases uniqueMsLevels uniqueMsLevels,Spectra-method -#' @aliases combinePeaks +#' @aliases Spectra-class #' #' @name Spectra #' #' @description #' -#' The `Spectra` class encapsules spectral mass spectrometry data and -#' related metadata. +#' The `Spectra` class encapsules spectral mass spectrometry (MS) data and +#' related metadata. The MS data is represented by a *backend* extending the +#' virual [MsBackend] class which provides the data to the `Spectra` object. +#' The `Spectra` class implements only data accessor, filtering and analysis +#' methods for the MS data and relies on its *backend* to provide the MS data. +#' This allows to change data representations of a `Spectra` object depending +#' on the user's needs and properties of the data. Different backends and +#' their properties are explained in the [MsBackend] documentation. #' -#' It supports multiple data backends, e.g. in-memory ([MsBackendMemory], -#' [MsBackendDataFrame()]), on-disk as mzML ([MsBackendMzR()]) or HDF5 -#' ([MsBackendHdf5Peaks()]). +#' LLLLL link to other documentations. #' #' @details #' @@ -54,6 +56,28 @@ NULL #' the `acquisitionNum`) #' #' See also [this issue](https://github.com/lgatto/MSnbase/issues/525). + +#' @title Accessing Mass Spectrometry Data +#' +#' + +#' @title Merging, Splitting and Aggregating Spectra +#' +#' @aliases [,Spectra-method + +#' @title Filtering and Subsetting Spectra Objects +#' + +#' @title Data Manipulation and Analysis Methods +#' + +#' @title Spectra Similarity Calculations + +#' @title The Spectra class to manage and access MS data +#' +#' @aliases uniqueMsLevels uniqueMsLevels,Spectra-method +#' @aliases combinePeaks +#' #' #' @section Creation of objects, conversion, changing the backend and export: #' From 90c30ed25be94195b694bbc398b8c77770e9f8fb Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Thu, 12 Sep 2024 13:38:01 +0200 Subject: [PATCH 20/41] docs: start refactoring the documentation --- R/Spectra.R | 1616 ++++++--------------------------------------------- 1 file changed, 186 insertions(+), 1430 deletions(-) diff --git a/R/Spectra.R b/R/Spectra.R index eaed13ac..a81b606f 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -1,11 +1,15 @@ #' @include hidden_aliases.R NULL +################################################################################ +## +## Spectra class, creation, data representation, export +## +################################################################################ + #' @title The Spectra class to manage and access MS data #' -#' @aliases Spectra-class -#' -#' @name Spectra +#' @name Spectra-class #' #' @description #' @@ -18,75 +22,24 @@ NULL #' on the user's needs and properties of the data. Different backends and #' their properties are explained in the [MsBackend] documentation. #' -#' LLLLL link to other documentations. -#' -#' @details -#' -#' The `Spectra` class uses by default a lazy data manipulation strategy, -#' i.e. data manipulations such as performed with `replaceIntensitiesBelow()` -#' are not applied immediately to the data, but applied on-the-fly to the -#' spectrum data once it is retrieved. 
For some backends that allow to write -#' data back to the data storage (such as the [MsBackendMemory()], -#' [MsBackendDataFrame()] and [MsBackendHdf5Peaks()]) it is possible to apply -#' to queue with the `applyProcessing` function. See the *Data manipulation and -#' analysis *methods* section below for more details. -#' -#' For more information on parallel or chunk-wise processing (especially -#' helpful for very large data sets) see [processingChunkSize()]. -#' -#' To apply arbitrary functions to a `Spectra` use the `spectrapply()` function -#' (or directly [chunkapply()] for chunk-wise processing). See description of -#' the `spectrapply()` function below for details. -#' -#' For details on plotting spectra, see [plotSpectra()]. -#' -#' Clarifications regarding scan/acquisition numbers and indices: -#' -#' - A `spectrumId` (or `spectrumID`) is a vendor specific field in -#' the mzML file that contains some information about the -#' run/spectrum, e.g.: `controllerType=0 controllerNumber=1 -#' scan=5281 file=2` +#' Documentation on other topics and functionality of `Spectra`can be found in: #' -#' - `acquisitionNum` is a more a less sanitize spectrum id generated -#' from the `spectrumId` field by `mzR` (see -#' [here](https://github.com/sneumann/mzR/blob/master/src/pwiz/data/msdata/MSData.cpp#L552-L580)). -#' -#' - `scanIndex` is the `mzR` generated sequence number of the -#' spectrum in the raw file (which doesn't have to be the same as -#' the `acquisitionNum`) -#' -#' See also [this issue](https://github.com/lgatto/MSnbase/issues/525). - -#' @title Accessing Mass Spectrometry Data -#' -#' - -#' @title Merging, Splitting and Aggregating Spectra -#' -#' @aliases [,Spectra-method - -#' @title Filtering and Subsetting Spectra Objects -#' - -#' @title Data Manipulation and Analysis Methods -#' - -#' @title Spectra Similarity Calculations - -#' @title The Spectra class to manage and access MS data -#' -#' @aliases uniqueMsLevels uniqueMsLevels,Spectra-method -#' @aliases combinePeaks +#' LLLLLLL add links to individual documentations. +#' - [processingChunkSize()] for information on parallel and chunk-wise data +#' processing. +#' - [plotSpectra()] for visualization of `Spectra`. #' #' -#' @section Creation of objects, conversion, changing the backend and export: +#' @section Creation of objects: #' #' `Spectra` classes can be created with the `Spectra()` constructor function #' which supports the following formats: #' #' - parameter `object` is a `data.frame` or `DataFrame` containing the -#' spectrum data. The provided `backend` (by default a -#' [MsBackendMemory-class]) will be initialized with that data. +#' full spectrum data (spectra variables in columns as well as columns +#' with the individual MS peak data, *m/z* and intensity). The provided +#' `backend` (by default a [MsBackendMemory-class]) will be initialized +#' with that data. #' #' - parameter `object` is a [MsBackend-class] (assumed to be already #' initialized). @@ -103,45 +56,80 @@ NULL #' #' With `...` additional arguments can be passed to the backend's #' [backendInitialize()] method. Parameter `backend` allows to specify which -#' [MsBackend-class] should be used for data storage. +#' [MsBackend-class] should be used for data representation and storage. +#' +#' +#' @section Data representation of a `Spectra`: +#' +#' The MS data which can be accessed through the `Spectra` object is +#' *represented* by its backend, which means that this backend defines how +#' and where the data is stored (e.g. in memory or on disk). 
The `Spectra`
+#' object relies on the backend to provide the MS data whenever it needs it
+#' for data processing.
+#' Different backends with different properties, such as minimal memory
+#' requirement or fast data access, are defined in the *Spectra* package or
+#' one of the MsBackend* packages. More information on backends and their
+#' properties is provided in the documentation of [MsBackend].
+#'
+#' On-disk backends keep only a limited amount of data in memory, retrieving
+#' most of the data (usually the MS peak data) on-the-fly from their on-disk
+#' data representations upon request. Moving the on-disk data storage of such
+#' a backend or a serialized object to a different location in the file
+#' system will cause data corruption. The `dataStorageBasePath()` and
+#' `dataStorageBasePath<-` functions allow, in such cases (and if the backend
+#' class supports the operation), to get or change the *base*
+#' path to the directory of the backend's data storage. In-memory backends
+#' such as [MsBackendMemory] or [MsBackendDataFrame], which keep all MS data
+#' in memory, neither support nor need this function, but for [MsBackendMzR]
+#' it can be used to update/adapt the path to the directory containing
+#' the original data files. Thus, for `Spectra` objects (using this backend)
+#' that were moved to another file system or computer, these functions allow
+#' adjusting the base file path accordingly.
+#'
 #'
-#' The backend of a `Spectra` object can be changed with the `setBackend()`
-#' method that takes an instance of the new backend as second parameter
-#' `backend`. A call to `setBackend(sps, backend = MsBackendDataFrame())`
+#' @section Changing data representation of a `Spectra`:
+#'
+#' The data representation, i.e. the backend of a `Spectra` object, can be
+#' changed with the `setBackend()` method that takes an instance of the new
+#' backend as second parameter `backend`. A call to
+#' `setBackend(sps, backend = MsBackendDataFrame())`
 #' would for example change the backend of `sps` to the *in-memory*
 #' `MsBackendDataFrame`. Changing to a backend is only supported if that
 #' backend has a `data` parameter in its `backendInitialize()` method and if
 #' `supportsSetBackend()` returns `TRUE` for that backend. `setBackend()` will
-#' transfer the full spectra data from the originating backend as a
-#' `DataFrame` to the new backend.
-#' Most *read-only* backends do not support `setBackend()`. It is for example
-#' not possible to change the backend to a *read-only* backend (such as
-#' the [MsBackendMzR()] backend).
+#' transfer the full spectra data from the originating backend as a `DataFrame`
+#' to the new backend.
+#'
+#' Generally, it is not possible to change **to** a read-only backend such as
+#' the [MsBackendMzR()] backend.
 #'
 #' The definition of the function is:
 #' `setBackend(object, backend, ..., f = dataStorage(object),
 #' BPPARAM = bpparam())` and its parameters are:
 #'
-#' - parameter `object`: the `Spectra` object.
+#' - `object`: the `Spectra` object.
 #'
-#' - parameter `backend`: an instance of the new backend, e.g.
-#'   `[MsBackendMemory()]`.
+#' - `backend`: an instance of the new backend, e.g. `[MsBackendMemory()]`.
 #'
-#' - parameter `f`: factor allowing to parallelize the change of the backends.
-#'   By default the process of copying the spectra data from the original to the
+#' - `f`: factor allowing to parallelize the change of the backends.
By +#' default the process of copying the spectra data from the original to the #' new backend is performed separately (and in parallel) for each file. Users #' are advised to use the default setting. #' -#' - parameter `...`: optional additional arguments passed to the -#' [backendInitialize()] method of the new `backend`. +#' - `...`: optional additional arguments passed to the [backendInitialize()] +#' method of the new `backend`. #' -#' - parameter `BPPARAM`: setup for the parallel processing. See [bpparam()] for +#' - `BPPARAM`: setup for the parallel processing. See [bpparam()] for #' details. #' +#' +#' @section Exporting data from a `Spectra` object: +#' #' Data from a `Spectra` object can be **exported** to a file with the -#' `export()` function. The actual export of the data has to be performed by +#' `export()` function. The actual export of the data is performed by #' the `export` method of the [MsBackend] class defined with the mandatory -#' parameter `backend`. Note however that not all backend classes support +#' parameter `backend` which defines also the format in which the data +#' is exported. Note however that not all backend classes support #' export of data. From the `MsBackend` classes in the `Spectra` package #' currently only the `MsBackendMzR` backend supports data export (to #' mzML/mzXML file(s)); see the help page of the [MsBackend-class] for @@ -160,998 +148,47 @@ NULL #' - `...`: additional parameters specific for the `MsBackend` passed with #' parameter `backend`. #' -#' The `dataStorageBasePath()` and `dataStorageBasePath<-` functions allow, for -#' backend classes that support this operation, to get or change the *base* -#' path to the directory where the backend stores the data. In-memory backends -#' such as [MsBackendMemory] or [MsBackendDataFrame] keeping all MS data in -#' memory don't support, and need, this function, but for [MsBackendMzR] this -#' function can be used to update/adapt the path to the directory containing -#' the original data files. Thus, for `Spectra` objects (using this backend) -#' that were moved to another file system or computer, these functions allow to -#' adjust/adapt the base file path. #' -#' @section Accessing spectra data: -#' -#' - `$`, `$<-`: gets (or sets) a spectra variable for all spectra in `object`. -#' See examples for details. Note that replacing values of a peaks variable -#' is not supported with a non-empty processing queue, i.e. if any filtering -#' or data manipulations on the peaks data was performed. In these cases -#' [applyProcessing()] needs to be called first to apply all cached data -#' operations. -#' -#' - `[[`, `[[<-`: access or set/add a single spectrum variable (column) in the -#' backend. -#' -#' - `acquisitionNum()`: returns the acquisition number of each -#' spectrum. Returns an `integer` of length equal to the number of -#' spectra (with `NA_integer_` if not available). -#' -#' - `centroided()`, `centroided<-`: gets or sets the centroiding -#' information of the spectra. `centroided()` returns a `logical` -#' vector of length equal to the number of spectra with `TRUE` if a -#' spectrum is centroided, `FALSE` if it is in profile mode and `NA` -#' if it is undefined. See also `isCentroided()` for estimating from -#' the spectrum data whether the spectrum is centroided. `value` -#' for `centroided<-` is either a single `logical` or a `logical` of -#' length equal to the number of spectra in `object`. 
-#' -#' - `collisionEnergy()`, `collisionEnergy<-`: gets or sets the -#' collision energy for all spectra in `object`. `collisionEnergy()` -#' returns a `numeric` with length equal to the number of spectra -#' (`NA_real_` if not present/defined), `collisionEnergy<-` takes a -#' `numeric` of length equal to the number of spectra in `object`. -#' -#' - `coreSpectraVariables()`: returns the *core* spectra variables along with -#' their expected data type. -#' -#' - `dataOrigin()`, `dataOrigin<-`: gets or sets the *data origin* for each -#' spectrum. `dataOrigin()` returns a `character` vector (same length than -#' `object`) with the origin of the spectra. `dataOrigin<-` expects a -#' `character` vector (same length than `object`) with the replacement -#' values for the data origin of each spectrum. -#' -#' - `dataStorage()`: returns a `character` vector (same length than `object`) -#' with the data storage location of each spectrum. -#' -#' - `intensity()`: gets the intensity values from the spectra. Returns -#' a [NumericList()] of `numeric` vectors (intensity values for each -#' spectrum). The length of the list is equal to the number of -#' `spectra` in `object`. -#' -#' - `ionCount()`: returns a `numeric` with the sum of intensities for -#' each spectrum. If the spectrum is empty (see `isEmpty()`), -#' `NA_real_` is returned. -#' -#' - `isCentroided()`: a heuristic approach assessing if the spectra in -#' `object` are in profile or centroided mode. The function takes -#' the `qtl`th quantile top peaks, then calculates the difference -#' between adjacent m/z value and returns `TRUE` if the first -#' quartile is greater than `k`. (See `Spectra:::.isCentroided()` for -#' the code.) -#' -#' - `isEmpty()`: checks whether a spectrum in `object` is empty -#' (i.e. does not contain any peaks). Returns a `logical` vector of -#' length equal number of spectra. -#' -#' - `isolationWindowLowerMz()`, `isolationWindowLowerMz<-`: gets or sets the -#' lower m/z boundary of the isolation window. -#' -#' - `isolationWindowTargetMz()`, `isolationWindowTargetMz<-`: gets or sets the -#' target m/z of the isolation window. -#' -#' - `isolationWindowUpperMz()`, `isolationWindowUpperMz<-`: gets or sets the -#' upper m/z boundary of the isolation window. -#' -#' - `containsMz()`: checks for each of the spectra whether they contain mass -#' peaks with an m/z equal to `mz` (given acceptable difference as defined by -#' parameters `tolerance` and `ppm` - see [common()] for details). Parameter -#' `which` allows to define whether any (`which = "any"`, the default) or -#' all (`which = "all"`) of the `mz` have to match. The function returns -#' `NA` if `mz` is of length 0 or is `NA`. -#' -#' - `containsNeutralLoss()`: checks for each spectrum in `object` if it has a -#' peak with an m/z value equal to its precursor m/z - `neutralLoss` (given -#' acceptable difference as defined by parameters `tolerance` and `ppm`). -#' Returns `NA` for MS1 spectra (or spectra without a precursor m/z). -#' -#' - `length()`: gets the number of spectra in the object. -#' -#' - `lengths()`: gets the number of peaks (m/z-intensity values) per -#' spectrum. Returns an `integer` vector (length equal to the -#' number of spectra). For empty spectra, `0` is returned. -#' -#' - `msLevel()`: gets the spectra's MS level. Returns an integer vector (names -#' being spectrum names, length equal to the number of spectra) with the MS -#' level for each spectrum. -#' -#' - `mz()`: gets the mass-to-charge ratios (m/z) from the -#' spectra. 
Returns a [NumericList()] or length equal to the number of -#' spectra, each element a `numeric` vector with the m/z values of -#' one spectrum. -#' -#' - `peaksData()`: gets the *peaks* data for all spectra in `object`. Peaks -#' data consist of the m/z and intensity values as well as possible additional -#' annotations (variables) of all peaks of each spectrum. The function -#' returns a [SimpleList()] of two dimensional arrays (either `matrix` or -#' `data.frame`), with each array providing the values for the requested -#' *peak variables* (by default `"mz"` and `"intensity"`). Optional parameter -#' `columns` is passed to the backend's `peaksData()` function to allow -#' the selection of specific (or additional) peaks variables (columns) that -#' should be extracted (if available). Importantly, -#' it is **not** guaranteed that each backend supports this parameter (while -#' each backend must support extraction of `"mz"` and `"intensity"` columns). -#' Parameter `columns` defaults to `c("mz", "intensity")` but any value -#' returned by `peaksVariables(object)` is supported. -#' Note also that it is possible to extract the peak data with -#' `as(x, "list")` and `as(x, "SimpleList")` as a `list` and `SimpleList`, -#' respectively. Note however that, in contrast to `peaksData()`, `as()` -#' does not support the parameter `columns`. -#' -#' - `peaksVariables()`: lists the available variables for mass peaks provided -#' by the backend. Default peak variables are `"mz"` and `"intensity"` (which -#' all backends need to support and provide), but some backends might provide -#' additional variables. -#' These variables correspond to the column names of the peak data array -#' returned by `peaksData()`. -#' -#' - `polarity()`, `polarity<-`: gets or sets the polarity for each -#' spectrum. `polarity()` returns an `integer` vector (length equal -#' to the number of spectra), with `0` and `1` representing negative -#' and positive polarities, respectively. `polarity<-` expects an -#' `integer` vector of length 1 or equal to the number of spectra. -#' -#' - `precursorCharge()`, `precursorIntensity()`, `precursorMz()`, -#' `precScanNum()`, `precAcquisitionNum()`: gets the charge (`integer`), -#' intensity (`numeric`), m/z (`numeric`), scan index (`integer`) -#' and acquisition number (`interger`) of the precursor for MS level > -#' 2 spectra from the object. Returns a vector of length equal to -#' the number of spectra in `object`. `NA` are reported for MS1 -#' spectra of if no precursor information is available. -#' -#' - `rtime()`, `rtime<-`: gets or sets the retention times (in seconds) -#' for each spectrum. `rtime()` returns a `numeric` vector (length -#' equal to the number of spectra) with the retention time for each -#' spectrum. `rtime<-` expects a numeric vector with length equal -#' to the number of spectra. -#' -#' - `scanIndex()`: returns an `integer` vector with the *scan index* -#' for each spectrum. This represents the relative index of the -#' spectrum within each file. Note that this can be different to the -#' `acquisitionNum` of the spectrum which represents the index of the -#' spectrum during acquisition/measurement (as reported in the mzML file). -#' -#' - `smoothed()`,`smoothed<-`: gets or sets whether a spectrum is -#' *smoothed*. `smoothed()` returns a `logical` vector of length equal -#' to the number of spectra. `smoothed<-` takes a `logical` vector -#' of length 1 or equal to the number of spectra in `object`. 
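+#' A brief sketch of the accessors listed above (assuming `sps` is a
+#' `Spectra` object):
+#'
+#' ```
+#' msLevel(sps)          ## MS level of each spectrum
+#' rtime(sps)            ## retention times in seconds
+#' mz(sps)[[1L]]         ## m/z values of the first spectrum
+#' intensity(sps)[[1L]]  ## intensity values of the first spectrum
+#' peaksData(sps)[[1L]]  ## peaks array (m/z, intensity) of the first spectrum
+#' ```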
-#' -#' - `spectraData()`: gets general spectrum metadata (annotation, also called -#' header). `spectraData()` returns a `DataFrame`. Note that this -#' method does by default **not** return m/z or intensity values. -#' -#' - `spectraData<-`: **replaces** the full spectra data of the `Spectra` -#' object with the one provided with `value`. The `spectraData<-` function -#' expects a `DataFrame` to be passed as value with the same number of rows -#' as there a spectra in `object`. Note that replacing values of -#' peaks variables is not supported with a non-empty processing queue, i.e. -#' if any filtering or data manipulations on the peaks data was performed. -#' In these cases [applyProcessing()] needs to be called first to apply all -#' cached data operations and empty the processing queue. -#' -#' - `spectraNames()`, `spectraNames<-`: gets or sets the spectra names. -#' -#' - `spectraVariables()`: returns a `character` vector with the -#' available spectra variables (columns, fields or attributes of each -#' spectrum) available in `object`. Note that `spectraVariables()` does not -#' list the *peak variables* (`"mz"`, `"intensity"` and eventual additional -#' annotations for each MS peak). Peak variables are returned by -#' `peaksVariables()`. -#' -#' - `tic()`: gets the total ion current/count (sum of signal of a -#' spectrum) for all spectra in `object`. By default, the value -#' reported in the original raw data file is returned. For an empty -#' spectrum, `0` is returned. -#' -#' - `uniqueMsLevels()`: get the unique MS levels available in `object`. This -#' function is supposed to be more efficient than `unique(msLevel(object))`. -#' -#' @section Filter spectra data: -#' -#' Filter a `Spectra` object based on the spectra data. This includes subset -#' operations that immediately reduce the number of spectra in the object as -#' well as filters that reduce the *content* of the `Spectra` object. -#' See section *Filter peaks data* below for functions that filter the peaks -#' data of a `Spectra`. -#' -#' - `[`: subsets the spectra keeping only selected elements (`i`). The method -#' **always** returns a `Spectra` object. -#' -#' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the -#' object's `spectraData` that contain only missing values (`NA`). Note that -#' while columns with only `NA`s are removed, a `spectraData()` call after -#' `dropNaSpectraVariables()` might still show columns containing `NA` values -#' for *core* spectra variables. The total number of spectra is not changed -#' by this function. -#' -#' - `filterAcquisitionNum()`: filters the object keeping only spectra matching -#' the provided acquisition numbers (argument `n`). If `dataOrigin` or -#' `dataStorage` is also provided, `object` is subsetted to the spectra with -#' an acquisition number equal to `n` **in spectra with matching dataOrigin -#' or dataStorage values** retaining all other spectra. -#' Returns the filtered `Spectra`. -#' -#' - `filterDataOrigin()`: filters the object retaining spectra matching the -#' provided `dataOrigin`. Parameter `dataOrigin` has to be of type -#' `character` and needs to match exactly the data origin value of the -#' spectra to subset. -#' Returns the filtered `Spectra` object (with spectra ordered according to -#' the provided `dataOrigin` parameter). -#' -#' - `filterDataStorage()`: filters the object retaining spectra stored in the -#' specified `dataStorage`. 
Parameter `dataStorage` has to be of type -#' `character` and needs to match exactly the data storage value of the -#' spectra to subset. -#' Returns the filtered `Spectra` object (with spectra ordered according to -#' the provided `dataStorage` parameter). -#' -#' - `filterEmptySpectra()`: removes empty spectra (i.e. spectra without peaks). -#' Returns the filtered `Spectra` object (with spectra in their -#' original order). -#' -#' - `filterIsolationWindow()`: retains spectra that contain `mz` in their -#' isolation window m/z range (i.e. with an `isolationWindowLowerMz` <= `mz` -#' and `isolationWindowUpperMz` >= `mz`. Returns the filtered `Spectra` -#' object (with spectra in their original order). -#' -#' - `filterMsLevel()`: filters object by MS level keeping only spectra matching -#' the MS level specified with argument `msLevel`. Returns the filtered -#' `Spectra` (with spectra in their original order). -#' -#' - `filterPolarity()`: filters the object keeping only spectra matching the -#' provided polarity. Returns the filtered `Spectra` (with spectra in their -#' original order). -#' -#' - `filterPrecursorCharge()`: retains spectra with the defined precursor -#' charge(s). -#' -#' - `filterPrecursorIsotopes()`: groups MS2 spectra based on their precursor -#' m/z and precursor intensity into predicted isotope groups and keep for each -#' only the spectrum representing the monoisotopic precursor. MS1 spectra -#' are returned as is. See documentation for `deisotopeSpectra()` below for -#' details on isotope prediction and parameter description. -#' -#' - `filterPrecursorMaxIntensity()`: filters the `Spectra` keeping for groups -#' of (MS2) spectra with similar precursor m/z values (given parameters -#' `ppm` and `tolerance`) the one with the highest precursor intensity. The -#' function filters only MS2 spectra and returns all MS1 spectra. If -#' precursor intensities are `NA` for all spectra within a spectra group, the -#' first spectrum of that groups is returned. -#' Note: some manufacturers don't provide precursor intensities. These can -#' however also be estimated with [estimatePrecursorIntensity()]. -#' -#' - `filterPrecursorMzRange()` (previously `filterPrecursorMz()` which is now -#' deprecated): retains spectra with a precursor m/z within the -#' provided m/z range. See examples for details on selecting spectra with -#' a precursor m/z for a target m/z accepting a small difference in *ppm*. -#' -#' - `filterPrecursorMzValues()`: retains spectra with precursor m/z matching -#' any of the provided m/z values (given `ppm` and `tolerance`). Spectra with -#' missing precursor m/z value (e.g. MS1 spectra) are dropped. -#' -#' - `filterPrecursorScan()`: retains parent (e.g. MS1) and children scans (e.g. -#' MS2) of acquisition number `acquisitionNum`. Returns the filtered -#' `Spectra` (with spectra in their original order). Parameter `f` allows to -#' define which spectra belong to the same sample or original data file ( -#' defaults to `f = dataOrigin(object)`). -#' -#' - `filterRanges()`: allows filtering of the `Spectra` object based on user -#' defined *numeric* ranges (parameter `ranges`) for one or more available -#' spectra variables in object (spectra variable names can be specified with -#' parameter `spectraVariables`). Spectra for which the value of a spectra -#' variable is within it's defined range are retained. 
If multiple -#' ranges/spectra variables are defined, the `match` parameter can be used -#' to specify whether all conditions (`match = "all"`; the default) or if -#' any of the conditions must match (`match = "any"`; all spectra for which -#' values are within any of the provided ranges are retained). -#' -#' - `filterRt()`: retains spectra of MS level `msLevel` with retention -#' times (in seconds) within (`>=`) `rt[1]` and (`<=`) -#' `rt[2]`. Returns the filtered `Spectra` (with spectra in their -#' original order). -#' -#' - `filterValues()`: allows filtering of the `Spectra` object based on -#' similarities of *numeric* values of one or more `spectraVariables(object)` -#' (parameter `spectraVariables`) to provided values (parameter `values`) -#' given acceptable differences (parameters tolerance and ppm). If multiple -#' values/spectra variables are defined, the `match` parameter can be used -#' to specify whether all conditions (`match = "all"`; the default) or if -#' any of the conditions must match (`match = "any"`; all spectra for which -#' values are within any of the provided ranges are retained). -#' -#' - `selectSpectraVariables()`: reduces the information within the object to -#' the selected spectra variables: all data for variables not specified will -#' be dropped. For mandatory columns (i.e., those listed by -#' [coreSpectraVariables()], such as *msLevel*, *rtime* ...) only -#' the values will be dropped but not the variable itself. Additional (or -#' user defined) spectra variables will be completely removed. -#' Returns the filtered `Spectra`. -#' -#' -#' @section Filter or aggregate mass peak data: -#' -#' Operations that filter or aggregate the mass peak data from each spectrum -#' without changing the number of spectra in a `Spectra` object. Also, the -#' actual subsetting/aggregation operation is only executed once peaks data is -#' accessed (through `peaksData()`, `mz()` or `intensity()`) or -#' `applyProcessing()` is called. -#' -#' - `combinePeaks()`: combines mass peaks **within each spectrum** with a -#' difference in their m/z values that is smaller than the maximal -#' acceptable difference defined by `ppm` and `tolerance`. Parameters -#' `intensityFun` and `mzFun` allow to define functions to aggregate the -#' intensity and m/z values for each such group of peaks. With -#' `weighted = TRUE` (the default), the m/z value of the combined peak is -#' calculated using an intensity-weighted mean and parameter `mzFun` is -#' ignored. The [MsCoreUtils::group()] function is used for the grouping of -#' mass peaks. Parameter `msLevel.` allows to define selected MS levels for -#' which peaks should be combined. This function returns a `Spectra` with -#' the same number of spectra than the input object, but with possibly -#' combined peaks within each spectrum. -# Additional peak variables (other than `"mz"` and `"intensity"`) are -#' dropped (i.e. their values are replaced with `NA`) for combined peaks -#' unless they are constant across the combined peaks. See also -#' `reduceSpectra()` for a function to select a single *representative* -#' mass peak for each peak group. -#' -#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the -#' monoisotopic peak for groups of isotopologues. Isotopologues are -#' estimated using the [isotopologues()] function from the -#' *MetaboCoreUtils* package. 
Note that -#' the default parameters for isotope prediction/detection have been -#' determined using data from the Human Metabolome Database (HMDB) and -#' isotopes for elements other than CHNOPS might not be detected. See -#' parameter `substDefinition` in the documentation of [isotopologues()] for -#' more information. The approach and code to define the parameters for -#' isotope prediction is described -#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). -#' -#' - `filterFourierTransformArtefacts()`: removes (Orbitrap) fast fourier -#' artefact peaks from spectra (see examples below). The function iterates -#' through all intensity ordered peaks in a spectrum and removes all peaks -#' with an m/z within +/- `halfWindowSize` of the current peak if their -#' intensity is lower than `threshold` times the current peak's intensity. -#' Additional parameters `keepIsotopes`, `maxCharge` and `isotopeTolerance` -#' allow to avoid removing of potential `[13]C` isotope peaks (`maxCharge` -#' being the maximum charge that should be considered and `isotopeTolerance` -#' the absolute acceptable tolerance for matching their m/z). -#' See [filterFourierTransformArtefacts()] for details and background and -#' `deisitopeSpectra()` for an alternative. -#' -#' - `filterIntensity()`: filters mass peaks in each spectrum keeping only -#' those with intensities that are within the provided range or match the -#' criteria of the provided function. For the former, parameter `intensity` -#' has to be a `numeric` defining the intensity range, for the latter a -#' `function` that takes the intensity values of the spectrum and returns -#' a `logical` whether the peak should be retained or not (see examples -#' below for details) - additional parameters to the function can be passed -#' with `...`. -#' To remove only peaks with intensities below a certain threshold, say -#' 100, use `intensity = c(100, Inf)`. Note: also a single value can be -#' passed with the `intensity` parameter in which case an upper limit of -#' `Inf` is used. -#' Note that this function removes also peaks with missing intensities -#' (i.e. an intensity of `NA`). Parameter `msLevel.` allows to restrict the -#' filtering to spectra of the specified MS level(s). -#' -#' - `filterMzRange()`: filters mass peaks in the object keeping or removing -#' those in each spectrum that are within the provided m/z range. Whether -#' peaks are retained or removed can be configured with parameter `keep` -#' (default `keep = TRUE`). -#' -#' - `filterMzValues()`: filters mass peaks in the object keeping all -#' peaks in each spectrum that match the provided m/z value(s) (for -#' `keep = TRUE`, the default) or removing all of them (for `keep = FALSE`). -#' The m/z matching considers also the absolute `tolerance` and m/z-relative -#' `ppm` values. `tolerance` and `ppm` have to be of length 1. -#' -#' - `filterPeaksRanges()`: filters mass peaks of a `Spectra` object using any -#' set of range-based filters on numeric spectra or peaks variables. See -#' [filterPeaksRanges()] for more information. -#' -#' - `filterPrecursorPeaks()`: removes peaks from each spectrum in `object` with -#' an m/z equal or larger than the m/z of the precursor, depending on the -#' value of parameter `mz`: for `mz = ==" (the default) peaks with matching -#' m/z (considering an absolute and relative acceptable difference depending -#' on `tolerance` and `ppm`, respectively) are removed. 
For `mz = ">="` all -#' peaks with an m/z larger or equal to the precursor m/z (minus `tolerance` -#' and the `ppm` of the precursor m/z) are removed. Parameter `msLevel.` -#' allows to restrict the filter to certain MS levels (by default the filter -#' is applied to all MS levels). Note that no peaks are removed if the -#' precursor m/z is `NA` (e.g. typically for MS1 spectra). -#' -#' - `reduceSpectra()`: keeps for groups of peaks with similar m/z values in -#' (given `ppm` and `tolerance`) in each spectrum only the peak with the -#' highest intensity removing all other peaks hence *reducing* each -#' spectrum to the highest intensity peaks per *peak group*. -#' Peak groups are defined using the [group()] function from the -#' *MsCoreUtils* package. See also the `combinePeaks()` function for an -#' alternative function to combine peaks within each spectrum. -#' -#' -#' @section Merging, aggregating and splitting: -#' -#' Several `Spectra` objects can be concatenated into a single object with the -#' `c()` or the `concatenateSpectra()` function. Concatenation will fail if the -#' processing queue of any of the `Spectra` objects is not empty or if -#' different backends are used in the `Spectra` objects. Thus, in these cases, -#' prior to merging `Spectra` object it is suggested to change the backend to -#' a `MsBackendMemory` using the `setBackend()` function, and to *apply* all -#' data processing steps using `applyProcessing()`. The spectra variables -#' of the resulting `Spectra` object is the union of the spectra variables of -#' the individual `Spectra` objects. -#' -#' - `combineSpectra()`: combines MS data (i.e. mass peaks) from sets of -#' spectra into a single spectrum per set (in contrast to `combinePeaks()` -#' or `reduceSpectra()` that combine mass peaks **within each spectrum**). -#' For each spectrum group (set), spectra variables from the first spectrum -#' are used and the peak matrices are combined using the function specified -#' with `FUN`, which defaults to [combinePeaksData()]. Please refer to the -#' [combinePeaksData()] help page for details and options of the actual -#' combination of peaks across the sets of spectra and to the package -#' vignette for examples and alternative ways to aggregate spectra. -#' The sets of spectra can be specified with parameter `f`. -#' In addition it is possible to define, with parameter `p` if and how to -#' split the input data for parallel processing. -#' This defaults to `p = x$dataStorage` and hence a per-file parallel -#' processing is applied for `Spectra` with file-based backends (such as the -#' [MsBackendMzR()]). -#' Prior combination of the spectra all processings queued in the lazy -#' evaluation queue are applied. Be aware that calling `combineSpectra()` on a -#' `Spectra` object with certain backends that allow modifications might -#' **overwrite** the original data. This does not happen with a -#' `MsBackendMemory` or `MsBackendDataFrame` backend, but with a -#' `MsBackendHdf5Peaks` backend the m/z and intensity values in the original -#' hdf5 file(s) will be overwritten. -#' The function returns a `Spectra` of length equal to the unique levels -#' of `f`. -#' -#' - `joinSpectraData()`: Individual spectra variables can be directly -#' added with the `$<-` or `[[<-` syntax. The `joinSpectraData()` -#' function allows to merge a `DataFrame` to the existing spectra -#' data. This function diverges from the [merge()] method in two -#' main ways: -#' - The `by.x` and `by.y` column names must be of length 1. 
-#' - If variable names are shared in `x` and `y`, the spectra -#' variables of `x` are not modified. It's only the `y` -#' variables that are appended the suffix defined in -#' `suffix.y`. This is to avoid modifying any core spectra -#' variables that would lead to an invalid object. -#' - Duplicated Spectra keys (i.e. `x[[by.x]]`) are not -#' allowed. Duplicated keys in the `DataFrame` (i.e `y[[by.y]]`) -#' throw a warning and only the last occurrence is kept. These -#' should be explored and ideally be removed using for -#' `QFeatures::reduceDataFrame()`, `PMS::reducePSMs()` or similar -#' functions. -#' - `split()`: splits the `Spectra` object based on parameter `f` into a `list` -#' of `Spectra` objects. -#' -#' -#' @section Data manipulation and analysis methods: -#' -#' Many data manipulation operations, such as those listed in this section, are -#' not applied immediately to the spectra, but added to a -#' *lazy processing/manipulation queue*. Operations stored in this queue are -#' applied on-the-fly to spectra data each time it is accessed. This lazy -#' execution guarantees the same functionality for `Spectra` objects with -#' any backend, i.e. backends supporting to save changes to spectrum data -#' ([MsBackendMemory()], [MsBackendDataFrame()] or [MsBackendHdf5Peaks()]) as -#' well as read-only backends (such as the [MsBackendMzR()]). -#' Note that for the former it is possible to apply the processing queue and -#' write the modified peak data back to the data storage with the -#' `applyProcessing()` function. -#' -#' - `addProcessing()`: adds an arbitrary function that should be applied to the -#' peaks matrix of every spectrum in `object`. The function (can be passed -#' with parameter `FUN`) is expected to take a peaks matrix as input and to -#' return a peaks matrix. A peaks matrix is a numeric matrix with two columns, -#' the first containing the m/z values of the peaks and the second the -#' corresponding intensities. The function has to have `...` in its -#' definition. Additional arguments can be passed with `...`. With parameter -#' `spectraVariables` it is possible to define additional spectra variables -#' from `object` that should be passed to the function `FUN`. These will be -#' passed by their name (e.g. specifying `spectraVariables = "precursorMz"` -#' will pass the spectra's precursor m/z as a parameter named `precursorMz` -#' to the function. The only exception is the spectra's MS level, these will -#' be passed to the function as a parameter called `spectrumMsLevel` (i.e. -#' with `spectraVariables = "msLevel"` the MS levels of each spectrum will be -#' submitted to the function as a parameter called `spectrumMsLevel`). -#' Examples are provided in the package vignette. -#' -#' - `applyProcessing()`: for `Spectra` objects that use a **writeable** backend -#' only: apply all steps from the lazy processing queue to the peak data and -#' write it back to the data storage. Parameter `f` allows to specify how -#' `object` should be split for parallel processing. This should either be -#' equal to the `dataStorage`, or `f = rep(1, length(object))` to disable -#' parallel processing alltogether. Other partitionings might result in -#' errors (especially if a `MsBackendHdf5Peaks` backend is used). -#' -#' - `bin()`: aggregates individual spectra into discrete (m/z) bins. Binning is -#' performed only on spectra of the specified MS level(s) (parameter -#' `msLevel`, by default all MS levels of `x`). 
The bins can be defined with -#' parameter `breaks` which by default are equally sized bins, with size -#' being defined by parameter `binSize`, from the minimal to the maximal m/z -#' of all spectra (of MS level `msLevel`) within `x`. The same bins are used -#' for all spectra in `x`. All intensity values for peaks falling into the -#' same bin are aggregated using the function provided with parameter `FUN` -#' (defaults to `FUN = sum`, i.e. all intensities are summed up). Note that -#' the binning operation is applied to the peak data on-the-fly upon data -#' access and it is possible to *revert* the operation with the `reset()` -#' function (see description of `reset()` above). -#' -#' - `compareSpectra()`: compares each spectrum in `x` with each spectrum in `y` -#' using the function provided with `FUN` (defaults to [ndotproduct()]). If -#' `y` is missing, each spectrum in `x` is compared with each other spectrum -#' in `x`. -#' The matching/mapping of peaks between the compared spectra is done with the -#' `MAPFUN` function. The default [joinPeaks()] matches peaks of both spectra -#' and allows to keep all peaks from the first spectrum (`type = "left"`), -#' from the second (`type = "right"`), from both (`type = "outer"`) and to -#' keep only matching peaks (`type = "inner"`); see [joinPeaks()] for more -#' information and examples). The `MAPFUN` function should have parameters -#' `x`, `y`, `xPrecursorMz` and `yPrecursorMz` as these values are passed to -#' the function. In addition to `joinPeaks()` also [joinPeaksGnps()] is -#' supported for GNPS-like similarity score calculations. Note that -#' `joinPeaksGnps()` should only be used in combination with -#' `FUN = MsCoreUtils::gnps` (see [joinPeaksGnps()] for more information and -#' details). Use `MAPFUN = joinPeaksNone` to disable internal peak -#' matching/mapping if a similarity scoring function is used that performs -#' the matching internally. -#' `FUN` is supposed to be a function to compare intensities of (matched) -#' peaks of the two spectra that are compared. The function needs to take two -#' matrices with columns `"mz"` and `"intensity"` as input and is supposed -#' to return a single numeric as result. In addition to the two peak matrices -#' the spectra's precursor m/z values are passed to the function as parameters -#' `xPrecursorMz` (precursor m/z of the `x` peak matrix) and `yPrecursorMz` -#' (precursor m/z of the `y` peak matrix). Additional parameters to functions -#' `FUN` and `MAPFUN` can be passed with `...`. Parameters `ppm` and -#' `tolerance` are passed to both `MAPFUN` and `FUN`. -#' The function returns a `matrix` with the results of `FUN` for each -#' comparison, number of rows equal to `length(x)` and number of columns -#' equal `length(y)` (i.e. element in row 2 and column 3 is the result from -#' the comparison of `x[2]` with `y[3]`). If `SIMPLIFY = TRUE` the `matrix` -#' is *simplified* to a `numeric` if length of `x` or `y` is one. See also -#' the vignette for additional examples, such as using spectral entropy -#' similarity in the scoring. -#' -#' - `entropy()`: calculates the entropy of each spectra based on the metrics -#' suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). -#' See also [nentropy()] in the *MsCoreUtils* package for details. -#' -#' - `estimatePrecursorIntensity()`: defines the precursor intensities for MS2 -#' spectra using the intensity of the matching MS1 peak from the -#' closest MS1 spectrum (i.e. 
the last MS1 spectrum measured before the -#' respective MS2 spectrum). With `method = "interpolation"` it is also -#' possible to calculate the precursor intensity based on an interpolation of -#' intensity values (and retention times) of the matching MS1 peaks from the -#' previous and next MS1 spectrum. See [estimatePrecursorIntensity()] for -#' examples and more details. -#' -#' - `estimatePrecursorMz()`: **for DDA data**: allows to estimate a fragment -#' spectra's precursor m/z based on the reported precursor m/z and the data -#' from the previous MS1 spectrum. See [estimatePrecursorMz()] for details. -#' -#' - `neutralLoss()`: calculates neutral loss spectra for fragment spectra. See -#' [neutralLoss()] for detailed documentation. -#' -#' - `processingLog()`: returns a `character` vector with the processing log -#' messages. -#' -#' - `reset()`: restores the data to its original state (as much as possible): -#' removes any processing steps from the lazy processing queue and calls -#' `reset()` on the backend which, depending on the backend, can also undo -#' e.g. data filtering operations. Note that a `reset*(` call after -#' `applyProcessing()` will not have any effect. See examples below for more -#' information. -#' -#' - `scalePeaks()`: scales intensities of peaks within each spectrum depending -#' on parameter `by`. With `by = sum` (the default) peak intensities are -#' divided by the sum of peak intensities within each spectrum. The sum of -#' intensities is thus 1 for each spectrum after scaling. Parameter -#' `msLevel.` allows to apply the scaling of spectra of a certain MS level. -#' By default (`msLevel. = uniqueMsLevels(x)`) intensities for all -#' spectra will be scaled. -#' -#' - `spectrapply()`: applies a given function to each individual spectrum or -#' sets of a `Spectra` object. By default, the `Spectra` is split into -#' individual spectra (i.e. `Spectra` of length 1) and the function `FUN` -#' is applied to each of them. An alternative splitting can be defined with -#' parameter `f`. Parameters for `FUN` can be passed using `...`. -#' The returned result and its order depend on the function `FUN` and how -#' `object` is split (hence on `f`, if provided). Parallel processing is -#' supported and can be configured with parameter `BPPARAM`, is however only -#' suggested for computational intense `FUN`. -#' As an alternative to the (eventual parallel) processing of the full -#' `Spectra`, `spectrapply()` supports also a chunk-wise processing. For this, -#' parameter `chunkSize` needs to be specified. `object` is then split into -#' chunks of size `chunkSize` which are then (stepwise) processed by `FUN`. -#' This guarantees a lower memory demand (especially for on-disk backends) -#' since only the data for one chunk needs to be loaded into memory in each -#' iteration. Note that by specifying `chunkSize`, parameters `f` and -#' `BPPARAM` will be ignored. -#' See also [chunkapply()] or examples below for details on chunk-wise -#' processing. +#' @details #' -#' - `smooth()`: smooths individual spectra using a moving window-based approach -#' (window size = `2 * halfWindowSize`). Currently, the -#' Moving-Average- (`method = "MovingAverage"`), -#' Weighted-Moving-Average- (`method = "WeightedMovingAverage")`, -#' weights depending on the distance of the center and calculated -#' `1/2^(-halfWindowSize:halfWindowSize)`) and -#' Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported. 
-#' For details how to choose the correct `halfWindowSize` please see
-#' [`MsCoreUtils::smooth()`].
-#'
-#' - `pickPeaks()`: picks peaks on individual spectra using a moving
-#' window-based approach (window size = `2 * halfWindowSize`). For noisy
-#' spectra there are currently two different noise estimators available,
-#' the *M*edian *A*bsolute *D*eviation (`method = "MAD"`) and
-#' Friedman's Super Smoother (`method = "SuperSmoother"`),
-#' as implemented in the [`MsCoreUtils::noise()`].
-#' The method supports also to optionally *refine* the m/z value of
-#' the identified centroids by considering data points that belong (most
-#' likely) to the same mass peak. Therefore the m/z value is calculated as an
-#' intensity weighted average of the m/z values within the peak region.
-#' The peak region is defined as the m/z values (and their respective
-#' intensities) of the `2 * k` closest signals to the centroid or the closest
-#' valleys (`descending = TRUE`) in the `2 * k` region. For the latter the `k`
-#' has to be chosen general larger. See [`MsCoreUtils::refineCentroids()`] for
-#' details.
-#' If the ratio of the signal to the highest intensity of the peak is below
-#' `threshold` it will be ignored for the weighted average.
-#'
-#' - `replaceIntensitiesBelow()`: replaces intensities below a specified
-#' threshold with the provided `value`. Parameter `threshold` can be either
-#' a single numeric value or a function which is applied to all non-`NA`
-#' intensities of each spectrum to determine a threshold value for each
-#' spectrum. The default is `threshold = min` which replaces all values
-#' which are <= the minimum intensity in a spectrum with `value` (the
-#' default for `value` is `0`). Note that the function specified with
-#' `threshold` is expected to have a parameter `na.rm` since `na.rm = TRUE`
-#' will be passed to the function. If the spectrum is in profile mode,
-#' ranges of successive non-0 peaks <= `threshold` are set to 0.
-#' Parameter `msLevel.` allows to apply this to only spectra of certain MS
-#' level(s).
-#'
-#'
-#' @return See individual method description for the return value.
-#'
-#' @param acquisitionNum for `filterPrecursorScan()`: `integer` with the
-#' acquisition number of the spectra to which the object should be
-#' subsetted.
-#'
-#' @param backend For `Spectra()`: [MsBackend-class] to be used as backend. See
-#' section on creation of `Spectra` objects for details. For `setBackend()`:
-#' instance of [MsBackend-class] that supports `setBackend()` (i.e. for
-#' which `supportsSetBackend()` returns `TRUE`). Such backends have a
-#' parameter `data` in their `backendInitialize()` function that support
-#' passing the full spectra data to the initialize method. See section on
-#' creation of `Spectra` objects for details.
-#' For `export()`: [MsBackend-class] to be used to export the data.
-#'
-#' @param binSize For `bin()`: `numeric(1)` defining the size for the m/z bins.
-#' Defaults to `binSize = 1`.
+#' The `Spectra` class uses by default a lazy data manipulation strategy,
+#' i.e. data manipulations such as performed with `replaceIntensitiesBelow()`
+#' are not applied immediately to the data, but applied on-the-fly to the
+#' spectrum data once it is retrieved. For some backends that allow to write
+#' data back to the data storage (such as the [MsBackendMemory()],
+#' [MsBackendDataFrame()] and [MsBackendHdf5Peaks()]) it is possible to apply
+#' the processing queue with the `applyProcessing()` function.
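+#'
+#' A minimal sketch (assuming `sps` is a `Spectra` object with a writeable,
+#' in-memory backend such as `MsBackendMemory`):
+#'
+#' ```
+#' sps <- filterIntensity(sps, intensity = c(100, Inf)) ## queued, not yet applied
+#' sps <- applyProcessing(sps)  ## apply the queue and write the peak data back
+#' ```
+#'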
See the *Data manipulation and +#' analysis *methods* section below for more details. #' -#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more -#' information. This is passed directly to the [backendInitialize()] method -#' of the [MsBackend-class]. +#' Clarifications regarding scan/acquisition numbers and indices: #' -#' @param breaks For `bin()`: `numeric` defining the m/z breakpoints between -#' bins. -#' -#' @param by For `scalePeaks()`: function to calculate a single `numeric` from -#' intensity values of a spectrum by which all intensities (of -#' that spectrum) should be divided by. The default `by = sum` will -#' divide intensities of each spectrum by the sum of intensities of that -#' spectrum. -#' -#' @param by.x A `character(1)` specifying the spectra variable used -#' for merging. Default is `"spectrumId"`. -#' -#' @param by.y A `character(1)` specifying the column used for -#' merging. Set to `by.x` if missing. -#' -#' @param charge For `deisotopeSpectra()`: expected charge of the ionized -#' compounds. See [isotopologues()] for details. -#' -#' @param chunkSize For `spectrapply()`: size of the chunks into which `Spectra` -#' should be split. This parameter overrides parameters `f` and `BPPARAM`. -#' -#' @param columns For `spectraData()` accessor: optional `character` with -#' column names (spectra variables) that should be included in the -#' returned `DataFrame`. By default, all columns are returned. -#' For `peaksData()` accessor: optional `character` with requested columns -#' in the individual `matrix` of the returned `list`. Defaults to -#' `c("mz", "value")` but any values returned by `peaksVariables(object)` -#' with `object` being the `Spectra` object are supported. -#' -#' @param match For `filterRanges()` and `filterValues()`: `character(1) ` -#' defining whether the condition has to match for all provided -#' `ranges`/`values` (`match = "all"`; the default), or for any of them -#' (`match = "any"`) for spectra to be retained. -#' -#' @param dataOrigin For `filterDataOrigin()`: `character` to define which -#' spectra to keep. -#' For `filterAcquisitionNum()`: optionally specify if filtering should -#' occurr only for spectra of selected `dataOrigin`. -#' -#' @param dataStorage For `filterDataStorage()`: `character` to define which -#' spectra to keep. -#' For `filterAcquisitionNum()`: optionally specify if filtering should -#' occur only for spectra of selected `dataStorage`. -#' -#' @param descending For `pickPeaks()`: `logical`, if `TRUE` just values between -#' the nearest valleys around the peak centroids are used. -# -#' @param drop For `[`, `split()`: not considered. -#' -#' @param f For `split()`: factor defining how to split `x`. See [base::split()] -#' for details. For `setBackend()`: factor defining how to split the data -#' for parallelized copying of the spectra data to the new backend. For some -#' backends changing this parameter can lead to errors. -#' For `combineSpectra()`: `factor` defining the grouping of the spectra -#' that should be combined. For `spectrapply()`: `factor` how `object` -#' should be splitted. For `filterPrecursorScan()`: defining which spectra -#' belong to the same original data file (sample): Defaults to -#' `f = dataOrigin(x)`. -#' For `intensity()`, `mz()` and `peaksData()`: factor defining how data -#' should be chunk-wise loaded an processed. Defaults to -#' [processingChunkFactor()]. -#' -#' @param FUN For `addProcessing()`: function to be applied to the peak matrix -#' of each spectrum in `object`. 
For `compareSpectra()`: function to compare -#' intensities of peaks between two spectra with each other. -#' For `combineSpectra()`: function to combine the (peak matrices) of the -#' spectra. See section *Data manipulations* and examples below for more -#' details. -#' For `bin()`: function to aggregate intensity values of peaks falling -#' into the same bin. Defaults to `FUN = sum` thus summing up intensities. -#' For `spectrapply()` and `chunkapply()`: function to be applied to -#' `Spectra`. -#' -#' @param halfWindowSize -#' - For `pickPeaks()`: `integer(1)`, used in the -#' identification of the mass peaks: a local maximum has to be the maximum -#' in the window from `(i - halfWindowSize):(i + halfWindowSize)`. -#' - For `smooth()`: `integer(1)`, used in the smoothing algorithm, the -#' window reaches from `(i - halfWindowSize):(i + halfWindowSize)`. -#' - For `filterFourierTransformArtefacts()`: `numeric(1)` defining the m/z -#' window left and right of a peak where to remove fourier transform -#' artefacts. -#' -#' @param i For `[`: `integer`, `logical` or `character` to subset the -#' object. For `asDataFrame()` an `numeric` indicating which scans to coerce -#' to a `DataFrame` (default is `seq_along(object)`). -#' -#' @param j For `[`: not supported. -#' -#' @param initial For `tic()`: `logical(1)` whether the initially -#' reported total ion current should be reported, or whether the -#' total ion current should be (re)calculated on the actual data -#' (`initial = FALSE`, same as `ionCount()`). -#' -#' @param intensity For `filterIntensity()`: `numeric` of length 1 or 2 -#' defining either the lower or the lower and upper intensity limit for the -#' filtering, or a `function` that takes the intensities as input and -#' returns a `logical` (same length then peaks in the spectrum) whether the -#' peak should be retained or not. Defaults to `intensity = c(0, Inf)` thus -#' only peaks with `NA` intensity are removed. -#' -#' @param intensityFun For `combinePeaks()`: function to be used to aggregate -#' intensities for all peaks in each peak group into a single intensity -#' value. -#' -#' @param isotopeTolerance For `filterFourierTransformArtefacts()`: the m/z -#' `tolerance` to be used to define whether peaks might be isotopes of -#' the current tested peak. -#' -#' @param k For `pickPeaks()`: `integer(1)`, number of values left and right of -#' the peak that should be considered in the weighted mean calculation. -#' -#' @param keep For `filterMzValues()` and `filterMzRange()`: `logical(1)` -#' whether the matching peaks should be retained (`keep = TRUE`, the -#' default) or dropped (`keep = FALSE`). -#' -#' @param keepIsotopes For `filterFourierTransformArtefacts()`: whether isotope -#' peaks should not be removed as fourier artefacts. -#' -#' @param maxCharge For `filterFourierTransformArtefacts()`: the maximum charge -#' to be considered for isotopes. -#' -#' @param MAPFUN For `compareSpectra()`: function to map/match peaks between the -#' two compared spectra. See [joinPeaks()] for more information and possible -#' functions. -#' -#' @param method -#' - For `pickPeaks()`: `character(1)`, the noise estimators that -#' should be used, currently the the *M*edian *A*bsolute *D*eviation -#' (`method = "MAD"`) and Friedman's Super Smoother -#' (`method = "SuperSmoother"`) are supported. 
-#' - For `smooth()`: `character(1)`, the smoothing function that should be -#' used, currently, the Moving-Average- (`method = "MovingAverage"`), -#' Weighted-Moving-Average- (`method = "WeightedMovingAverage")`, -#' Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported. -#' -#' @param metadata For `Spectra()`: optional `list` with metadata information. -#' -#' @param msLevel. `integer` defining the MS level(s) of the spectra to which -#' the function should be applied (defaults to all MS levels of `object`. -#' For `filterMsLevel()`: the MS level to which `object` should be -#' subsetted. -#' -#' @param mz For `filterIsolationWindow()`: `numeric(1)` with the m/z value to -#' filter the object. For `filterPrecursorMz()` and `filterMzRange()`: -#' `numeric(2)` defining the lower and upper m/z boundary. -#' For `filterMzValues()` and `filterPrecursorMzValues()`: `numeric` with -#' the m/z values to match peaks or precursor m/z against. -#' -#' @param mzFun For `combinePeaks()`: function to aggregate m/z values for all -#' peaks within each peak group into a single m/z value. This parameter -#' is ignored if `weighted = TRUE` (the default). -#' -#' @param n for `filterAcquisitionNum()`: `integer` with the acquisition -#' numbers to filter for. -#' -#' @param name For `$` and `$<-`: the name of the spectra variable to return -#' or set. -#' -#' @param neutralLoss for `containsNeutralLoss()`: `numeric(1)` defining the -#' value which should be subtracted from the spectrum's precursor m/z. -#' -#' @param normalized for `entropy()`: `logical(1)` whether the normalized -#' entropy should be calculated (default). See also [nentropy()] for -#' details. -#' -#' @param object For `Spectra()`: either a `DataFrame` or `missing`. See -#' section on creation of `Spectra` objects for details. For all other -#' methods a `Spectra` object. -#' -#' @param p For `combineSpectra()`: `factor` defining how to split the input -#' `Spectra` for parallel processing. Defaults to `x$dataStorage`, i.e., -#' depending on the used backend, per-file parallel processing will be -#' performed. -#' -#' @param polarity for `filterPolarity()`: `integer` specifying the polarity to -#' to subset `object`. -#' -#' @param ppm For `compareSpectra()`, `containsMz()`, `deisotopeSpectra()`, -#' `filterMzValues()` and `reduceSpectra()`: `numeric(1)` -#' defining a relative, m/z-dependent, maximal accepted difference between -#' m/z values for peaks to be matched (or grouped). -#' For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the relative -#' maximal accepted difference of precursor m/z values of spectra for -#' grouping them into *precursor groups*. For `filterPrecursorIsotopes()`: -#' passed directly to the [isotopologues()] function. -#' For `filterValues()`: `numeric` of any length allowing to define -#' a maximal accepted difference between user input `values` and the -#' `spectraVariables` values. If it is not equal to the length of the -#' value provided with parameter `spectraVariables`, `ppm[1]` will be -#' recycled. -#' -#' @param processingQueue For `Spectra()`: optional `list` of -#' [ProcessingStep-class] objects. -#' -#' @param ranges for `filterRanges()`: A `numeric` vector of paired values -#' (upper and lower boundary) that define the ranges to filter the `object`. -#' These paired values need to be in the same order as the -#' `spectraVariables` parameter (see below). -#' -#' @param rt for `filterRt()`: `numeric(2)` defining the retention time range to -#' be used to subset/filter `object`. 
-#' -#' @param SIMPLIFY For `compareSpectra()` whether the result matrix should be -#' *simplified* to a `numeric` if possible (i.e. if either `x` or `y` is -#' of length 1). -#' -#' @param snr For `pickPeaks()`: `double(1)` defining the -#' *S*ignal-to-*N*oise-*R*atio. The intensity of a local maximum has to be -#' higher than `snr * noise` to be considered as peak. -#' -#' @param source For `Spectra()`: instance of [MsBackend-class] that can be used -#' to import spectrum data from the provided files. See section *Creation -#' of objects, conversion and changing the backend* for more details. -#' -#' @param spectraVariables -#' - For `selectSpectraVariables()`: `character` with the -#' names of the spectra variables to which the backend should be -#' subsetted. -#' - For `addProcessing()`: `character` with additional spectra variables -#' that should be passed along to the function defined with `FUN`. See -#' function description for details. -#' - For `filterRanges()` and `filterValues()`: `character` vector -#' specifying the column(s) from `spectraData(object)` on which to filter -#' the data and that correspond to the the names of the spectra variables -#' that should be used for the filtering. -#' -#' @param substDefinition For `deisotopeSpectra()` and -#' `filterPrecursorIsotopes()`: `matrix` or `data.frame` with definitions -#' of isotopic substitutions. Uses by default isotopic substitutions -#' defined from all compounds in the Human Metabolome Database (HMDB). See -#' [isotopologues()] or [isotopicSubstitutionMatrix()] for details. -#' -#' @param suffix.y A `character(1)` specifying the suffix to be used -#' for making the names of columns in the merged spectra variables -#' unique. This suffix will be used to amend `names(y)`, while -#' `spectraVariables(x)` will remain unchanged. -#' -#' @param tolerance For `compareSpectra()`, `containsMz()`, -#' `deisotopeSpectra()`, `filterMzValues()` and `reduceSpectra()`: -#' `numeric(1)` allowing to define a constant maximal accepted difference -#' between m/z values for peaks to be matched (or grouped). For -#' `containsMz()` it can also be of length equal `mz` to specify a different -#' tolerance for each m/z value. -#' For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the -#' (constant) maximal accepted difference of precursor m/z values of -#' spectra for grouping them into *precursor groups*. For -#' `filterPrecursorIsotopes()`: passed directly to the [isotopologues()] -#' function. For `filterValues()`: `numeric` of any length allowing to -#' define a maximal accepted difference between user input `values` and the -#' `spectraVariables` values. If it is not equal to the length of the -#' value provided with parameter `spectraVariables`, `tolerance[1]` will be -#' recycled. Default is `tolerance = 0` -#' -#' @param threshold -#' - For `pickPeaks()`: a `double(1)` defining the proportion of the maximal -#' peak intensity. Just values above are used for the weighted mean -#' calculation. -#' - For `replaceIntensitiesBelow()`: a `numeric(1)` defining the threshold -#' or a `function` to calculate the threshold for each spectrum on its -#' intensity values. Defaults to `threshold = min`. -#' - For `filterFourierTransformArtefacts()`: the relative intensity (to a -#' peak) below which peaks are considered fourier artefacts. Defaults to -#' `threshold = 0.2` hence removing peaks that have an intensity below 0.2 -#' times the intensity of the tested peak (within the selected -#' `halfWindowSize`). 
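+#' A short sketch combining the `pickPeaks()` and `replaceIntensitiesBelow()`
+#' parameters described above (assuming `prf` is a `Spectra` with
+#' profile-mode spectra; the parameter values are arbitrary):
+#'
+#' ```
+#' ctr <- pickPeaks(prf, halfWindowSize = 2L, snr = 1)  ## centroid the spectra
+#' ctr <- replaceIntensitiesBelow(ctr, threshold = 100, value = 0)
+#' ```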
-#' -#' @param use.names For `lengths()`: ignored. -#' -#' @param value replacement value for `<-` methods. See individual -#' method description or expected data type. -#' -#' @param values for `filterValues()`: A `numeric` vector that define the -#' values to filter the Spectra data. These values need to be in the same -#' order as the `spectraVariables` parameter. -#' -#' @param weighted For `combinePeaks()`: `logical(1)` whether m/z values of -#' peaks within each peak group should be aggregated into a single m/z -#' value using an intensity-weighted mean. Defaults to `weighted = TRUE`. -#' -#' @param which for `containsMz()`: either `"any"` or `"all"` defining whether -#' any (the default) or all provided `mz` have to be present in the -#' spectrum. -#' -#' @param x A `Spectra` object. -#' -#' @param y A `Spectra` object. A `DataFrame` for `joinSpectraData()`. -#' -#' @param z For `filterPrecursorCharge()`: `integer()` with the precursor -#' charges to be used as filter. -#' -#' @param zero.rm `logical`. For `bin()`: indicating whether to remove bins -#' with zero intensity. Defaults to `TRUE`, meaning the function will -#' discard bins created with an intensity of 0 to enhance memory efficiency. -#' -#' @param ... Additional arguments. +#' - A `spectrumId` (or `spectrumID`) is a vendor specific field in +#' the mzML file that contains some information about the +#' run/spectrum, e.g.: `controllerType=0 controllerNumber=1 +#' scan=5281 file=2` #' -#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail +#' - `acquisitionNum` is a more a less sanitize spectrum id generated +#' from the `spectrumId` field by `mzR` (see +#' [here](https://github.com/sneumann/mzR/blob/master/src/pwiz/data/msdata/MSData.cpp#L552-L580)). +#' +#' - `scanIndex` is the `mzR` generated sequence number of the +#' spectrum in the raw file (which doesn't have to be the same as +#' the `acquisitionNum`) +#' +#' See also [this issue](https://github.com/lgatto/MSnbase/issues/525). #' #' @md #' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail +#' #' @exportClass Spectra #' #' @exportMethod Spectra #' #' @examples #' +#' ## ---- CREATION OF SPECTRA OBJECTS ---- +#' #' ## Create a Spectra providing a `DataFrame` containing the spectrum data. #' #' spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) @@ -1161,12 +198,6 @@ NULL #' data <- Spectra(spd) #' data #' -#' ## Get the number of spectra -#' length(data) -#' -#' ## Get the number of peaks per spectrum -#' lengths(data) -#' #' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk #' ## backend. #' sciex_file <- dir(system.file("sciex", package = "msdata"), @@ -1174,6 +205,9 @@ NULL #' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) #' sciex #' +#' +#' ## ---- CHANGING DATA REPRESENTATIONS ---- +#' #' ## The MS data is on disk and will be read into memory on-demand. We can #' ## however change the backend to a MsBackendMemory backend which will #' ## keep all of the data in memory. @@ -1209,310 +243,6 @@ NULL #' head(dataOrigin(sciex_im)) #' #' -#' ## ---- ACCESSING AND ADDING DATA ---- -#' -#' ## Get the MS level for each spectrum. -#' msLevel(data) -#' -#' ## Alternatively, we could also use $ to access a specific spectra variable. -#' ## This could also be used to add additional spectra variables to the -#' ## object (see further below). -#' data$msLevel -#' -#' ## Get the intensity and m/z values. 
-#' intensity(data) -#' mz(data) -#' -#' ## Determine whether one of the spectra has a specific m/z value -#' containsMz(data, mz = 120.4) -#' -#' ## Accessing spectra variables works for all backends: -#' intensity(sciex) -#' intensity(sciex_im) -#' -#' ## Get the m/z for the first spectrum. -#' mz(data)[[1]] -#' -#' ## Get the peak data (m/z and intensity values). -#' pks <- peaksData(data) -#' pks -#' pks[[1]] -#' pks[[2]] -#' -#' ## Note that we could get the same resulb by coercing the `Spectra` to -#' ## a `list` or `SimpleList`: -#' as(data, "list") -#' as(data, "SimpleList") -#' -#' ## List all available spectra variables (i.e. spectrum data and metadata). -#' spectraVariables(data) -#' -#' ## For all *core* spectrum variables accessor functions are available. These -#' ## return NA if the variable was not set. -#' centroided(data) -#' dataStorage(data) -#' rtime(data) -#' precursorMz(data) -#' -#' ## The core spectra variables are: -#' coreSpectraVariables() -#' -#' ## Add an additional metadata column. -#' data$spectrum_id <- c("sp_1", "sp_2") -#' -#' ## List spectra variables, "spectrum_id" is now also listed -#' spectraVariables(data) -#' -#' ## Get the values for the new spectra variable -#' data$spectrum_id -#' -#' ## Extract specific spectra variables. -#' spectraData(data, columns = c("spectrum_id", "msLevel")) -#' -#' ## Drop spectra variable data and/or columns. -#' res <- selectSpectraVariables(data, c("mz", "intensity")) -#' -#' ## This removed the additional columns "spectrum_id" and deleted all values -#' ## for all spectra variables, except "mz" and "intensity". -#' spectraData(res) -#' -#' ## Compared to the data before selectSpectraVariables. -#' spectraData(data) -#' -#' -#' ## ---- SUBSETTING, FILTERING AND COMBINING -#' -#' ## Subset to all MS2 spectra. -#' data[msLevel(data) == 2] -#' -#' ## Same with the filterMsLevel function -#' filterMsLevel(data, 2) -#' -#' ## Below we combine the `data` and `sciex_im` objects into a single one. -#' data_comb <- c(data, sciex_im) -#' -#' ## The combined Spectra contains a union of all spectra variables: -#' head(data_comb$spectrum_id) -#' head(data_comb$rtime) -#' head(data_comb$dataStorage) -#' head(data_comb$dataOrigin) -#' -#' ## Filter a Spectra for a target precursor m/z with a tolerance of 10ppm -#' spd$precursorMz <- c(323.4, 543.2302) -#' data_filt <- Spectra(spd) -#' filterPrecursorMzRange(data_filt, mz = 543.23 + ppm(c(-543.23, 543.23), 10)) -#' -#' ## Filter a Spectra keeping only peaks matching certain m/z values -#' sps_sub <- filterMzValues(data, mz = c(103, 104), tolerance = 0.3) -#' mz(sps_sub) -#' -#' ## This function can also be used to remove specific peaks from a spectrum -#' ## by setting `keep = FALSE`. -#' sps_sub <- filterMzValues(data, mz = c(103, 104), -#' tolerance = 0.3, keep = FALSE) -#' mz(sps_sub) -#' -#' ## Note that `filterMzValues()` keeps or removes all peaks with a matching -#' ## m/z given the provided `ppm` and `tolerance` parameters. 
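+#' ## A small additional sketch: spectra filters can be combined, e.g. to
+#' ## restrict to MS1 spectra within a retention time window (the retention
+#' ## time values below are arbitrary).
+#' sub <- filterMsLevel(sciex, 1L)
+#' sub <- filterRt(sub, rt = c(200, 300))
+#' sub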
-#' -#' ## Filter a Spectra keeping only peaks within a m/z range -#' sps_sub <- filterMzRange(data, mz = c(100, 300)) -#' mz(sps_sub) -#' -#' ## Remove empty spectra variables -#' sciex_noNA <- dropNaSpectraVariables(sciex) -#' -#' ## Available spectra variables before and after `dropNaSpectraVariables()` -#' spectraVariables(sciex) -#' spectraVariables(sciex_noNA) -#' -#' -#' ## Adding new spectra variables -#' sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) -#' spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging -#' var1 = rnorm(10), -#' var2 = sample(letters, 10)) -#' spv -#' -#' sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") -#' -#' spectraVariables(sciex2) -#' spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] -#' -#' ## Removing fourier transform artefacts seen in Orbitra data. -#' -#' ## Loading an Orbitrap spectrum with artefacts. -#' data(fft_spectrum) -#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) -#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) -#' -#' fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) -#' fft_spectrum -#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) -#' -#' ## Using a few examples peaks in your data you can optimize the parameters -#' fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, -#' halfWindowSize = 0.2, -#' threshold = 0.005, -#' keepIsotopes = TRUE, -#' maxCharge = 5, -#' isotopeTolerance = 0.005 -#' ) -#' -#' fft_spectrum_filtered -#' length(mz(fft_spectrum_filtered)[[1]]) -#' plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) -#' -#' ## Using filterRanges to filter spectra object based on variables available -#' ## in `spectraData`. -#' ## First, determine the variable(s) on which to base the filtering: -#' sv <- c("rtime", "precursorMz", "peaksCount") -#' ## Note that ANY variables can be chosen here, and as many as wanted. -#' -#' ## Define the ranges (pairs of values with lower and upper boundary) to be -#' ## used for the individual spectra variables. The first two values will be -#' ## used for the first spectra variable (e.g., rtime here), the next two for -#' ## the second (e.g. precursorMz here) and so on: -#' ranges <- c(30, 350, 200,500, 350, 600) -#' -#' ## Input the parameters within the filterRanges function: -#' filt_spectra <- filterRanges(sciex, spectraVariables = sv, -#' ranges = ranges) -#' -#' ## Using `filterRanges()` to filter spectra object with multiple ranges for -#' ## the same `spectraVariable` (e.g, here rtime) -#' sv <- c("rtime", "rtime") -#' ranges <- c(30, 100, 200, 300) -#' filt_spectra <- filterRanges(sciex, spectraVariables = sv, -#' ranges = ranges, match = "any") -#' -#' ## Using filterValues in a similar way to a filter spectra object based on -#' ## variables available in `spectraData`. However, this time not based on -#' ## ranges but similarities to user input single values with given -#' ## tolerance/ppm -#' ## First determine the variable(s) on which to base the filtering: -#' sv <- c("rtime", "precursorMz") -#' ## Note that ANY variables can be chosen here, and as many as wanted. -#' -#' ## Define the values that will be used to filter the spectra based on their -#' ## similarities to their respective spectraVariables. -#' ## The first values in the parameters values, tolerance and ppm will be -#' ## used for the first spectra variable (e.g. rtime here), the next for the -#' ## second (e.g. 
precursorMz here) and so on: -#' values <- c(350, 400) -#' tolerance <- c(100, 0) -#' ppm <- c(0,50) -#' -#' ## Input the parameters within the `filterValues()` function: -#' filt_spectra <- filterValues(sciex, spectraVariables = sv, -#' values = values, tolerance = tolerance, ppm = ppm) -#' -#' ## ---- DATA MANIPULATIONS AND OTHER OPERATIONS ---- -#' -#' ## Set the data to be centroided -#' centroided(data) <- TRUE -#' -#' ## Replace peak intensities below 40 with 3. -#' res <- replaceIntensitiesBelow(data, threshold = 40, value = 3) -#' res -#' -#' ## Get the intensities of the first and second spectrum. -#' intensity(res)[[1]] -#' intensity(res)[[2]] -#' -#' ## Remove all peaks with an intensity below 40. -#' res <- filterIntensity(res, intensity = c(40, Inf)) -#' -#' ## Get the intensities of the first and second spectrum. -#' intensity(res)[[1]] -#' intensity(res)[[2]] -#' -#' ## Lengths of spectra is now different -#' lengths(mz(res)) -#' lengths(mz(data)) -#' -#' ## In addition it is possible to pass a function to `filterIntensity()`: in -#' ## the example below we want to keep only peaks that have an intensity which -#' ## is larger than one third of the maximal peak intensity in that spectrum. -#' keep_peaks <- function(x, prop = 3) { -#' x > max(x, na.rm = TRUE) / prop -#' } -#' res2 <- filterIntensity(data, intensity = keep_peaks) -#' intensity(res2)[[1L]] -#' intensity(data)[[1L]] -#' -#' ## We can also change the proportion by simply passing the `prop` parameter -#' ## to the function. To keep only peaks that have an intensity which is -#' ## larger than half of the maximum intensity: -#' res2 <- filterIntensity(data, intensity = keep_peaks, prop = 2) -#' intensity(res2)[[1L]] -#' intensity(data)[[1L]] -#' -#' ## Since data manipulation operations are by default not directly applied to -#' ## the data but only added to the internal lazy evaluation queue, it is also -#' ## possible to remove these data manipulations with the `reset()` function: -#' res_rest <- reset(res) -#' res_rest -#' lengths(mz(res_rest)) -#' lengths(mz(res)) -#' lengths(mz(data)) -#' -#' ## `reset()` after a `applyProcessing()` can not restore the data, because -#' ## the data in the backend was changed. Similarly, `reset()` after any -#' ## filter operations can not restore data for a `Spectra` with a -#' ## `MsBackendMemory` or `MsBackendDataFrame`. -#' res_2 <- applyProcessing(res) -#' res_rest <- reset(res_2) -#' lengths(mz(res)) -#' lengths(mz(res_rest)) -#' -#' -#' ## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using -#' ## the normalized dotproduct method. -#' res <- compareSpectra(sciex_im[2:3], sciex_im[10:20]) -#' ## first row contains comparisons of spectrum 2 with spectra 10 to 20 and -#' ## the second row comparisons of spectrum 3 with spectra 10 to 20 -#' res -#' -#' ## To use a simple Pearson correlation instead we can define a function -#' ## that takes the two peak matrices and calculates the correlation for -#' ## their second columns (containing the intensity values). -#' correlateSpectra <- function(x, y, use = "pairwise.complete.obs", ...) { -#' cor(x[, 2], y[, 2], use = use) -#' } -#' res <- compareSpectra(sciex_im[2:3], sciex_im[10:20], -#' FUN = correlateSpectra) -#' res -#' -#' ## Use compareSpectra to determine the number of common (matching) peaks -#' ## with a ppm of 10: -#' ## type = "inner" uses a *inner join* to match peaks, i.e. keeps only -#' ## peaks that can be mapped betwen both spectra. 
The provided FUN returns -#' ## simply the number of matching peaks. -#' compareSpectra(sciex_im[2:3], sciex_im[10:20], ppm = 10, type = "inner", -#' FUN = function(x, y, ...) nrow(x)) -#' -#' ## Apply an arbitrary function to each spectrum in a Spectra. -#' ## In the example below we calculate the mean intensity for each spectrum -#' ## in a subset of the sciex_im data. Note that we can access all variables -#' ## of each individual spectrum either with the `$` operator or the -#' ## corresponding method. -#' res <- spectrapply(sciex_im[1:20], FUN = function(x) mean(x$intensity[[1]])) -#' head(res) -#' -#' ## It is however important to note that dedicated methods to access the -#' ## data (such as `intensity`) are much more efficient than using `lapply()`: -#' res <- lapply(intensity(sciex_im[1:20]), mean) -#' head(res) -#' -#' ## As an alternative, applying a function `FUN` to a `Spectra` can be -#' ## performed *chunk-wise*. The advantage of this is, that only the data for -#' ## one chunk at a time needs to be loaded into memory reducing the memory -#' ## demand. This type of processing can be performed by specifying the size -#' ## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` -#' ## parameter -#' spectrapply(sciex_im[1:20], lengths, chunkSize = 5L) -#' #' ## ---- DATA EXPORT ---- #' #' ## Some `MsBackend` classes provide an `export()` method to export the data @@ -1542,54 +272,28 @@ NULL #' #' mz(res) #' mz(data) -#' -#' ## ---- PEAKS VARIABLES AND DATA ---- -#' -#' ## Some `MsBackend` classes provide support for arbitrary peaks variables -#' ## (in addition to the mandatory `"mz"` and `"intensity"` values. Below -#' ## we create a simple data frame with an additional peak variable `"pk_ann"` -#' ## and create a `Spectra` with a `MsBackendMemory` for that data. -#' ## Importantly the number of values (per spectrum) need to be the same -#' ## for all peak variables. -#' -#' tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5)) -#' tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1)) -#' tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45)) -#' tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P")) -#' -#' ## Create the Spectra. With parameter `peaksVariables` we can define -#' ## the columns in `tmp` that contain peaks variables. -#' sps <- Spectra(tmp, source = MsBackendMemory(), -#' peaksVariables = c("mz", "intensity", "pk_ann")) -#' peaksVariables(sps) -#' -#' ## Extract just the m/z and intensity values -#' peaksData(sps)[[1L]] -#' -#' ## Extract the full peaks data -#' peaksData(sps, columns = peaksVariables(sps))[[1L]] -#' -#' ## Access just the pk_ann variable -#' sps$pk_ann -NULL #' The Spectra class #' #' The [Spectra-class] encapsulates data and meta-data for mass #' spectrometry experiments. #' -#' #' @slot backend A derivate of [MsBackend-class] holding/controlling the spectra -#' data. +#' data. +#' #' @slot processingQueue `list` of `ProcessingStep` objects. +#' #' @slot processingQueueVariables `character` of spectraVariables that should #' be passed to the processing step function. +#' #' @slot processing A `character` storing logging information. +#' #' @slot metadata A `list` storing experiment metadata. -#' @slot version A `characher(1)` containing the class version. #' -#' @name Spectra-class +#' @slot version A `character(1)` containing the class version. 
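To make the role of the `processingQueue` and `processing` slots concrete, here is a minimal sketch (an editor's illustration, not part of the patch) that queues one operation and then materializes it. It only uses functions described in this file (`filterIntensity()`, `applyProcessing()`); the direct `@` slot access is for demonstration only.

```r
library(S4Vectors)   ## for DataFrame()
library(Spectra)

spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2))
spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8))
s <- Spectra(spd)

## Data manipulations are not applied immediately: they are appended to the
## processingQueue slot as ProcessingStep objects and evaluated on-the-fly.
s <- filterIntensity(s, intensity = c(40, Inf))
length(s@processingQueue)   ## one queued ProcessingStep
intensity(s)                ## filter applied when the data is accessed

## applyProcessing() writes the result to the backend and empties the queue;
## the processing slot keeps a textual log of what was done.
s <- applyProcessing(s)
length(s@processingQueue)
s@processing
```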
+#' #' @docType class +#' #' @author Sebastian Gibb \email{mail@@sebastiangibb.de} #' #' @importClassesFrom S4Vectors DataFrame @@ -1653,7 +357,7 @@ setMethod("show", "Spectra", } }) -#' @rdname Spectra +#' @rdname Spectra-class setMethod("Spectra", "missing", function(object, processingQueue = list(), metadata = list(), ..., backend = MsBackendMemory(), @@ -1662,7 +366,7 @@ setMethod("Spectra", "missing", function(object, processingQueue = list(), backend = backend) }) -#' @rdname Spectra +#' @rdname Spectra-class setMethod("Spectra", "MsBackend", function(object, processingQueue = list(), metadata = list(), ..., BPPARAM = bpparam()) { @@ -1670,7 +374,7 @@ setMethod("Spectra", "MsBackend", function(object, processingQueue = list(), backend = object) }) -#' @rdname Spectra +#' @rdname Spectra-class #' #' @importFrom methods callNextMethod setMethod("Spectra", "character", function(object, processingQueue = list(), @@ -1687,7 +391,7 @@ setMethod("Spectra", "character", function(object, processingQueue = list(), ..., BPPARAM = BPPARAM) }) -#' @rdname Spectra +#' @rdname Spectra-class setMethod("Spectra", "ANY", function(object, processingQueue = list(), metadata = list(), source = MsBackendMemory(), @@ -1702,7 +406,7 @@ setMethod("Spectra", "ANY", function(object, processingQueue = list(), else sp }) -#' @rdname Spectra +#' @rdname Spectra-class #' #' @importMethodsFrom ProtGenerics setBackend #' @@ -1751,6 +455,78 @@ setMethod( object }) +#' @rdname Spectra-class +#' +#' @export +setMethod("export", "Spectra", + function(object, backend, ...) { + if (missing(backend)) + stop("Parameter 'backend' is required.") + export(backend, object, ...) + }) + +#' @rdname Spectra-class +setMethod("dataStorageBasePath", "Spectra", function(object) { + dataStorageBasePath(object@backend) +}) + +#' @rdname Spectra-class +setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { + dataStorageBasePath(object@backend) <- value + object +} +) + +## CONTINUNE HERE: +## - check if some additional methods/functions need to be moved up. + +################################################################################ +## +## Merging, splitting and aggregating Spectra: length of Spectra is changed +## +################################################################################ + +#' @title Merging, splitting and aggregating Spectra +#' +#' @aliases [,Spectra-method + +################################################################################ +## +## Filtering, subsetting Spectra: subsetting Spectra and its data content. +## +################################################################################ + +#' @title Filtering and subsetting Spectra objects +#' + +################################################################################ +## +## Accessing and adding/setting/changing MS data. 
+## +################################################################################ + +#' @title Accessing mass spectrometry data +#' +#' + +################################################################################ +## +## Data manipulation and analysis operations (lazy processing) +## +################################################################################ + +#' @title Data manipulation and analysis methods +#' + +################################################################################ +## +## Spectra similarity calculations +## +################################################################################ + +#' @title Spectra similarity calculations + + #' @rdname Spectra #' #' @importFrom MsCoreUtils vapply1c @@ -1769,15 +545,6 @@ setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) { }) }) -#' @rdname Spectra -#' -#' @export -setMethod("export", "Spectra", - function(object, backend, ...) { - if (missing(backend)) - stop("Parameter 'backend' is required.") - export(backend, object, ...) - }) #### --------------------------------------------------------------------------- ## @@ -2853,17 +1620,6 @@ setMethod("entropy", "ANY", function(object, ...) { MsCoreUtils::entropy(object) }) -#' @rdname Spectra -setMethod("dataStorageBasePath", "Spectra", function(object) { - dataStorageBasePath(object@backend) -}) - -#' @rdname Spectra -setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { - dataStorageBasePath(object@backend) <- value - object -}) - #' @export #' @rdname Spectra #' From 34476af736759c223839370a329c5a53e17786a4 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Thu, 12 Sep 2024 16:38:04 +0200 Subject: [PATCH 21/41] refactor: improve and fix the Spectra constructor method - Refactor `Spectra()` to better support backends that define their own specific parameters in `backendInitialize()`. --- DESCRIPTION | 2 +- NEWS.md | 5 +++ R/MsBackendMzR.R | 4 ++- R/Spectra.R | 38 ++++++++++++++-------- tests/testthat/test_Spectra.R | 61 +++++++++++++++++++++++++++++++---- 5 files changed, 88 insertions(+), 22 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c233fc3c..c547b903 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.7 +Version: 1.15.8 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index 49e8e289..2d6c6193 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # Spectra 1.15 +## Changes in 1.15.8 + +- Refactor the `Spectra()` constructor method: better support for + initialization of backends that define their own specific parameters. 
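A minimal sketch of what the refactored constructor allows, adapted from the unit tests added further below in this patch (it assumes the `msdata` package is installed; not an authoritative API reference):

```r
library(Spectra)

fl <- dir(system.file("sciex", package = "msdata"), full.names = TRUE)

## Files can be passed as the first argument or via the backend-specific
## 'files' parameter defined by backendInitialize() of MsBackendMzR.
s1 <- Spectra(fl, source = MsBackendMzR())
s2 <- Spectra(files = fl, source = MsBackendMzR())

## Calling the constructor without data initializes an empty Spectra with
## the requested backend.
s0 <- Spectra(source = MsBackendDataFrame())
length(s0)
```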
+ ## Changes in 1.15.7 - Change `estimatePrecursorIntensity()` to a method to avoid overrides/clashes diff --git a/R/MsBackendMzR.R b/R/MsBackendMzR.R index 7cadc0d5..69a04987 100644 --- a/R/MsBackendMzR.R +++ b/R/MsBackendMzR.R @@ -43,12 +43,14 @@ setValidity("MsBackendMzR", function(object) { #' @importFrom BiocParallel bpparam setMethod("backendInitialize", "MsBackendMzR", function(object, files, ..., BPPARAM = bpparam()) { - if (missing(files) || !length(files)) + if (missing(files)) stop("Parameter 'files' is mandatory for 'MsBackendMzR'") if (!is.character(files)) stop("Parameter 'files' is expected to be a character vector", " with the files names from where data should be", " imported") + if (!length(files)) + return(object) files <- normalizePath(files, mustWork = FALSE) msg <- .valid_ms_backend_files_exist(files) if (length(msg)) diff --git a/R/Spectra.R b/R/Spectra.R index 76a467c5..9bcc784c 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -1634,8 +1634,10 @@ setMethod("Spectra", "missing", function(object, processingQueue = list(), metadata = list(), ..., backend = MsBackendMemory(), BPPARAM = bpparam()) { - new("Spectra", metadata = metadata, processingQueue = processingQueue, - backend = backend) + if (length(backend)) + new("Spectra", metadata = metadata, processingQueue = processingQueue, + backend = backend) + else callNextMethod() }) #' @rdname Spectra @@ -1654,13 +1656,12 @@ setMethod("Spectra", "character", function(object, processingQueue = list(), source = MsBackendMzR(), backend = source, ..., BPPARAM = bpparam()) { - if (!length(object)) - Spectra(backend, metadata = metadata, - processingQueue = processingQueue) - else - callNextMethod(object = object, processingQueue = processingQueue, - metadata = metadata, source = source, backend = backend, - ..., BPPARAM = BPPARAM) + sp <- .create_spectra(object, processingQueue = processingQueue, + metadata = metadata, backend = source, + ..., BPPARAM = BPPARAM) + if (class(source)[1L] != class(backend)[1L]) + setBackend(sp, backend, ..., BPPARAM = backendBpparam(backend, BPPARAM)) + else sp }) #' @rdname Spectra @@ -1669,15 +1670,26 @@ setMethod("Spectra", "ANY", function(object, processingQueue = list(), source = MsBackendMemory(), backend = source, ..., BPPARAM = bpparam()) { - sp <- new("Spectra", metadata = metadata, processingQueue = processingQueue, - backend = backendInitialize( - source, object, ..., - BPPARAM = backendBpparam(source, BPPARAM))) + sp <- .create_spectra(object, processingQueue = processingQueue, + metadata = metadata, backend = source, + ..., BPPARAM = BPPARAM) if (class(source)[1L] != class(backend)[1L]) setBackend(sp, backend, ..., BPPARAM = backendBpparam(backend, BPPARAM)) else sp }) +.create_spectra <- function(object, processingQueue = list(), metadata = list(), + backend = MsBackendMemory(), ..., + BPPARAM = bpparam()) { + if (missing(object)) + backend <- backendInitialize( + backend, ..., BPPARAM = backendBpparam(backend, BPPARAM)) + else backend <- backendInitialize( + backend, object, ..., BPPARAM = backendBpparam(backend, BPPARAM)) + new("Spectra", metadata = metadata, processingQueue = processingQueue, + backend = backend) +} + #' @rdname Spectra #' #' @importMethodsFrom ProtGenerics setBackend diff --git a/tests/testthat/test_Spectra.R b/tests/testthat/test_Spectra.R index e81efdcb..43638a4d 100644 --- a/tests/testthat/test_Spectra.R +++ b/tests/testthat/test_Spectra.R @@ -13,32 +13,48 @@ test_that("Spectra,ANY works", { df$polarity <- "NEG" expect_error(Spectra(df), "wrong data type: 
polarity") + + res <- Spectra(files = sciex_file, source = MsBackendMzR()) + expect_s4_class(res@backend, "MsBackendMzR") + expect_true(length(res) > 1) }) test_that("Spectra,missing works", { res <- Spectra() expect_true(length(res) == 0) + expect_s4_class(res@backend, "MsBackendMemory") + + res <- Spectra(backend = MsBackendDataFrame()) + expect_true(length(res) == 0) + expect_s4_class(res@backend, "MsBackendDataFrame") + + res <- Spectra(source = MsBackendDataFrame()) + expect_true(length(res) == 0) + expect_s4_class(res@backend, "MsBackendDataFrame") be <- backendInitialize(MsBackendDataFrame(), DataFrame(msLevel = c(1L, 2L), fromFile = 1L)) res <- Spectra(backend = be) + expect_s4_class(res@backend, "MsBackendDataFrame") expect_true(length(res) == 2) expect_identical(msLevel(res), c(1L, 2L)) }) test_that("Spectra,MsBackend works", { - res <- Spectra() - expect_true(length(res) == 0) - - be <- backendInitialize(MsBackendDataFrame(), DataFrame(msLevel = c(1L, 2L), - fromFile = 1L)) + be <- backendInitialize(MsBackendDataFrame(), + DataFrame(msLevel = c(1L, 2L), + fromFile = 1L)) res <- Spectra(be) expect_true(length(res) == 2) expect_identical(msLevel(res), c(1L, 2L)) }) test_that("Spectra,character works", { - res <- Spectra(sciex_file, backend = MsBackendMzR()) + res <- Spectra(sciex_file) + expect_true(is(res@backend, "MsBackendMzR")) + expect_true(length(res) > 0) + + res <- Spectra(sciex_file, source = MsBackendMzR()) expect_true(is(res@backend, "MsBackendMzR")) expect_equal(unique(res@backend$dataStorage), sciex_file) expect_identical(rtime(res), rtime(sciex_mzr)) @@ -51,7 +67,7 @@ test_that("Spectra,character works", { show(res) ## Empty character - res <- Spectra(character(), backend = MsBackendMzR()) + res <- Spectra(character()) expect_s4_class(res, "Spectra") expect_s4_class(res@backend, "MsBackendMzR") expect_true(length(res) == 0) @@ -62,6 +78,37 @@ test_that("Spectra,character works", { expect_true(length(res) == 0) }) +test_that(".create_spectra works, ", { + ## missing object + res <- .create_spectra() + expect_true(length(res) == 0) + expect_s4_class(res@backend, "MsBackendMemory") + expect_error(res <- .create_spectra(backend = MsBackendMzR()), "mandatory") + + ## object being a character, backend a MsBackendMemory -> error + res <- expect_error(.create_spectra(sciex_file), "DataFrame") + ## object being a character, backend a MsBackendMzR + res <- .create_spectra(sciex_file, backend = MsBackendMzR()) + expect_s4_class(res@backend, "MsBackendMzR") + dta <- spectraData(res@backend) + + ## object being a DataFrame, backend a MsBackendDataFrame + res <- .create_spectra(dta, backend = MsBackendDataFrame()) + expect_s4_class(res@backend, "MsBackendDataFrame") + expect_equal(res$msLevel, dta$msLevel) + + ## object missing but providing files + res <- .create_spectra(files = sciex_file, backend = MsBackendMzR()) + expect_s4_class(res@backend, "MsBackendMzR") + expect_equal(res$msLevel, dta$msLevel) + + ## object missing but providing data + res <- .create_spectra(data = dta, backend = MsBackendMemory()) + expect_s4_class(res@backend, "MsBackendMemory") + expect_equal(res$msLevel, dta$msLevel) + +}) + test_that("setBackend,Spectra works", { df <- DataFrame(rtime = as.numeric(1:9), fact = c(2L, 1L, 2L, 1L, 3L, 2L, 3L, 3L, 1L)) From 2f11a8564b47290c46ce36d979df5a60ef38c626 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Fri, 13 Sep 2024 08:11:20 +0200 Subject: [PATCH 22/41] Update Rbuildignore --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore 
b/.Rbuildignore index 22a5d1be..37442c70 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,6 +1,7 @@ ^\.github$ .editorconfig .travis.yml +man/figures* local_data favicon logo.png From 6fe7a59ea728a7202c55a695e074157c2531192d Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Fri, 13 Sep 2024 15:29:35 +0200 Subject: [PATCH 23/41] docs: refactor the data accessor documentation --- DESCRIPTION | 2 +- NEWS.md | 4 + R/Spectra.R | 1156 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 779 insertions(+), 383 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c547b903..a04e4ac3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.8 +Version: 1.15.9 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index 2d6c6193..828350b4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # Spectra 1.15 +## Changes in 1.15.9 + +- Restructure and reorganize documentation for `Spectra`. + ## Changes in 1.15.8 - Refactor the `Spectra()` constructor method: better support for diff --git a/R/Spectra.R b/R/Spectra.R index 5cbdb781..d3da9b44 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -11,6 +11,11 @@ NULL #' #' @name Spectra-class #' +#' @aliases Spectra +#' @aliases setBackend +#' @aliases export +#' @aliases dataStorageBasePath +#' #' @description #' #' The `Spectra` class encapsules spectral mass spectrometry (MS) data and @@ -28,6 +33,22 @@ NULL #' - [processingChunkSize()] for information on parallel and chunk-wise data #' processing. #' - [plotSpectra()] for visualization of `Spectra`. +#' - [spectraData()] for accessing and using MS data through `Spectra`. +#' +#' +#' @section Data stored in a `Spectra` object: +#' +#' The `Spectra` object is a container for MS data that includes mass peak +#' data (*m/z* and related intensity values, also referred to as *peaks data* +#' in the context of `Spectra`) and metadata of individual spectra (so called +#' *spectra variables*). While a core set of spectra variables (the +#' `coreSpectraVariables()`) are guaranteed to be provided by a +#' `Spectra`, it is possible to add arbitrary additional spectra variables to +#' a `Spectra` object. +#' +#' The `Spectra` object is designed to contain MS data of a (large) set of mass +#' spectra. The data is organized *linearly* and can be thought of a list of +#' mass spectra, i.e. each element in the `Spectra` is one spectrum. #' #' #' @section Creation of objects: @@ -486,30 +507,7 @@ setMethod("dataStorageBasePath", "Spectra", function(object) { setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { dataStorageBasePath(object@backend) <- value object -} -) - -## CONTINUNE HERE: -## - check if some additional methods/functions need to be moved up. - -################################################################################ -## -## Merging, splitting and aggregating Spectra: length of Spectra is changed -## -################################################################################ - -#' @title Merging, splitting and aggregating Spectra -#' -#' @aliases [,Spectra-method - -################################################################################ -## -## Filtering, subsetting Spectra: subsetting Spectra and its data content. 
-## -################################################################################ - -#' @title Filtering and subsetting Spectra objects -#' +}) ################################################################################ ## @@ -519,68 +517,409 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' @title Accessing mass spectrometry data #' +#' @name spectraData +#' +#' @aliases acquisitionNum +#' @aliases centroided +#' @aliases collisionEnergy +#' @aliases dataOrigin +#' @aliases dataStorage +#' @aliases ionCount +#' @aliases isCentroided +#' @aliases isEmpty +#' @aliases isolationWindowLowerMz +#' @aliases isolationWindowUpperMz +#' @aliases isolationWindowTargetMz +#' @aliases lengths +#' @aliases msLevel +#' @aliases mz +#' @aliases peaksData +#' @aliases peaksVariables +#' @aliases polarity +#' @aliases precursorCharge +#' @aliases precursorIntensity +#' @aliases precursorMz +#' @aliases rtime +#' @aliases scanIndex +#' @aliases smoothed +#' @aliases spectraData +#' @aliases spectraNames +#' @aliases spectraVariables +#' @aliases tic +#' @aliases uniqueMsLevels #' - -################################################################################ -## -## Data manipulation and analysis operations (lazy processing) -## -################################################################################ - -#' @title Data manipulation and analysis methods +#' @description #' - -################################################################################ -## -## Spectra similarity calculations -## -################################################################################ - -#' @title Spectra similarity calculations - - -#' @rdname Spectra +#' As detailed in the documentation of the [Spectra-class], a `Spectra` object +#' is a container for mass spectrometry (MS) data that includes both the mass +#' peaks data (or *peaks data*, generally *m/z* and intensity values) as well +#' as spectra metadata (so called *spectra variables*). Spectra variables +#' generally define one value per spectrum, while for peaks variables one value +#' per mass peak is defined and hence multiple values per spectrum (depending +#' on the number of mass peaks of a spectrum). +#' +#' Data can be extracted from a `Spectra` object using dedicated accessor +#' functions or also using the `$` operator. Depending on the backend class +#' used by the `Spectra` to represent the data, data can also be added or +#' replaced (again, using dedicated functions or using `$<-`). +#' +#' @section Spectra variables: +#' +#' A common set of *core spectra variables* are defined for `Spectra`. These +#' have a pre-defined data type and each `Spectra` will return a value for +#' these if requested. If no value for a spectra variable is defined, a missing +#' value (of the correct data type) is returned. The list of core spectra +#' variables and their respective data type is: +#' +#' - *acquisitionNum* `integer(1)`: the index of acquisition of a spectrum +#' during an MS run. +#' - *centroided* `logical(1)`: whether the spectrum is in profile or centroid +#' mode. +#' - *collisionEnergy* `numeric(1)`: collision energy used to create an MSn +#' spectrum. +#' - *dataOrigin* `character(1)`: the *origin* of the spectrum's data, e.g. the +#' mzML file from which it was read. +#' - *dataStorage* `character(1)`: the (current) storage location of the +#' spectrum data. This value depends on the backend used to handle and +#' provide the data. 
For an *in-memory* backend like the `MsBackendDataFrame` +#' this will be `""`, for an on-disk backend such as the +#' `MsBackendHdf5Peaks` it will be the name of the HDF5 file where the +#' spectrum's peak data is stored. +#' - *isolationWindowLowerMz* `numeric(1)`: lower m/z for the isolation +#' window in which the (MSn) spectrum was measured. +#' - *isolationWindowTargetMz* `numeric(1)`: the target m/z for the isolation +#' window in which the (MSn) spectrum was measured. +#' - *isolationWindowUpperMz* `numeric(1)`: upper m/z for the isolation window +#' in which the (MSn) spectrum was measured. +#' - *msLevel* `integer(1)`: the MS level of the spectrum. +#' - *polarity* `integer(1)`: the polarity of the spectrum (`0` and `1` +#' representing negative and positive polarity, respectively). +#' - *precScanNum* `integer(1)`: the scan (acquisition) number of the precursor +#' for an MSn spectrum. +#' - *precursorCharge* `integer(1)`: the charge of the precursor of an MSn +#' spectrum. +#' - *precursorIntensity* `numeric(1)`: the intensity of the precursor of an +#' MSn spectrum. +#' - *precursorMz* `numeric(1)`: the m/z of the precursor of an MSn spectrum. +#' - *rtime* `numeric(1)`: the retention time of a spectrum. +#' - *scanIndex* `integer(1)`: the index of a spectrum within a (raw) file. +#' - *smoothed* `logical(1)`: whether the spectrum was smoothed. +#' +#' For each of these spectra variable a dedicated accessor function is defined +#' (such as `msLevel()` or `rtime()`) that allows to extract the values of +#' that spectra variable for all spectra in a `Spectra` object. Also, +#' replacement functions are defined, but not all backends might support +#' replacing values for spectra variables. As described above, additional +#' spectra variables can be defined or added. The `spectraVariables()` function +#' can be used to +#' +#' Values for multiple spectra variables, or all spectra vartiables* can be +#' extracted with the `spectraData()` function. +#' +#' +#' @section Peaks variables: +#' +#' `Spectra` also provide mass peak data with the *m/z* and intensity values +#' being the *core* peaks variables: +#' +#' - *intensity* `numeric`: intensity values for the spectrum's peaks. +#' - *mz* `numeric`: the m/z values for the spectrum's peaks. +#' +#' Values for these can be extracted with the `mz()` and `intensity()` +#' functions, or the `peaksData()` function. The former functions return a +#' `NumericList` with the respective values, while the latter returns a `List` +#' with `numeric` two-column matrices. The list of peaks matrices can also +#' be extracted using `as(x, "list")` or `as(x, "SimpleList")` with `x` being +#' a `Spectra` object. +#' +#' Some `Spectra`/backends provide also values for additional peaks variables. +#' The set of available peaks variables can be extracted with the +#' `peaksVariables()` function. +#' +#' +#' @section Functions to access MS data: +#' +#' The set of available functions to extract data from, or set data in, a +#' `Spectra` object are (in alphabetical order) listed below. Note that there +#' are also other functions to extract information from a `Spectra` object +#' documented LLLLLLL +#' +#' - `$`, `$<-`: gets (or sets) a spectra variable for all spectra in `object`. +#' See examples for details. Note that replacing values of a peaks variable +#' is not supported with a non-empty processing queue, i.e. if any filtering +#' or data manipulations on the peaks data was performed. 
In these cases +#' [applyProcessing()] needs to be called first to apply all cached data +#' operations. +#' +#' - `acquisitionNum()`: returns the acquisition number of each +#' spectrum. Returns an `integer` of length equal to the number of +#' spectra (with `NA_integer_` if not available). +#' +#' - `centroided()`, `centroided<-`: gets or sets the centroiding +#' information of the spectra. `centroided()` returns a `logical` +#' vector of length equal to the number of spectra with `TRUE` if a +#' spectrum is centroided, `FALSE` if it is in profile mode and `NA` +#' if it is undefined. See also `isCentroided()` for estimating from +#' the spectrum data whether the spectrum is centroided. `value` +#' for `centroided<-` is either a single `logical` or a `logical` of +#' length equal to the number of spectra in `object`. +#' +#' - `collisionEnergy()`, `collisionEnergy<-`: gets or sets the +#' collision energy for all spectra in `object`. `collisionEnergy()` +#' returns a `numeric` with length equal to the number of spectra +#' (`NA_real_` if not present/defined), `collisionEnergy<-` takes a +#' `numeric` of length equal to the number of spectra in `object`. +#' +#' - `coreSpectraVariables()`: returns the *core* spectra variables along with +#' their expected data type. +#' +#' - `dataOrigin()`, `dataOrigin<-`: gets or sets the *data origin* for each +#' spectrum. `dataOrigin()` returns a `character` vector (same length than +#' `object`) with the origin of the spectra. `dataOrigin<-` expects a +#' `character` vector (same length than `object`) with the replacement +#' values for the data origin of each spectrum. +#' +#' - `dataStorage()`: returns a `character` vector (same length than `object`) +#' with the data storage location of each spectrum. +#' +#' - `intensity()`: gets the intensity values from the spectra. Returns +#' a [NumericList()] of `numeric` vectors (intensity values for each +#' spectrum). The length of the list is equal to the number of +#' `spectra` in `object`. +#' +#' - `ionCount()`: returns a `numeric` with the sum of intensities for +#' each spectrum. If the spectrum is empty (see `isEmpty()`), +#' `NA_real_` is returned. +#' +#' - `isCentroided()`: a heuristic approach assessing if the spectra in +#' `object` are in profile or centroided mode. The function takes +#' the `qtl`th quantile top peaks, then calculates the difference +#' between adjacent m/z value and returns `TRUE` if the first +#' quartile is greater than `k`. (See `Spectra:::.isCentroided()` for +#' the code.) +#' +#' - `isEmpty()`: checks whether a spectrum in `object` is empty +#' (i.e. does not contain any peaks). Returns a `logical` vector of +#' length equal number of spectra. +#' +#' - `isolationWindowLowerMz()`, `isolationWindowLowerMz<-`: gets or sets the +#' lower m/z boundary of the isolation window. +#' +#' - `isolationWindowTargetMz()`, `isolationWindowTargetMz<-`: gets or sets the +#' target m/z of the isolation window. +#' +#' - `isolationWindowUpperMz()`, `isolationWindowUpperMz<-`: gets or sets the +#' upper m/z boundary of the isolation window. +#' +#' - `length()`: gets the number of spectra in the object. +#' +#' - `lengths()`: gets the number of peaks (m/z-intensity values) per +#' spectrum. Returns an `integer` vector (length equal to the +#' number of spectra). For empty spectra, `0` is returned. +#' +#' - `msLevel()`: gets the spectra's MS level. Returns an integer vector (names +#' being spectrum names, length equal to the number of spectra) with the MS +#' level for each spectrum. 
+#' +#' - `mz()`: gets the mass-to-charge ratios (m/z) from the +#' spectra. Returns a [NumericList()] or length equal to the number of +#' spectra, each element a `numeric` vector with the m/z values of +#' one spectrum. +#' +#' - `peaksData()`: gets the *peaks* data for all spectra in `object`. Peaks +#' data consist of the m/z and intensity values as well as possible additional +#' annotations (variables) of all peaks of each spectrum. The function +#' returns a [SimpleList()] of two dimensional arrays (either `matrix` or +#' `data.frame`), with each array providing the values for the requested +#' *peak variables* (by default `"mz"` and `"intensity"`). Optional parameter +#' `columns` is passed to the backend's `peaksData()` function to allow +#' the selection of specific (or additional) peaks variables (columns) that +#' should be extracted (if available). Importantly, +#' it is **not** guaranteed that each backend supports this parameter (while +#' each backend must support extraction of `"mz"` and `"intensity"` columns). +#' Parameter `columns` defaults to `c("mz", "intensity")` but any value +#' returned by `peaksVariables(object)` is supported. +#' Note also that it is possible to extract the peak data with +#' `as(x, "list")` and `as(x, "SimpleList")` as a `list` and `SimpleList`, +#' respectively. Note however that, in contrast to `peaksData()`, `as()` +#' does not support the parameter `columns`. +#' +#' - `peaksVariables()`: lists the available variables for mass peaks provided +#' by the backend. Default peak variables are `"mz"` and `"intensity"` (which +#' all backends need to support and provide), but some backends might provide +#' additional variables. +#' These variables correspond to the column names of the peak data array +#' returned by `peaksData()`. +#' +#' - `polarity()`, `polarity<-`: gets or sets the polarity for each +#' spectrum. `polarity()` returns an `integer` vector (length equal +#' to the number of spectra), with `0` and `1` representing negative +#' and positive polarities, respectively. `polarity<-` expects an +#' `integer` vector of length 1 or equal to the number of spectra. +#' +#' - `precursorCharge()`, `precursorIntensity()`, `precursorMz()`, +#' `precScanNum()`, `precAcquisitionNum()`: gets the charge (`integer`), +#' intensity (`numeric`), m/z (`numeric`), scan index (`integer`) +#' and acquisition number (`interger`) of the precursor for MS level > +#' 2 spectra from the object. Returns a vector of length equal to +#' the number of spectra in `object`. `NA` are reported for MS1 +#' spectra of if no precursor information is available. +#' +#' - `rtime()`, `rtime<-`: gets or sets the retention times (in seconds) +#' for each spectrum. `rtime()` returns a `numeric` vector (length +#' equal to the number of spectra) with the retention time for each +#' spectrum. `rtime<-` expects a numeric vector with length equal +#' to the number of spectra. +#' +#' - `scanIndex()`: returns an `integer` vector with the *scan index* +#' for each spectrum. This represents the relative index of the +#' spectrum within each file. Note that this can be different to the +#' `acquisitionNum` of the spectrum which represents the index of the +#' spectrum during acquisition/measurement (as reported in the mzML file). +#' +#' - `smoothed()`,`smoothed<-`: gets or sets whether a spectrum is +#' *smoothed*. `smoothed()` returns a `logical` vector of length equal +#' to the number of spectra. 
`smoothed<-` takes a `logical` vector +#' of length 1 or equal to the number of spectra in `object`. +#' +#' - `spectraData()`: gets general spectrum metadata (annotation, also called +#' header). `spectraData()` returns a `DataFrame`. Note that this +#' method does by default **not** return m/z or intensity values. +#' +#' - `spectraData<-`: **replaces** the full spectra data of the `Spectra` +#' object with the one provided with `value`. The `spectraData<-` function +#' expects a `DataFrame` to be passed as value with the same number of rows +#' as there a spectra in `object`. Note that replacing values of +#' peaks variables is not supported with a non-empty processing queue, i.e. +#' if any filtering or data manipulations on the peaks data was performed. +#' In these cases [applyProcessing()] needs to be called first to apply all +#' cached data operations and empty the processing queue. +#' +#' - `spectraNames()`, `spectraNames<-`: gets or sets the spectra names. +#' +#' - `spectraVariables()`: returns a `character` vector with the +#' available spectra variables (columns, fields or attributes of each +#' spectrum) available in `object`. Note that `spectraVariables()` does not +#' list the *peak variables* (`"mz"`, `"intensity"` and eventual additional +#' annotations for each MS peak). Peak variables are returned by +#' `peaksVariables()`. +#' +#' - `tic()`: gets the total ion current/count (sum of signal of a +#' spectrum) for all spectra in `object`. By default, the value +#' reported in the original raw data file is returned. For an empty +#' spectrum, `0` is returned. +#' +#' - `uniqueMsLevels()`: get the unique MS levels available in `object`. This +#' function is supposed to be more efficient than `unique(msLevel(object))`. #' -#' @importFrom MsCoreUtils vapply1c +#' @md #' -#' @exportMethod c -setMethod("c", "Spectra", function(x, ...) { - .concatenate_spectra(unname(list(unname(x), ...))) -}) - -#' @rdname Spectra -setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) { - bcknds <- split(x@backend, f, ...) - lapply(bcknds, function(b) { - slot(x, "backend", check = FALSE) <- b - x - }) -}) - - -#### --------------------------------------------------------------------------- -## -## ACCESSOR METHODS -## -#### --------------------------------------------------------------------------- - -#' @rdname Spectra -setMethod("acquisitionNum", "Spectra", function(object) - acquisitionNum(object@backend)) - -#' @rdname Spectra -setMethod( - "peaksData", "Spectra", - function(object, columns = c("mz", "intensity"), - f = processingChunkFactor(object), ..., BPPARAM = bpparam()) { - if (length(object@processingQueue) || length(f)) - SimpleList(.peaksapply(object, columns = columns, f = f)) - else SimpleList(peaksData(object@backend, columns = columns)) - }) - -#' @rdname Spectra -setMethod("peaksVariables", "Spectra", function(object) - peaksVariables(object@backend)) +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail +#' +#' @examples +#' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +#' ## backend. 
+#' sciex_file <- dir(system.file("sciex", package = "msdata"),
+#' full.names = TRUE)
+#' sciex <- Spectra(sciex_file, backend = MsBackendMzR())
+#' sciex
+#'
+#' ## Get the number of spectra in the data set
+#' length(sciex)
+#'
+#' ## Get the number of mass peaks per spectrum - limit to the first 6
+#' lengths(sciex) |> head()
+#'
+#' ## Get the MS level for each spectrum - limit to the first 6 spectra
+#' msLevel(sciex) |> head()
+#'
+#' ## Alternatively, we could also use $ to access a specific spectra variable.
+#' ## This could also be used to add additional spectra variables to the
+#' ## object (see further below).
+#' sciex$msLevel |> head()
+#'
+#' ## Get the intensity and m/z values.
+#' intensity(sciex)
+#' mz(sciex)
+#'
+#' ## Convert a subset of the Spectra object to a long DataFrame.
+#' asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel"))
+#'
+#' ## Create a Spectra providing a `DataFrame` containing the spectrum data.
+#'
+#' spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
+#' spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2))
+#' spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8))
+#'
+#' s <- Spectra(spd)
+#' s
+#'
+#' ## Get the peak data (m/z and intensity values).
+#' pks <- peaksData(s)
+#' pks
+#' pks[[1]]
+#' pks[[2]]
+#'
+#' ## Note that we could get the same result by coercing the `Spectra` to
+#' ## a `list` or `SimpleList`:
+#' as(s, "list")
+#' as(s, "SimpleList")
+#'
+#' ## List all available spectra variables (i.e. spectrum data and metadata).
+#' spectraVariables(s)
+#'
+#' ## For all *core* spectrum variables accessor functions are available. These
+#' ## return NA if the variable was not set.
+#' centroided(s)
+#' dataStorage(s)
+#' rtime(s)
+#' precursorMz(s)
+#'
+#' ## The core spectra variables are:
+#' coreSpectraVariables()
+#'
+#' ## Add an additional metadata column.
+#' s$spectrum_id <- c("sp_1", "sp_2")
+#'
+#' ## List spectra variables, "spectrum_id" is now also listed
+#' spectraVariables(s)
+#'
+#' ## Get the values for the new spectra variable
+#' s$spectrum_id
+#'
+#' ## Extract specific spectra variables.
+#' spectraData(s, columns = c("spectrum_id", "msLevel"))
+#'
+#'
+#' ## ---- PEAKS VARIABLES AND DATA ----
+#'
+#' ## Some `MsBackend` classes provide support for arbitrary peaks variables
+#' ## (in addition to the mandatory `"mz"` and `"intensity"` values). Below
+#' ## we create a simple data frame with an additional peak variable `"pk_ann"`
+#' ## and create a `Spectra` with a `MsBackendMemory` for that data.
+#' ## Importantly, the number of values (per spectrum) needs to be the same
+#' ## for all peak variables.
+#'
+#' tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5))
+#' tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1))
+#' tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45))
+#' tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P"))
+#'
+#' ## Create the Spectra. With parameter `peaksVariables` we can define
+#' ## the columns in `tmp` that contain peaks variables.
+#' sps <- Spectra(tmp, source = MsBackendMemory(), +#' peaksVariables = c("mz", "intensity", "pk_ann")) +#' peaksVariables(sps) +#' +#' ## Extract just the m/z and intensity values +#' peaksData(sps)[[1L]] +#' +#' ## Extract the full peaks data +#' peaksData(sps, columns = peaksVariables(sps))[[1L]] +#' +#' ## Access just the pk_ann variable +#' sps$pk_ann +NULL #' @importFrom methods setAs setAs("Spectra", "list", function(from, to) { @@ -591,48 +930,68 @@ setAs("Spectra", "SimpleList", function(from, to) { peaksData(from) }) -#' @rdname Spectra +#' @export +#' +#' @rdname spectraData +#' +#' @param spectraVars `character()` indicating what spectra variables to add to +#' the `DataFrame`. Default is `spectraVariables(object)`, i.e. all +#' available variables. +asDataFrame <- function(object, i = seq_along(object), + spectraVars = spectraVariables(object)) { + stopifnot(inherits(object, "Spectra")) + object <- object[i] + n <- sapply(peaksData(object), nrow) + v <- spectraData(object)[rep(seq_along(object), n), spectraVars] + p <- do.call(rbind, as.list(peaksData(object))) + cbind(p, v) +} + +#' @rdname spectraData +setMethod("acquisitionNum", "Spectra", function(object) + acquisitionNum(object@backend)) + +#' @rdname spectraData setMethod("centroided", "Spectra", function(object) { centroided(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("centroided", "Spectra", function(object, value) { centroided(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData setMethod("collisionEnergy", "Spectra", function(object) { collisionEnergy(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("collisionEnergy", "Spectra", function(object, value) { collisionEnergy(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData +#' +#' @export +coreSpectraVariables <- function() .SPECTRA_DATA_COLUMNS + +#' @rdname spectraData setMethod("dataOrigin", "Spectra", function(object) dataOrigin(object@backend)) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("dataOrigin", "Spectra", function(object, value) { dataOrigin(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData setMethod("dataStorage", "Spectra", function(object) dataStorage(object@backend)) -#' @rdname Spectra -setMethod("dropNaSpectraVariables", "Spectra", function(object) { - object@backend <- dropNaSpectraVariables(object@backend) - object -}) - -#' @rdname Spectra +#' @rdname spectraData setMethod("intensity", "Spectra", function(object, f = processingChunkFactor(object), ...) { @@ -642,7 +1001,7 @@ setMethod("intensity", "Spectra", function(object, else intensity(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setMethod("ionCount", "Spectra", function(object) { if (length(object)) unlist(.peaksapply( @@ -651,7 +1010,7 @@ setMethod("ionCount", "Spectra", function(object) { else numeric() }) -#' @rdname Spectra +#' @rdname spectraData setMethod("isCentroided", "Spectra", function(object, ...) { if (length(object)) unlist(.peaksapply(object, FUN = .peaks_is_centroided), @@ -659,7 +1018,7 @@ setMethod("isCentroided", "Spectra", function(object, ...) { else logical() }) -#' @rdname Spectra +#' @rdname spectraData setMethod("isEmpty", "Spectra", function(x) { if (length(x)) unlist(.peaksapply(x, FUN = function(pks, ...) 
nrow(pks) == 0), @@ -667,125 +1026,45 @@ setMethod("isEmpty", "Spectra", function(x) { else logical() }) -#' @rdname Spectra +#' @rdname spectraData setMethod("isolationWindowLowerMz", "Spectra", function(object) { isolationWindowLowerMz(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("isolationWindowLowerMz", "Spectra", function(object, value) { isolationWindowLowerMz(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData setMethod("isolationWindowTargetMz", "Spectra", function(object) { isolationWindowTargetMz(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("isolationWindowTargetMz", "Spectra", function(object, value) { isolationWindowTargetMz(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData setMethod("isolationWindowUpperMz", "Spectra", function(object) { isolationWindowUpperMz(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("isolationWindowUpperMz", "Spectra", function(object, value) { isolationWindowUpperMz(object@backend) <- value object }) -#' @rdname Spectra -#' -#' @exportMethod containsMz -setMethod("containsMz", "Spectra", function(object, mz = numeric(), - tolerance = 0, - ppm = 20, which = c("any", "all"), - BPPARAM = bpparam()) { - cond_fun <- match.fun(match.arg(which)) - if (all(is.na(mz))) - return(rep(NA, length(object))) - mz <- unique(sort(mz)) - BPPARAM <- backendBpparam(object@backend, BPPARAM) - ## TODO: fix to use .peaksapply instead. - if (is(BPPARAM, "SerialParam")) - .has_mz(object, mz, tolerance = tolerance, ppm = ppm, - condFun = cond_fun, parallel = BPPARAM) - else { - sp <- SerialParam() - f <- as.factor(dataStorage(object)) - res <- .lapply(object, FUN = .has_mz, mz = mz, tolerance = tolerance, - condFun = cond_fun, parallel = sp, f = f, - BPPARAM = BPPARAM) - unsplit(res, f = f) - } -}) - -#' @rdname Spectra -#' -#' @exportMethod containsNeutralLoss -setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, - tolerance = 0, ppm = 20, - BPPARAM = bpparam()) { - BPPARAM <- backendBpparam(object@backend, BPPARAM) - ## TODO: FIX me to use chunk size. - if (is(BPPARAM, "SerialParam")) { - .has_mz_each(object, precursorMz(object) - neutralLoss, - tolerance = tolerance, ppm = ppm, parallel = BPPARAM) - } else { - sp <- SerialParam() - f <- as.factor(dataStorage(object)) - res <- .lapply(object, FUN = function(obj, n, tol, ppm, par) { - .has_mz_each(obj, precursorMz(obj) - n, tolerance = tol, - ppm = ppm, parallel = sp) - }, n = neutralLoss, tol = tolerance, ppm = ppm, par = sp, f = f, - BPPARAM = BPPARAM) - unsplit(res, f = f) - } -}) - -#' @rdname Spectra -#' -#' @importMethodsFrom ProtGenerics spectrapply -#' -#' @exportMethod spectrapply -setMethod("spectrapply", "Spectra", function(object, FUN, ..., - chunkSize = integer(), - f = factor(), - BPPARAM = SerialParam()) { - if (missing(FUN)) - FUN <- identity - if (length(chunkSize)) - return(chunkapply(object, FUN, ..., chunkSize = chunkSize)) - if (!length(f)) - f <- as.factor(seq_along(object)) - .lapply(object, FUN = FUN, f = f, ..., - BPPARAM = backendBpparam(object@backend, BPPARAM)) -}) - -#' @rdname Spectra +#' @rdname spectraData #' #' @exportMethod length setMethod("length", "Spectra", function(x) length(x@backend)) -#' @rdname Spectra -setMethod("msLevel", "Spectra", function(object) msLevel(object@backend)) - -#' @rdname Spectra -setMethod("mz", "Spectra", function(object, f = processingChunkFactor(object), - ...) 
{ - if (length(object@processingQueue) || length(f)) - NumericList(.peaksapply(object, FUN = function(z, ...) z[, 1], - f = f, ...), compress = FALSE) - else mz(object@backend) -}) - -#' @rdname Spectra +#' @rdname spectraData #' #' @exportMethod lengths setMethod("lengths", "Spectra", function(x, use.names = FALSE) { @@ -798,76 +1077,91 @@ setMethod("lengths", "Spectra", function(x, use.names = FALSE) { } else integer() }) -#' @rdname Spectra +#' @rdname spectraData +setMethod("msLevel", "Spectra", function(object) msLevel(object@backend)) + +#' @rdname spectraData +setMethod("mz", "Spectra", function(object, f = processingChunkFactor(object), + ...) { + if (length(object@processingQueue) || length(f)) + NumericList(.peaksapply(object, FUN = function(z, ...) z[, 1], + f = f, ...), compress = FALSE) + else mz(object@backend) +}) + +#' @rdname spectraData +setMethod( + "peaksData", "Spectra", + function(object, columns = c("mz", "intensity"), + f = processingChunkFactor(object), ..., BPPARAM = bpparam()) { + if (length(object@processingQueue) || length(f)) + SimpleList(.peaksapply(object, columns = columns, f = f)) + else SimpleList(peaksData(object@backend, columns = columns)) + }) + +#' @rdname spectraData +setMethod("peaksVariables", "Spectra", function(object) + peaksVariables(object@backend)) + +#' @rdname spectraData setMethod("polarity", "Spectra", function(object) { polarity(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("polarity", "Spectra", function(object, value) { polarity(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData setMethod("precScanNum", "Spectra", function(object) { precScanNum(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setMethod("precursorCharge", "Spectra", function(object) { precursorCharge(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setMethod("precursorIntensity", "Spectra", function(object) { precursorIntensity(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setMethod("precursorMz", "Spectra", function(object) { precursorMz(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setMethod("rtime", "Spectra", function(object) { rtime(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("rtime", "Spectra", function(object, value) { rtime(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData setMethod("scanIndex", "Spectra", function(object) { scanIndex(object@backend) }) -#' @rdname Spectra -setMethod( - "selectSpectraVariables", "Spectra", - function(object, spectraVariables = union(spectraVariables(object), - peaksVariables(object))) { - spectraVariables <- union(spectraVariables, "dataStorage") - object@backend <- selectSpectraVariables( - object@backend, spectraVariables = spectraVariables) - object - }) - -#' @rdname Spectra +#' @rdname spectraData setMethod("smoothed", "Spectra", function(object) { smoothed(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("smoothed", "Spectra", function(object, value) { smoothed(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData #' #' @importMethodsFrom ProtGenerics spectraData #' @@ -897,7 +1191,7 @@ setMethod( spectraData(object@backend, columns = columns) }) -#' @rdname Spectra +#' @rdname spectraData #' #' @importMethodsFrom ProtGenerics spectraData<- #' @@ -925,23 +1219,23 @@ setReplaceMethod("spectraData", "Spectra", function(object, value) { object }) -#' @rdname Spectra +#' @rdname 
spectraData setMethod("spectraNames", "Spectra", function(object) { spectraNames(object@backend) }) -#' @rdname Spectra +#' @rdname spectraData setReplaceMethod("spectraNames", "Spectra", function(object, value) { spectraNames(object@backend) <- value object }) -#' @rdname Spectra +#' @rdname spectraData setMethod("spectraVariables", "Spectra", function(object) { setdiff(spectraVariables(object@backend), peaksVariables(object@backend)) }) -#' @rdname Spectra +#' @rdname spectraData setMethod("tic", "Spectra", function(object, initial = TRUE) { if (!length(object)) return(numeric()) @@ -950,7 +1244,12 @@ setMethod("tic", "Spectra", function(object, initial = TRUE) { else ionCount(object) }) -#' @rdname Spectra +#' @rdname spectraData +setMethod("uniqueMsLevels", "Spectra", function(object, ...) { + uniqueMsLevels(object@backend, ...) +}) + +#' @rdname spectraData #' #' @importMethodsFrom S4Vectors $ #' @@ -971,7 +1270,7 @@ setMethod("$", "Spectra", function(x, name) { } }) -#' @rdname Spectra +#' @rdname spectraData #' #' @export setReplaceMethod("$", "Spectra", function(x, name, value) { @@ -986,6 +1285,63 @@ setReplaceMethod("$", "Spectra", function(x, name, value) { x }) + +################################################################################ +## +## Merging, splitting and aggregating Spectra: length of Spectra is changed +## +################################################################################ + +#' @title Merging, splitting and aggregating Spectra +#' +#' @rdname Spectra +#' +#' @importFrom MsCoreUtils vapply1c +#' +#' @exportMethod c +setMethod("c", "Spectra", function(x, ...) { + .concatenate_spectra(unname(list(unname(x), ...))) +}) + +#' @rdname Spectra +setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) { + bcknds <- split(x@backend, f, ...) + lapply(bcknds, function(b) { + slot(x, "backend", check = FALSE) <- b + x + }) +}) + + + +################################################################################ +## +## Filtering, subsetting Spectra: subsetting Spectra and its data content. +## +################################################################################ + +#' @title Filtering and subsetting Spectra objects +#' +#' @aliases [,Spectra-method + +#' @rdname Spectra +setMethod("dropNaSpectraVariables", "Spectra", function(object) { + object@backend <- dropNaSpectraVariables(object@backend) + object +}) + +#' @rdname Spectra +setMethod( + "selectSpectraVariables", "Spectra", + function(object, spectraVariables = union(spectraVariables(object), + peaksVariables(object))) { + spectraVariables <- union(spectraVariables, "dataStorage") + object@backend <- selectSpectraVariables( + object@backend, spectraVariables = spectraVariables) + object + }) + + #' @rdname Spectra #' #' @export @@ -1018,11 +1374,6 @@ setReplaceMethod("[[", "Spectra", function(x, i, j, ..., value) { x }) -#### --------------------------------------------------------------------------- -## -## FILTERING AND SUBSETTING -## -#### --------------------------------------------------------------------------- #' @rdname Spectra setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { @@ -1034,6 +1385,34 @@ setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { x }) +#' @rdname hidden_aliases +setMethod("combinePeaks", "list", function(object, ...) 
{ + .Deprecated("combinePeaksData", old = "combinePeaks", + msg = paste0("'combinePeaks' for lists of peak matrices is ", + "deprecated; please use 'combinePeaksData' ", + "instead.")) + combinePeaksData(object, ...) +}) + +#' @rdname Spectra +#' +#' @exportMethod combinePeaks +setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, + intensityFun = base::mean, + mzFun = base::mean, + weighted = TRUE, + msLevel. = uniqueMsLevels(object), + ...) { + object <- addProcessing( + object, .peaks_combine, ppm = ppm, tolerance = tolerance, + intensityFun = intensityFun, mzFun = mzFun, weighted = weighted, + msLevel = force(msLevel.), spectraVariables = "msLevel") + object@processing <- .logging( + object@processing, "Combining peaks within each spectrum with ppm = ", + ppm, " and tolerance = ", tolerance, ".") + object +}) + #' @rdname Spectra setMethod("filterAcquisitionNum", "Spectra", function(object, n = integer(), dataStorage = character(), @@ -1321,17 +1700,6 @@ setMethod("filterRt", "Spectra", object }) -#' @rdname Spectra -setMethod("reset", "Spectra", function(object, ...) { - object@backend <- reset(object@backend) - object@processingQueue <- list() - if (!.hasSlot(object, "processingQueueVariables")) - object <- updateObject(object, check = FALSE) - object@processingQueueVariables <- character() - object@processing <- .logging(object@processing, "Reset object.") - object -}) - #' @rdname Spectra setMethod("filterRanges", "Spectra", function(object, spectraVariables = character(), ranges = numeric(), @@ -1360,11 +1728,48 @@ setMethod("filterValues", "Spectra", object }) -#### --------------------------------------------------------------------------- + +################################################################################ ## -## DATA MANIPULATION METHODS +## Data manipulation and analysis operations (lazy processing) ## -#### --------------------------------------------------------------------------- +################################################################################ + +#' @title Data manipulation and analysis methods +#' +#' `reset()` to clean the lazy processing queue. 
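The following short sketch is not part of the patch; the toy data and the `halve_int` helper are invented for illustration. It shows how the lazy processing queue behaves: a function registered with `addProcessing()` has to accept and return a peaks matrix (with "mz" and "intensity" columns) and to have `...` in its signature; the step is only executed when peaks data is accessed, and the queue can either be made permanent with `applyProcessing()` (writeable backends only) or dropped again with `reset()`.

library(Spectra)

## Toy Spectra with two MS2 spectra; all values are made up.
spd <- DataFrame(msLevel = c(2L, 2L), rtime = c(1.1, 1.2))
spd$mz <- list(c(100.1, 103.2, 104.3), c(45.6, 120.4, 190.2))
spd$intensity <- list(c(200, 400, 34.2), c(12.3, 15.2, 6.8))
s <- Spectra(spd)

## User-defined processing step that halves all intensities. The function
## takes a peaks matrix, modifies it and returns it again.
halve_int <- function(x, ...) {
    x[, "intensity"] <- x[, "intensity"] / 2
    x
}
s <- addProcessing(s, halve_int)

## The step is only applied on-the-fly when peaks data is requested.
intensity(s)

## Either make the cached operations permanent ...
s_applied <- applyProcessing(s)

## ... or discard them again; the original intensities are then returned.
s_original <- reset(s)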
+ +#' @exportMethod addProcessing +#' +#' @importFrom ProtGenerics ProcessingStep +#' +#' @importMethodsFrom ProtGenerics addProcessing +#' +#' @importClassesFrom ProtGenerics ProcessingStep +#' +#' @importFrom methods .hasSlot +#' +#' @importFrom BiocGenerics updateObject +#' +#' @rdname Spectra +setMethod("addProcessing", "Spectra", function(object, FUN, ..., + spectraVariables = character()) { + if (missing(FUN)) + return(object) + object@processingQueue <- c(object@processingQueue, + list(ProcessingStep(FUN, ARGS = list(...)))) + if (!.hasSlot(object, "processingQueueVariables")) + object <- updateObject(object) + object@processingQueueVariables <- union(object@processingQueueVariables, + spectraVariables) + validObject(object) + object +}) + +#' @rdname Spectra +setMethod("backendBpparam", "Spectra", function(object, BPPARAM = bpparam()) { + backendBpparam(object@backend, BPPARAM) +}) #' @rdname Spectra #' @@ -1396,45 +1801,72 @@ setMethod("bin", "Spectra", function(x, binSize = 1L, breaks = NULL, #' @rdname Spectra #' -#' @exportMethod compareSpectra +#' @exportMethod containsMz +setMethod("containsMz", "Spectra", function(object, mz = numeric(), + tolerance = 0, + ppm = 20, which = c("any", "all"), + BPPARAM = bpparam()) { + cond_fun <- match.fun(match.arg(which)) + if (all(is.na(mz))) + return(rep(NA, length(object))) + mz <- unique(sort(mz)) + BPPARAM <- backendBpparam(object@backend, BPPARAM) + ## TODO: fix to use .peaksapply instead. + if (is(BPPARAM, "SerialParam")) + .has_mz(object, mz, tolerance = tolerance, ppm = ppm, + condFun = cond_fun, parallel = BPPARAM) + else { + sp <- SerialParam() + f <- as.factor(dataStorage(object)) + res <- .lapply(object, FUN = .has_mz, mz = mz, tolerance = tolerance, + condFun = cond_fun, parallel = sp, f = f, + BPPARAM = BPPARAM) + unsplit(res, f = f) + } +}) + +#' @rdname Spectra #' -#' @importFrom MsCoreUtils ndotproduct +#' @exportMethod containsNeutralLoss +setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, + tolerance = 0, ppm = 20, + BPPARAM = bpparam()) { + BPPARAM <- backendBpparam(object@backend, BPPARAM) + ## TODO: FIX me to use chunk size. + if (is(BPPARAM, "SerialParam")) { + .has_mz_each(object, precursorMz(object) - neutralLoss, + tolerance = tolerance, ppm = ppm, parallel = BPPARAM) + } else { + sp <- SerialParam() + f <- as.factor(dataStorage(object)) + res <- .lapply(object, FUN = function(obj, n, tol, ppm, par) { + .has_mz_each(obj, precursorMz(obj) - n, tolerance = tol, + ppm = ppm, parallel = sp) + }, n = neutralLoss, tol = tolerance, ppm = ppm, par = sp, f = f, + BPPARAM = BPPARAM) + unsplit(res, f = f) + } +}) + +#' @rdname Spectra #' -#' @importMethodsFrom ProtGenerics compareSpectra +#' @importFrom MsCoreUtils entropy nentropy #' -#' @exportMethod compareSpectra -setMethod("compareSpectra", signature(x = "Spectra", y = "Spectra"), - function(x, y, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, - FUN = ndotproduct, ..., SIMPLIFY = TRUE) { - mat <- .compare_spectra_chunk(x, y, MAPFUN = MAPFUN, - tolerance = tolerance, - ppm = ppm, FUN = FUN, ...) - if (SIMPLIFY && (length(x) == 1 || length(y) == 1)) - mat <- as.vector(mat) - mat - }) +#' @export +setMethod("entropy", "Spectra", function(object, normalized = TRUE) { + if (length(object)) { + if (normalized) entropy_fun <- nentropy + else entropy_fun <- entropy + unlist(.peaksapply( + object, FUN = function(pks, ...) 
entropy_fun(pks[, "intensity"])), + use.names = FALSE + ) + } else numeric() +}) #' @rdname Spectra -setMethod("compareSpectra", signature(x = "Spectra", y = "missing"), - function(x, y = NULL, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, - FUN = ndotproduct, ..., SIMPLIFY = TRUE) { - if (length(x) == 1) - return(compareSpectra(x, x, MAPFUN = MAPFUN, - tolerance = tolerance, - ppm = ppm, FUN = FUN, ..., - SIMPLIFY = SIMPLIFY)) - mat <- .compare_spectra_self(x, MAPFUN = MAPFUN, FUN = FUN, - tolerance = tolerance, ppm = ppm, - ...) - if (SIMPLIFY && length(x) == 1) - mat <- as.vector(mat) - mat - }) - -## estimateMzResolution - -## estimateNoise - -## normalize +setMethod("entropy", "ANY", function(object, ...) { + MsCoreUtils::entropy(object) +}) #' @rdname Spectra #' @@ -1478,10 +1910,6 @@ setMethod("pickPeaks", "Spectra", object }) -## quantify - -## removeReporters - #' @rdname Spectra #' #' @exportMethod replaceIntensitiesBelow @@ -1509,6 +1937,16 @@ setMethod("replaceIntensitiesBelow", "Spectra", object }) +#' @rdname Spectra +setMethod("reset", "Spectra", function(object, ...) { + object@backend <- reset(object@backend) + object@processingQueue <- list() + if (!.hasSlot(object, "processingQueueVariables")) + object <- updateObject(object, check = FALSE) + object@processingQueueVariables <- character() + object@processing <- .logging(object@processing, "Reset object.") + object +}) #' @rdname Spectra #' @@ -1541,120 +1979,29 @@ setMethod("smooth", "Spectra", x }) -#' @exportMethod addProcessing -#' -#' @importFrom ProtGenerics ProcessingStep -#' -#' @importMethodsFrom ProtGenerics addProcessing -#' -#' @importClassesFrom ProtGenerics ProcessingStep -#' -#' @importFrom methods .hasSlot -#' -#' @importFrom BiocGenerics updateObject -#' -#' @rdname Spectra -setMethod("addProcessing", "Spectra", function(object, FUN, ..., - spectraVariables = character()) { - if (missing(FUN)) - return(object) - object@processingQueue <- c(object@processingQueue, - list(ProcessingStep(FUN, ARGS = list(...)))) - if (!.hasSlot(object, "processingQueueVariables")) - object <- updateObject(object) - object@processingQueueVariables <- union(object@processingQueueVariables, - spectraVariables) - validObject(object) - object -}) - -#' @rdname Spectra -#' -#' @export -coreSpectraVariables <- function() .SPECTRA_DATA_COLUMNS - -#' @rdname Spectra -setMethod("uniqueMsLevels", "Spectra", function(object, ...) { - uniqueMsLevels(object@backend, ...) -}) - -#' @rdname Spectra -setMethod("backendBpparam", "Spectra", function(object, BPPARAM = bpparam()) { - backendBpparam(object@backend, BPPARAM) -}) - -#' @rdname hidden_aliases -setMethod("combinePeaks", "list", function(object, ...) { - .Deprecated("combinePeaksData", old = "combinePeaks", - msg = paste0("'combinePeaks' for lists of peak matrices is ", - "deprecated; please use 'combinePeaksData' ", - "instead.")) - combinePeaksData(object, ...) -}) - #' @rdname Spectra #' -#' @exportMethod combinePeaks -setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, - intensityFun = base::mean, - mzFun = base::mean, - weighted = TRUE, - msLevel. = uniqueMsLevels(object), - ...) 
{ - object <- addProcessing( - object, .peaks_combine, ppm = ppm, tolerance = tolerance, - intensityFun = intensityFun, mzFun = mzFun, weighted = weighted, - msLevel = force(msLevel.), spectraVariables = "msLevel") - object@processing <- .logging( - object@processing, "Combining peaks within each spectrum with ppm = ", - ppm, " and tolerance = ", tolerance, ".") - object -}) - - -#' @rdname Spectra -#' -#' @importFrom MsCoreUtils entropy nentropy +#' @importMethodsFrom ProtGenerics spectrapply #' -#' @export -setMethod("entropy", "Spectra", function(object, normalized = TRUE) { - if (length(object)) { - if (normalized) entropy_fun <- nentropy - else entropy_fun <- entropy - unlist(.peaksapply( - object, FUN = function(pks, ...) entropy_fun(pks[, "intensity"])), - use.names = FALSE - ) - } else numeric() -}) -#' @rdname Spectra -setMethod("entropy", "ANY", function(object, ...) { - MsCoreUtils::entropy(object) +#' @exportMethod spectrapply +setMethod("spectrapply", "Spectra", function(object, FUN, ..., + chunkSize = integer(), + f = factor(), + BPPARAM = SerialParam()) { + if (missing(FUN)) + FUN <- identity + if (length(chunkSize)) + return(chunkapply(object, FUN, ..., chunkSize = chunkSize)) + if (!length(f)) + f <- as.factor(seq_along(object)) + .lapply(object, FUN = FUN, f = f, ..., + BPPARAM = backendBpparam(object@backend, BPPARAM)) }) -#' @export -#' @rdname Spectra -#' -#' @param spectraVars `character()` indicating what spectra variables to add to -#' the `DataFrame`. Default is `spectraVariables(object)`, i.e. all -#' available variables. -#' -#' @examples -#' -#' ## Convert a subset of the Spectra object to a long DataFrame. -#' asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) -asDataFrame <- function(object, i = seq_along(object), - spectraVars = spectraVariables(object)) { - stopifnot(inherits(object, "Spectra")) - object <- object[i] - n <- sapply(peaksData(object), nrow) - v <- spectraData(object)[rep(seq_along(object), n), spectraVars] - p <- do.call(rbind, as.list(peaksData(object))) - cbind(p, v) -} - #' @title Estimate Precursor Intensities #' +#' @aliases estimatePrecursorIntensity +#' #' @description #' #' Some MS instrument manufacturers don't provide precursor intensities for @@ -1733,3 +2080,48 @@ setMethod( msLevel = msLevel., BPPARAM = BPPARAM), use.names = FALSE) }) + + +################################################################################ +## +## Spectra similarity calculations +## +################################################################################ + +#' @title Spectra similarity calculations + +#' @rdname Spectra +#' +#' @exportMethod compareSpectra +#' +#' @importFrom MsCoreUtils ndotproduct +#' +#' @importMethodsFrom ProtGenerics compareSpectra +#' +#' @exportMethod compareSpectra +setMethod("compareSpectra", signature(x = "Spectra", y = "Spectra"), + function(x, y, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, + FUN = ndotproduct, ..., SIMPLIFY = TRUE) { + mat <- .compare_spectra_chunk(x, y, MAPFUN = MAPFUN, + tolerance = tolerance, + ppm = ppm, FUN = FUN, ...) 
+ if (SIMPLIFY && (length(x) == 1 || length(y) == 1)) + mat <- as.vector(mat) + mat + }) +#' @rdname Spectra +setMethod("compareSpectra", signature(x = "Spectra", y = "missing"), + function(x, y = NULL, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, + FUN = ndotproduct, ..., SIMPLIFY = TRUE) { + if (length(x) == 1) + return(compareSpectra(x, x, MAPFUN = MAPFUN, + tolerance = tolerance, + ppm = ppm, FUN = FUN, ..., + SIMPLIFY = SIMPLIFY)) + mat <- .compare_spectra_self(x, MAPFUN = MAPFUN, FUN = FUN, + tolerance = tolerance, ppm = ppm, + ...) + if (SIMPLIFY && length(x) == 1) + mat <- as.vector(mat) + mat + }) From 3bfe934a7f8d9a6e159d94af2619e3676cf32280 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 25 Sep 2024 07:50:48 +0200 Subject: [PATCH 24/41] docs: restructure documentation --- R/AllGenerics.R | 10 - R/MsBackend.R | 4 +- R/Spectra-estimatePrecursorMz.R | 4 + R/Spectra-functions.R | 42 +- R/Spectra-neutralLoss.R | 4 + R/Spectra.R | 1870 +++++++++++++++++++++++++--- R/countIdentifications.R | 4 + R/peaks-functions.R | 8 +- man/MsBackend.Rd | 4 +- man/Spectra.Rd | 1889 ++--------------------------- man/countIdentifications.Rd | 3 + man/estimatePrecursorIntensity.Rd | 1 + man/estimatePrecursorMz.Rd | 3 + man/hidden_aliases.Rd | 27 - man/joinPeaks.Rd | 7 +- man/neutralLoss.Rd | 21 +- man/processingChunkSize.Rd | 10 +- 17 files changed, 1931 insertions(+), 1980 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 0b69bdaf..5ec6d054 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -5,42 +5,32 @@ NULL setMethod("bin", "numeric", MsCoreUtils::bin) setGeneric("combinePeaks", function(object, ...) standardGeneric("combinePeaks")) -#' @rdname hidden_aliases setGeneric("containsMz", function(object, ...) standardGeneric("containsMz")) -#' @rdname hidden_aliases setGeneric("containsNeutralLoss", function(object, ...) standardGeneric("containsNeutralLoss")) setGeneric("dataStorageBasePath", function(object, ...) standardGeneric("dataStorageBasePath")) setGeneric("dataStorageBasePath<-", function(object, ..., value) standardGeneric("dataStorageBasePath<-")) -#' @rdname hidden_aliases setGeneric("dropNaSpectraVariables", function(object, ...) standardGeneric("dropNaSpectraVariables")) -#' @rdname hidden_aliases setGeneric("entropy", function(object, ...) standardGeneric("entropy")) -#' @rdname hidden_aliases setGeneric("export", function(object, ...) standardGeneric("export")) setGeneric("filterFourierTransformArtefacts", function(object, ...) standardGeneric("filterFourierTransformArtefacts")) -#' @rdname neutralLoss setGeneric("neutralLoss", function(object, param, ...) standardGeneric("neutralLoss")) -#' @rdname hidden_aliases setGeneric("pickPeaks", function(object, ...) standardGeneric("pickPeaks")) setGeneric("plotSpectraMirror", function(x, y, ...) standardGeneric("plotSpectraMirror")) -#' @rdname hidden_aliases setGeneric("replaceIntensitiesBelow", function(object, threshold = min, ...) standardGeneric("replaceIntensitiesBelow")) -#' @rdname hidden_aliases setGeneric("reset", function(object, ...) standardGeneric("reset")) -#' @rdname hidden_aliases setGeneric("selectSpectraVariables", function(object, ...) standardGeneric("selectSpectraVariables")) setGeneric("Spectra", function(object, ...) standardGeneric("Spectra")) diff --git a/R/MsBackend.R b/R/MsBackend.R index eae122b1..b89e5303 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -702,7 +702,7 @@ #' #' The parameters are: #' - `object`: an instance of the `MsBackendMzR` class. 
-#' - `x`: the [Spectra-class] object to be exported. +#' - `x`: the [Spectra] object to be exported. #' - `file`: `character` with the (full) output file name(s). Should be #' of length 1 or equal `length(x)`. If a single file is specified, all #' spectra are exported to that file. Alternatively it is possible to specify @@ -715,7 +715,7 @@ #' backend and if `dataOrigin(x)` contains the original MS data file names. #' - `BPPARAM`: parallel processing settings. #' -#' See examples in [Spectra-class] or the vignette for more details and +#' See examples in [Spectra] or the vignette for more details and #' examples. #' #' The `MsBackendMzR` ignores parameter `columns` of the `peaksData()` diff --git a/R/Spectra-estimatePrecursorMz.R b/R/Spectra-estimatePrecursorMz.R index 72743d57..ad6ff630 100644 --- a/R/Spectra-estimatePrecursorMz.R +++ b/R/Spectra-estimatePrecursorMz.R @@ -55,6 +55,10 @@ #' #' @author Mar Garcia-Aloy, Johannes Rainer #' +#' @seealso +#' +#' [addProcessing()] for other data analysis and manipulation functions. +#' #' @export #' #' @examples diff --git a/R/Spectra-functions.R b/R/Spectra-functions.R index 517452c4..033a2b2d 100644 --- a/R/Spectra-functions.R +++ b/R/Spectra-functions.R @@ -207,7 +207,7 @@ NULL #' @export applyProcessing #' -#' @rdname Spectra +#' @rdname addProcessing applyProcessing <- function(object, f = processingChunkFactor(object), BPPARAM = bpparam(), ...) { queue <- object@processingQueue @@ -538,14 +538,14 @@ applyProcessing <- function(object, f = processingChunkFactor(object), #' @export concatenateSpectra #' -#' @rdname Spectra +#' @rdname combineSpectra concatenateSpectra <- function(x, ...) { .concatenate_spectra(unlist(unname(list(unname(x), ...)))) } #' @export combineSpectra #' -#' @rdname Spectra +#' @rdname combineSpectra combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage, FUN = combinePeaksData, ..., BPPARAM = bpparam()) { if (!is.factor(f)) @@ -622,7 +622,7 @@ combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage, #' @export joinSpectraData #' -#' @rdname Spectra +#' @rdname combineSpectra joinSpectraData <- function(x, y, by.x = "spectrumId", by.y, @@ -685,7 +685,7 @@ joinSpectraData <- function(x, y, #' @export #' -#' @rdname Spectra +#' @rdname addProcessing processingLog <- function(x) { x@processing } @@ -831,9 +831,7 @@ chunkapply <- function(x, FUN, ..., chunkSize = 1000L, chunks = factor()) { as.factor(rep(1:ceiling(len / chunkSize), each = chunkSize)[seq_len(len)]) } -#' @rdname Spectra -#' -#' @author Nir Shahaf, Johannes Rainer +#' @rdname filterMsLevel #' #' @export deisotopeSpectra <- @@ -845,9 +843,7 @@ deisotopeSpectra <- substDefinition = im, charge = charge) } -#' @rdname Spectra -#' -#' @author Nir Shahaf, Johannes Rainer +#' @rdname filterMsLevel #' #' @export reduceSpectra <- function(x, tolerance = 0, ppm = 20) { @@ -856,9 +852,7 @@ reduceSpectra <- function(x, tolerance = 0, ppm = 20) { addProcessing(x, .peaks_reduce, tolerance = tolerance, ppm = ppm) } -#' @rdname Spectra -#' -#' @author Nir Shahaf +#' @rdname filterMsLevel #' #' @export filterPrecursorMaxIntensity <- function(x, tolerance = 0, ppm = 20) { @@ -891,9 +885,7 @@ filterPrecursorMaxIntensity <- function(x, tolerance = 0, ppm = 20) { x } -#' @rdname Spectra -#' -#' @author Nir Shahaf +#' @rdname filterMsLevel #' #' @export filterPrecursorIsotopes <- @@ -926,9 +918,7 @@ filterPrecursorIsotopes <- x } -#' @rdname Spectra -#' -#' @author Johannes Rainer +#' @rdname addProcessing #' #' @export scalePeaks <- function(x, by = sum, 
msLevel. = uniqueMsLevels(x)) { @@ -941,7 +931,7 @@ scalePeaks <- function(x, by = sum, msLevel. = uniqueMsLevels(x)) { x } -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @export filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, @@ -992,6 +982,11 @@ filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, #' per file parallel processing if `f` or `chunkSize` is not defined. #' Other on-disk backends: only if requested by the user. #' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. +#' +#' @param object `Spectra` object. +#' #' @param x `Spectra` object. #' #' @param chunkSize `integer` defining the size of chunks into which `x` should @@ -1067,6 +1062,11 @@ filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, #' For these, the `backendBpparam()` function will always return a #' `SerialParam()` independently on how parallel processing was defined. #' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. +#' +#' @param object `Spectra` object. +#' #' @param x `Spectra`. #' #' @param value `integer(1)` defining the chunk size. diff --git a/R/Spectra-neutralLoss.R b/R/Spectra-neutralLoss.R index 53f3b2b5..dc9cf32c 100644 --- a/R/Spectra-neutralLoss.R +++ b/R/Spectra-neutralLoss.R @@ -87,6 +87,10 @@ setClassUnion("functionOrNull", c("function", "NULL")) #' Analysis in METLIN. Journal of the American Society for Mass Spectrometry. #' \doi{10.1021/jasms.1c00343} #' +#' @seealso +#' +#' [addProcessing()] for other data analysis and manipulation functions. +#' #' @examples #' #' ## Create a simple example Spectra object with some MS1, MS2 and MS3 spectra. diff --git a/R/Spectra.R b/R/Spectra.R index d3da9b44..179ee58c 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -9,12 +9,12 @@ NULL #' @title The Spectra class to manage and access MS data #' -#' @name Spectra-class +#' @name Spectra #' +#' @aliases Spectra-class #' @aliases Spectra #' @aliases setBackend #' @aliases export -#' @aliases dataStorageBasePath #' #' @description #' @@ -29,12 +29,52 @@ NULL #' #' Documentation on other topics and functionality of `Spectra`can be found in: #' -#' LLLLLLL add links to individual documentations. +#' - [spectraData()] for accessing and using MS data through `Spectra` objects. +#' - [filterMsLevel()] to subset and filter `Spectra` objects. +#' - [plotSpectra()] for visualization of `Spectra` orbjects. #' - [processingChunkSize()] for information on parallel and chunk-wise data #' processing. -#' - [plotSpectra()] for visualization of `Spectra`. -#' - [spectraData()] for accessing and using MS data through `Spectra`. +#' - [combineSpectra()] for merging, aggregating and splitting of `Spectra` +#' objects. +#' - [combinePeaks()] for merging and aggregating `Spectra`'s mass peaks data. +#' - [addProcessing()] for data analysis functions. +#' - [compareSpectra()] for spectra similarity calculations. +#' +#' @param backend For `Spectra()`: [MsBackend-class] to be used as backend. See +#' section on creation of `Spectra` objects for details. For `setBackend()`: +#' instance of [MsBackend-class] that supports `setBackend()` (i.e. for +#' which `supportsSetBackend()` returns `TRUE`). Such backends have a +#' parameter `data` in their `backendInitialize()` function that support +#' passing the full spectra data to the initialize method. See section on +#' creation of `Spectra` objects for details. +#' For `export()`: [MsBackend-class] to be used to export the data. 
#' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. +#' +#' @param f For `setBackend()`: factor defining how to split the data +#' for parallelized copying of the spectra data to the new backend. For +#' some backends changing this parameter can lead to errors. Defaults to +#' [processingChunkFactor()]. +#' +#' @param metadata For `Spectra()`: optional `list` with metadata information. +#' +#' @param object For `Spectra()`: an object to instantiate the `Spectra` +#' object and initialize the with data.. See section on creation of +#' `Spectra` objects for details. For all other methods a `Spectra` object. +#' +#' @param processingQueue For `Spectra()`: optional `list` of +#' [ProcessingStep-class] objects. +#' +#' @param source For `Spectra()`: instance of [MsBackend-class] that can be +#' used to import spectrum data from the provided files. See section +#' *Creation of objects* for more details. +#' +#' @param value For `dataStorageBasePath()`: A `character` vector that defines +#' the base directory where the data storage files can be found. +#' +#' @param ... Additional arguments. #' #' @section Data stored in a `Spectra` object: #' @@ -175,11 +215,12 @@ NULL #' The `Spectra` class uses by default a lazy data manipulation strategy, #' i.e. data manipulations such as performed with `replaceIntensitiesBelow()` #' are not applied immediately to the data, but applied on-the-fly to the -#' spectrum data once it is retrieved. For some backends that allow to write -#' data back to the data storage (such as the [MsBackendMemory()], -#' [MsBackendDataFrame()] and [MsBackendHdf5Peaks()]) it is possible to apply -#' to queue with the `applyProcessing` function. See the *Data manipulation and -#' analysis *methods* section below for more details. +#' spectrum data once it is retrieved. This enables data manipulation +#' operations also for *read only* data representations. For some backends that +#' allow to write data back to the data storage (such as the +#' [MsBackendMemory()], [MsBackendDataFrame()] and [MsBackendHdf5Peaks()]) it +#' is possible to apply to queue with the [applyProcessing()] function (see +#' the [applyProcessing()] function for details). #' #' Clarifications regarding scan/acquisition numbers and indices: #' @@ -208,7 +249,7 @@ NULL #' #' @examples #' -#' ## ---- CREATION OF SPECTRA OBJECTS ---- +#' ## -------- CREATION OF SPECTRA OBJECTS -------- #' #' ## Create a Spectra providing a `DataFrame` containing the spectrum data. #' @@ -227,7 +268,7 @@ NULL #' sciex #' #' -#' ## ---- CHANGING DATA REPRESENTATIONS ---- +#' ## -------- CHANGING DATA REPRESENTATIONS -------- #' #' ## The MS data is on disk and will be read into memory on-demand. We can #' ## however change the backend to a MsBackendMemory backend which will @@ -264,7 +305,7 @@ NULL #' head(dataOrigin(sciex_im)) #' #' -#' ## ---- DATA EXPORT ---- +#' ## -------- DATA EXPORT -------- #' #' ## Some `MsBackend` classes provide an `export()` method to export the data #' ## to the file format supported by the backend. @@ -293,10 +334,11 @@ NULL #' #' mz(res) #' mz(data) +NULL #' The Spectra class #' -#' The [Spectra-class] encapsulates data and meta-data for mass +#' The [Spectra] class encapsulates data and meta-data for mass #' spectrometry experiments. 
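As a brief, hypothetical illustration of the `dataStorageBasePath()` methods for `Spectra` defined just below: the replacement path used here is invented, and the assignment is therefore shown commented out, since it would fail unless all raw data files are actually present in that directory.

library(Spectra)
fls <- dir(system.file("sciex", package = "msdata"), full.names = TRUE)
sps <- Spectra(fls, backend = MsBackendMzR())

## Common base directory of the raw files backing this Spectra object.
dataStorageBasePath(sps)

## After moving the mzML files (e.g. when a serialized object is loaded
## on another computer) the base path could be updated; the path below is
## a made-up example.
## dataStorageBasePath(sps) <- "/new/location/of/mzML/files"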
#' #' @slot backend A derivate of [MsBackend-class] holding/controlling the spectra @@ -378,7 +420,7 @@ setMethod("show", "Spectra", } }) -#' @rdname Spectra-class +#' @rdname Spectra setMethod("Spectra", "missing", function(object, processingQueue = list(), metadata = list(), ..., backend = MsBackendMemory(), @@ -389,7 +431,7 @@ setMethod("Spectra", "missing", function(object, processingQueue = list(), else callNextMethod() }) -#' @rdname Spectra-class +#' @rdname Spectra setMethod("Spectra", "MsBackend", function(object, processingQueue = list(), metadata = list(), ..., BPPARAM = bpparam()) { @@ -397,7 +439,7 @@ setMethod("Spectra", "MsBackend", function(object, processingQueue = list(), backend = object) }) -#' @rdname Spectra-class +#' @rdname Spectra #' #' @importFrom methods callNextMethod setMethod("Spectra", "character", function(object, processingQueue = list(), @@ -413,7 +455,7 @@ setMethod("Spectra", "character", function(object, processingQueue = list(), else sp }) -#' @rdname Spectra-class +#' @rdname Spectra setMethod("Spectra", "ANY", function(object, processingQueue = list(), metadata = list(), source = MsBackendMemory(), @@ -439,7 +481,7 @@ setMethod("Spectra", "ANY", function(object, processingQueue = list(), backend = backend) } -#' @rdname Spectra-class +#' @rdname Spectra #' #' @importMethodsFrom ProtGenerics setBackend #' @@ -488,7 +530,7 @@ setMethod( object }) -#' @rdname Spectra-class +#' @rdname Spectra #' #' @export setMethod("export", "Spectra", @@ -498,12 +540,12 @@ setMethod("export", "Spectra", export(backend, object, ...) }) -#' @rdname Spectra-class +#' @rdname Spectra setMethod("dataStorageBasePath", "Spectra", function(object) { dataStorageBasePath(object@backend) }) -#' @rdname Spectra-class +#' @rdname Spectra setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { dataStorageBasePath(object@backend) <- value object @@ -524,6 +566,7 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' @aliases collisionEnergy #' @aliases dataOrigin #' @aliases dataStorage +#' @aliases intensity #' @aliases ionCount #' @aliases isCentroided #' @aliases isEmpty @@ -550,7 +593,7 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' #' @description #' -#' As detailed in the documentation of the [Spectra-class], a `Spectra` object +#' As detailed in the documentation of the [Spectra] class, a `Spectra` object #' is a container for mass spectrometry (MS) data that includes both the mass #' peaks data (or *peaks data*, generally *m/z* and intensity values) as well #' as spectra metadata (so called *spectra variables*). Spectra variables @@ -563,6 +606,52 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' used by the `Spectra` to represent the data, data can also be added or #' replaced (again, using dedicated functions or using `$<-`). #' +#' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. See also [processingChunkSize()] for more information +#' on parallel processing. +#' +#' @param columns For `spectraData()` accessor: optional `character` with +#' column names (spectra variables) that should be included in the +#' returned `DataFrame`. By default, all columns are returned. +#' For `peaksData()` accessor: optional `character` with requested columns +#' in the individual `matrix` of the returned `list`. 
Defaults to +#' `c("mz", "value")` but any values returned by `peaksVariables(object)` +#' with `object` being the `Spectra` object are supported. +#' +#' @param f For `intensity()`, `mz()` and `peaksData()`: factor defining how +#' data should be chunk-wise loaded an processed. Defaults to +#' [processingChunkFactor()]. +#' +#' @param i For `asDataFrame()`: A `numeric` indicating which scans to coerce +#' to a `DataFrame` (default is `seq_along(object)`). +#' +#' @param initial For `tic()`: `logical(1)` whether the initially +#' reported total ion current should be reported, or whether the +#' total ion current should be (re)calculated on the actual data +#' (`initial = FALSE`, same as `ionCount()`). +#' +#' @param j For `[`: not supported. +#' +#' @param name For `$` and `$<-`: the name of the spectra variable to return +#' or set. +#' +#' @param object A `Spectra` object. +#' +#' @param spectraVars `character()` indicating what spectra variables to add to +#' the `DataFrame`. Default is `spectraVariables(object)`, i.e. all +#' available variables. +#' +#' @param use.names For `lengths()`: ignored. +#' +#' @param value A vector with values to replace the respective spectra +#' variable. Needs to be of the correct data type for the spectra variable. +#' +#' @param x A `Spectra` object. +#' +#' @param ... Additional arguments. +#' +#' #' @section Spectra variables: #' #' A common set of *core spectra variables* are defined for `Spectra`. These @@ -642,7 +731,7 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' The set of available functions to extract data from, or set data in, a #' `Spectra` object are (in alphabetical order) listed below. Note that there #' are also other functions to extract information from a `Spectra` object -#' documented LLLLLLL +#' documented in [addProcessing()]. #' #' - `$`, `$<-`: gets (or sets) a spectra variable for all spectra in `object`. #' See examples for details. Note that replacing values of a peaks variable @@ -651,10 +740,16 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' [applyProcessing()] needs to be called first to apply all cached data #' operations. #' +#' - `[[`, `[[<-`: access or set/add a single spectrum variable (column) in the +#' backend. +#' #' - `acquisitionNum()`: returns the acquisition number of each #' spectrum. Returns an `integer` of length equal to the number of #' spectra (with `NA_integer_` if not available). #' +#' - `asDataFrame()`: converts the `Spectra` to a `DataFrame` (in long format) +#' contining all data. Returns a `DataFrame`. +#' #' - `centroided()`, `centroided<-`: gets or sets the centroiding #' information of the spectra. `centroided()` returns a `logical` #' vector of length equal to the number of spectra with `TRUE` if a @@ -814,9 +909,16 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' #' @md #' +#' @seealso +#' +#' - [addProcessing()] for functions to analyze `Spectra`. +#' +#' - [Spectra] for a general description of the `Spectra` object. +#' #' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail #' #' @examples +#' #' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk #' ## backend. #' sciex_file <- dir(system.file("sciex", package = "msdata"), @@ -854,17 +956,6 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' s <- Spectra(spd) #' s #' -#' ## Get the peak data (m/z and intensity values). 
-#' pks <- peaksData(s) -#' pks -#' pks[[1]] -#' pks[[2]] -#' -#' ## Note that we could get the same resulb by coercing the `Spectra` to -#' ## a `list` or `SimpleList`: -#' as(data, "list") -#' as(data, "SimpleList") -#' #' ## List all available spectra variables (i.e. spectrum data and metadata). #' spectraVariables(s) #' @@ -888,10 +979,26 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' s$spectrum_id #' #' ## Extract specific spectra variables. -#' spectraData(s columns = c("spectrum_id", "msLevel")) +#' spectraData(s, columns = c("spectrum_id", "msLevel")) +#' #' +#' ## -------- PEAKS VARIABLES AND DATA -------- +#' +#' ## Get the peak data (m/z and intensity values). +#' pks <- peaksData(s) +#' pks +#' pks[[1]] +#' pks[[2]] +#' +#' ## Note that we could get the same resulb by coercing the `Spectra` to +#' ## a `list` or `SimpleList`: +#' as(s, "list") +#' as(s, "SimpleList") #' -#' ## ---- PEAKS VARIABLES AND DATA ---- +#' ## Or use `mz()` and `intensity()` to extract the m/z and intensity values +#' ## separately +#' mz(s) +#' intensity(s) #' #' ## Some `MsBackend` classes provide support for arbitrary peaks variables #' ## (in addition to the mandatory `"mz"` and `"intensity"` values. Below @@ -919,6 +1026,8 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' #' ## Access just the pk_ann variable #' sps$pk_ann +#' +#' NULL #' @importFrom methods setAs @@ -933,10 +1042,6 @@ setAs("Spectra", "SimpleList", function(from, to) { #' @export #' #' @rdname spectraData -#' -#' @param spectraVars `character()` indicating what spectra variables to add to -#' the `DataFrame`. Default is `spectraVariables(object)`, i.e. all -#' available variables. asDataFrame <- function(object, i = seq_along(object), spectraVars = spectraVariables(object)) { stopifnot(inherits(object, "Spectra")) @@ -1285,6 +1390,38 @@ setReplaceMethod("$", "Spectra", function(x, name, value) { x }) +#' @rdname spectraData +#' +#' @export +setMethod("[[", "Spectra", function(x, i, j, ...) { + if (!is.character(i)) + stop("'i' is supposed to be a character defining the spectra ", + "variable to access.") + if (!missing(j)) + stop("'j' is not supported.") + if (!(i %in% c(spectraVariables(x), "mz", "intensity"))) + stop("No spectra variable '", i, "' available") + if (i == "mz") + mz(x) + else if (i == "intensity") + intensity(x) + else + do.call("[[", list(x@backend, i)) +}) + +#' @rdname spectraData +#' +#' @export +setReplaceMethod("[[", "Spectra", function(x, i, j, ..., value) { + if (!is.character(i)) + stop("'i' is supposed to be a character defining the spectra ", + "variable to replace or create.") + if (!missing(j)) + stop("'j' is not supported.") + x@backend <- do.call("[[<-", list(x@backend, i = i, value = value)) + x +}) + ################################################################################ ## @@ -1292,18 +1429,236 @@ setReplaceMethod("$", "Spectra", function(x, name, value) { ## ################################################################################ -#' @title Merging, splitting and aggregating Spectra +#' @title Merging, aggregating and splitting Spectra #' -#' @rdname Spectra +#' @name combineSpectra +#' +#' @aliases combineSpectra +#' @aliases split +#' @aliases joinSpectraData +#' +#' @description +#' +#' Various functions are availabe to combine, aggregate or split data from one +#' of more `Spectra` objects. These are: +#' +#' - `c()` and `concatenateSpectra()`: combines several `Spectra` objects into +#' a single object. 
The resulting `Spectra` contains all data from all +#' individual `Spectra`, i.e. the union of all their spectra variables. +#' Concatenation will fail if the processing queue of any of the `Spectra` +#' objects is not empty or if different backends are used for the `Spectra` +#' objects. In such cases it is suggested to first change the backends of +#' all `Spectra` to the same type of backend (using the [setBackend()] +#' function and to eventually (if needed) apply the processing queue using +#' the [applyProcessing()] function. +#' +#' - `combineSpectra()`: combines sets of spectra (defined with parameter `f`) +#' into a single spectrum per set aggregating their MS data (i.e. their +#' *peaks data* matrices with the *m/z* and intensity values of their +#' mass peaks). The spectra variable values of the first spectrum per set +#' are reported for the combined spectrum. The peak matrices of the spectra +#' per set are combined using the function specified with parameter `FUN` +#' which uses by default the [combinePeaksData()] function. See the +#' documentation of [combinePeaksData()] for details on the aggregation of +#' the peak data and the package vignette for examples. +#' The sets of spectra can be specified with parameter `f` which is expected +#' to be a `factor` or `vector` of length equal to the length of the +#' `Spectra` specifying to which set a spectrum belongs to. The function +#' returns a `Spectra` of length equal to the unique levels of `f`. The +#' optional parameter `p` allows to define how the `Spectra` should be +#' split for potential parallel processing. The default is +#' `p = x$dataStorage` and hence a per storage file parallel processing is +#' applied for `Spectra` with on disk data representations (such as the +#' [MsBackendMzR()]). This also prevents that spectra from different data +#' files/samples are combined (eventually use e.g. `p = x$dataOrigin` or any +#' other spectra variables defining the originating samples for a spectrum). +#' Before combining the peaks data, all eventual present processing steps are +#' applied (by calling [applyProcessing()] on the `Spectra`). This function +#' will replace the original *m/z* and intensity values of a `Spectra` hence +#' it can not be called on a `Spectra` with a *read-only* backend. In such +#' cases, the backend should be changed to a *writeable* backend before +#' using the [setBackend()] function (to e.g. a [MsBackendMemory()] backend). +#' +#' - `joinSpectraData()`: Individual spectra variables can be directly +#' added with the `$<-` or `[[<-` syntax. The `joinSpectraData()` +#' function allows to merge a `DataFrame` to the existing spectra +#' data of a `Spectra`. This function diverges from the [merge()] method in +#' two main ways: +#' - The `by.x` and `by.y` column names must be of length 1. +#' - If variable names are shared in `x` and `y`, the spectra +#' variables of `x` are not modified. It's only the `y` +#' variables that are appended with the suffix defined in +#' `suffix.y`. This is to avoid modifying any core spectra +#' variables that would lead to an invalid object. +#' - Duplicated Spectra keys (i.e. `x[[by.x]]`) are not +#' allowed. Duplicated keys in the `DataFrame` (i.e `y[[by.y]]`) +#' throw a warning and only the last occurrence is kept. These +#' should be explored and ideally be removed using for +#' `QFeatures::reduceDataFrame()`, `PMS::reducePSMs()` or similar +#' functions. 
+#' +#' - `split()`: splits the `Spectra` object based on parameter `f` into a `list` +#' of `Spectra` objects. +#' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. +#' +#' @param by.x A `character(1)` specifying the spectra variable used +#' for merging. Default is `"spectrumId"`. +#' +#' @param by.y A `character(1)` specifying the column used for +#' merging. Set to `by.x` if missing. +#' +#' @param drop For `split()`: not considered. +#' +#' @param f For `split()`: factor defining how to split `x`. See [base::split()] +#' for details. +#' For `combineSpectra()`: `factor` defining the grouping of the spectra +#' that should be combined. Defaults to `x$dataStorage`. +#' +#' @param FUN For `combineSpectra()`: function to combine the (peak matrices) +#' of the spectra. Defaults to [combinePeaksData()]. +#' +#' @param p For `combineSpectra()`: `factor` defining how to split the input +#' `Spectra` for parallel processing. Defaults to `x$dataStorage`, i.e., +#' depending on the used backend, per-file parallel processing will be +#' performed. +#' +#' @param suffix.y A `character(1)` specifying the suffix to be used +#' for making the names of columns in the merged spectra variables +#' unique. This suffix will be used to amend `names(y)`, while +#' `spectraVariables(x)` will remain unchanged. +#' +#' @param x A `Spectra` object. +#' +#' @param y A `DataFrame` with the spectra variables to join/add. +#' +#' @param ... Additional arguments. +#' +#' @seealso +#' +#' - [combinePeaks()] for functions to aggregate mass peaks data. +#' +#' - [Spectra] for a general description of the `Spectra` object. #' #' @importFrom MsCoreUtils vapply1c #' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' +#' @examples +#' +#' ## Create a Spectra providing a `DataFrame` containing a MS data. +#' +#' spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +#' spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +#' spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) +#' +#' s <- Spectra(spd) +#' s +#' +#' ## Create a second Spectra from mzML files and use the `MsBackendMzR` +#' ## on-disk backend. +#' sciex_file <- dir(system.file("sciex", package = "msdata"), +#' full.names = TRUE) +#' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +#' sciex +#' +#' ## Subset to the first 100 spectra to reduce running time of the examples +#' sciex <- sciex[1:100] +#' +#' +#' ## -------- COMBINE SPECTRA -------- +#' +#' ## Combining the `Spectra` object `s` with the MS data from `sciex`. +#' ## Calling directly `c(s, sciex)` would result in an error because +#' ## both backends use a different backend. We thus have to first change +#' ## the backends to the same backend. We change the backend of the `sciex` +#' ## `Spectra` to a `MsBackendMemory`, the backend used by `s`. 
+#' +#' sciex <- setBackend(sciex, MsBackendMemory()) +#' +#' ## Combine the two `Spectra` +#' all <- c(s, sciex) +#' all +#' +#' ## The new `Spectra` objects contains the union of spectra variables from +#' ## both: +#' spectraVariables(all) +#' +#' ## The spectra variables that were not present in `s`: +#' setdiff(spectraVariables(all), spectraVariables(s)) +#' +#' ## The values for these were filled with missing values for spectra from +#' ## `s`: +#' all$peaksCount |> head() +#' +#' +#' ## -------- AGGREGATE SPECTRA -------- +#' +#' ## Sets of spectra can be combined into a single, representative spectrum +#' ## per set using `combineSpectra()`. This aggregates the peaks data (i.e. +#' ## the spectra's m/z and intensity values) while using the values for all +#' ## spectra variables from the first spectrum per set. Below we define the +#' ## sets as all spectra measured in the *same second*, i.e. rounding their +#' ## retention time to the next closer integer value. +#' f <- round(rtime(sciex)) +#' head(f) +#' +#' cmp <- combineSpectra(sciex, f = f) +#' +#' ## The length of `cmp` is now equal to the length of unique levels in `f`: +#' length(cmp) +#' +#' ## The spectra variable value from the first spectrum per set is used in +#' ## the representative/combined spectrum: +#' cmp$rtime +#' +#' ## The peaks data was aggregated: the number of mass peaks of the first six +#' ## spectra from the original `Spectra`: +#' lengths(sciex) |> head() +#' +#' ## and for the first aggreagated spectra: +#' lengths(cmp) |> head() +#' +#' ## The default peaks data aggregation method joins all mass peaks. See +#' ## documentation of the `combinePeaksData()` function for more options. +#' +#' +#' ## -------- SPLITTING DATA -------- +#' +#' ## A `Spectra` can be split into a `list` of `Spectra` objects using the +#' ## `split()` function defining the sets into which the `Spectra` should +#' ## be splitted into with parameter `f`. +#' sciex_split <- split(sciex, f) +#' +#' length(sciex_split) +#' sciex_split |> head() +#' +#' +#' ## -------- ADDING SPECTRA DATA -------- +#' +#' ## Adding new spectra variables +#' sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) +#' spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging +#' var1 = rnorm(10), +#' var2 = sample(letters, 10)) +#' spv +#' +#' sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") +#' +#' spectraVariables(sciex2) +#' spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] +NULL + +#' @rdname combineSpectra +#' #' @exportMethod c setMethod("c", "Spectra", function(x, ...) { .concatenate_spectra(unname(list(unname(x), ...))) }) -#' @rdname Spectra +#' @rdname combineSpectra setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) { bcknds <- split(x@backend, f, ...) lapply(bcknds, function(b) { @@ -1313,77 +1668,111 @@ setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) { }) - ################################################################################ ## -## Filtering, subsetting Spectra: subsetting Spectra and its data content. 
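A minimal additional sketch, re-using objects like the `s` and `sciex` variables from the examples above (after both share the same in-memory backend): `concatenateSpectra()` is the functional equivalent of `c()`, and the result can be split again, e.g. by MS level, into a `list` of `Spectra` objects.

## Concatenate the two Spectra objects (equivalent to c(s, sciex)).
all2 <- concatenateSpectra(s, sciex)
length(all2)

## Split the concatenated object by MS level into a list of Spectra.
sp_list <- split(all2, f = factor(msLevel(all2)))
vapply(sp_list, length, integer(1))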
+## Aggregating peaks data ## ################################################################################ -#' @title Filtering and subsetting Spectra objects +#' @title Aggregating and combining mass peaks data #' -#' @aliases [,Spectra-method - -#' @rdname Spectra -setMethod("dropNaSpectraVariables", "Spectra", function(object) { - object@backend <- dropNaSpectraVariables(object@backend) - object -}) - -#' @rdname Spectra -setMethod( - "selectSpectraVariables", "Spectra", - function(object, spectraVariables = union(spectraVariables(object), - peaksVariables(object))) { - spectraVariables <- union(spectraVariables, "dataStorage") - object@backend <- selectSpectraVariables( - object@backend, spectraVariables = spectraVariables) - object - }) - - -#' @rdname Spectra +#' @name combinePeaks #' -#' @export -setMethod("[[", "Spectra", function(x, i, j, ...) { - if (!is.character(i)) - stop("'i' is supposed to be a character defining the spectra ", - "variable to access.") - if (!missing(j)) - stop("'j' is not supported.") - if (!(i %in% c(spectraVariables(x), "mz", "intensity"))) - stop("No spectra variable '", i, "' available") - if (i == "mz") - mz(x) - else if (i == "intensity") - intensity(x) - else - do.call("[[", list(x@backend, i)) -}) - -#' @rdname Spectra +#' @description #' -#' @export -setReplaceMethod("[[", "Spectra", function(x, i, j, ..., value) { - if (!is.character(i)) - stop("'i' is supposed to be a character defining the spectra ", - "variable to replace or create.") - if (!missing(j)) - stop("'j' is not supported.") - x@backend <- do.call("[[<-", list(x@backend, i = i, value = value)) - x -}) - - -#' @rdname Spectra -setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { - if (!missing(j)) - stop("Subsetting 'Spectra' by columns is not (yet) supported") - if (missing(i)) - return(x) - slot(x, "backend", check = FALSE) <- x@backend[i = i] - x -}) +#' In addition to aggregating content of spectra variables (describe in +#' [combineSpectra()]) it is also possible to aggregate and combine mass peaks +#' data from individual spectra within a `Spectra`. These `combinePeaks()` +#' function combines mass peaks **within each spectrum** with a difference in +#' their m/z values that is smaller than the maximal acceptable difference +#' defined by `ppm` and `tolerance`. Parameters `intensityFun` and `mzFun` +#' allow to define functions to aggregate the intensity and m/z values for +#' each such group of peaks. With `weighted = TRUE` (the default), the m/z +#' value of the combined peak is calculated using an intensity-weighted mean +#' and parameter `mzFun` is ignored. The [MsCoreUtils::group()] function is +#' used for the grouping of mass peaks. Parameter `msLevel.` allows to define +#' selected MS levels for which peaks should be combined. This function +#' returns a `Spectra` with the same number of spectra than the input object, +#' but with possibly combined peaks within each spectrum. +#' Additional peak variables (other than `"mz"` and `"intensity"`) are +#' dropped (i.e. their values are replaced with `NA`) for combined peaks +#' unless they are constant across the combined peaks. See also +#' [reduceSpectra()] for a function to select a single *representative* +#' mass peak for each peak group. +#' +#' @param intensityFun Function to aggregate intensities for all peaks in +#' each peak group into a single intensity value. +#' +#' @param msLevel. 
`integer` defining the MS level(s) of the spectra to which +#' the function should be applied (defaults to all MS levels of `object`. +#' +#' @param mzFun Function to aggregate m/z values for all mass peaks within +#' each peak group into a single m/z value. This parameter is ignored if +#' `weighted = TRUE` (the default). +#' +#' @param object A `Spectra` object. +#' +#' @param ppm `numeric(1)` defining a relative, m/z-dependent, maximal +#' accepted difference between m/z values for peaks to be grouped. Default +#' is `ppm = 20`. +#' +#' @param tolerance `numeric(1)` allowing to define a constant maximal +#' accepted difference between m/z values for peaks to be grouped. Default +#' is `tolerance = 0`. +#' +#' @param weighted `logical(1)` whether m/z values of peaks within each peak +#' group should be aggregated into a single m/z value using an +#' intensity-weighted mean. Defaults to `weighted = TRUE`. +#' +#' @param ... ignored. +#' +#' @md +#' +#' @seealso +#' +#' - [combineSpectra()] for functions to combine or aggregate `Spectra`'s +#' spectra data. +#' +#' - [combinePeaksData()] for the function to combine the mass peaks data. +#' +#' - [reduceSpectra()] and similar functions to filter mass peaks data. +#' +#' - [Spectra] for a general description of the `Spectra` object. +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' +#' @examples +#' +#' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +#' ## backend. +#' sciex_file <- dir(system.file("sciex", package = "msdata"), +#' full.names = TRUE) +#' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +#' +#' ## Combine mass peaks per spectrum with a difference in their m/z value +#' ## that is smaller than 20 ppm. The intensity values of such peaks are +#' ## combined by summing their values, while for the m/z values the median +#' ## is reported +#' sciex_comb <- combinePeaks(sciex, ppm = 20, +#' intensityFun = sum, mzFun = median) +#' +#' ## Comparing the number of mass peaks before and after aggregation +#' lengths(sciex) |> head() +#' lengths(sciex_comb) |> head() +#' +#' ## Plotting the first spectrum before and after aggregation +#' par(mfrow = c(1, 2)) +#' plotSpectra(sciex[2L]) +#' plotSpectra(sciex_comb[2L]) +#' +#' ## Using `reduceSpectra()` to keep for each group of mass peaks with a +#' ## difference in their m/z values < 20ppm the one with the highest intensity. +#' sciex_red <- reduceSpectra(sciex, ppm = 20) +#' +#' ## Comparing the number of mass peaks before and after the operation +#' lengths(sciex) |> head() +#' lengths(sciex_red) |> head() +NULL #' @rdname hidden_aliases setMethod("combinePeaks", "list", function(object, ...) { @@ -1394,7 +1783,7 @@ setMethod("combinePeaks", "list", function(object, ...) { combinePeaksData(object, ...) }) -#' @rdname Spectra +#' @rdname combinePeaks #' #' @exportMethod combinePeaks setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, @@ -1413,7 +1802,620 @@ setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, object }) -#' @rdname Spectra + +################################################################################ +## +## Filtering, subsetting Spectra: subsetting Spectra and its data content. 
+## +################################################################################ + +#' @title Filter and subset Spectra objects +#' +#' @name filterMsLevel +#' +#' @aliases [,Spectra-method +#' @aliases filterAcquisitionNum +#' @aliases filterDataOrigin +#' @aliases filterDataStorage +#' @aliases filterEmptySpectra +#' @aliases filterIsolationWindow +#' @aliases filterMsLevel +#' @aliases filterPolarity +#' @aliases filterPrecursorCharge +#' @aliases filterPrecursorIsotopes +#' @aliases filterPrecursorMzRange +#' @aliases filterPrecursorMzValues +#' @aliases filterPrecursorScan +#' @aliases filterRanges +#' @aliases filterRt +#' @aliases filterValues +#' @aliases dropNaSpectraVariables +#' @aliases selectSpectraVariables +#' @aliases filterIntensity +#' @aliases filterMzRange +#' @aliases filterMzValues +#' @aliases reduceSpectra +#' +#' @description +#' +#' A variety of functions to filter or subset `Spectra` objects are available. +#' These can be generally separated into two main classes: I) *classical* +#' subset operations that immediately reduce the number of spectra in the +#' object and II) filters that reduce the **content** of the object without +#' changing its length (i.e. the number of spectra). The latter can be further +#' subdivided into functions that affect the content of the `spectraData` (i.e. +#' the general spectrum metadata) and those that reduce the content of the +#' object's `peaksData` (i.e. the m/z and intensity values of a spectrum's +#' mass peaks). +#' +#' A description of functions from these 3 different categories are given below +#' in sections *Subset `Spectra`*, *Filter content of `spectraData()`* and +#' *Filter content of `peaksData()`*, respectively. +#' +#' +#' @section Subset `Spectra`: +#' +#' These functions affect the number of spectra in a `Spectra` object creating +#' a subset of the original object without affecting its content. +#' +#' - `[`: subsets the spectra keeping only selected elements (`i`). The method +#' **always** returns a `Spectra` object. +#' +#' - `filterAcquisitionNum()`: filters the object keeping only spectra matching +#' the provided acquisition numbers (argument `n`). If `dataOrigin` or +#' `dataStorage` is also provided, `object` is subsetted to the spectra with +#' an acquisition number equal to `n` **in spectra with matching dataOrigin +#' or dataStorage values** retaining all other spectra. +#' Returns the filtered `Spectra`. +#' +#' - `filterDataOrigin()`: filters the object retaining spectra matching the +#' provided `dataOrigin`. Parameter `dataOrigin` has to be of type +#' `character` and needs to match exactly the data origin value of the +#' spectra to subset. +#' Returns the filtered `Spectra` object (with spectra ordered according to +#' the provided `dataOrigin` parameter). +#' +#' - `filterDataStorage()`: filters the object retaining spectra stored in the +#' specified `dataStorage`. Parameter `dataStorage` has to be of type +#' `character` and needs to match exactly the data storage value of the +#' spectra to subset. +#' Returns the filtered `Spectra` object (with spectra ordered according to +#' the provided `dataStorage` parameter). +#' +#' - `filterEmptySpectra()`: removes empty spectra (i.e. spectra without peaks). +#' Returns the filtered `Spectra` object (with spectra in their +#' original order). +#' +#' - `filterIsolationWindow()`: retains spectra that contain `mz` in their +#' isolation window m/z range (i.e. with an `isolationWindowLowerMz` <= `mz` +#' and `isolationWindowUpperMz` >= `mz`. 
Returns the filtered `Spectra` +#' object (with spectra in their original order). +#' +#' - `filterMsLevel()`: filters object by MS level keeping only spectra matching +#' the MS level specified with argument `msLevel`. Returns the filtered +#' `Spectra` (with spectra in their original order). +#' +#' - `filterPolarity()`: filters the object keeping only spectra matching the +#' provided polarity. Returns the filtered `Spectra` (with spectra in their +#' original order). +#' +#' - `filterPrecursorCharge()`: retains spectra with the defined precursor +#' charge(s). +#' +#' - `filterPrecursorIsotopes()`: groups MS2 spectra based on their precursor +#' m/z and precursor intensity into predicted isotope groups and keep for each +#' only the spectrum representing the monoisotopic precursor. MS1 spectra +#' are returned as is. See documentation for `deisotopeSpectra()` below for +#' details on isotope prediction and parameter description. +#' +#' - `filterPrecursorMaxIntensity()`: filters the `Spectra` keeping for groups +#' of (MS2) spectra with similar precursor m/z values (given parameters +#' `ppm` and `tolerance`) the one with the highest precursor intensity. The +#' function filters only MS2 spectra and returns all MS1 spectra. If +#' precursor intensities are `NA` for all spectra within a spectra group, the +#' first spectrum of that groups is returned. +#' Note: some manufacturers don't provide precursor intensities. These can +#' however also be estimated with [estimatePrecursorIntensity()]. +#' +#' - `filterPrecursorMzRange()` (previously `filterPrecursorMz()` which is now +#' deprecated): retains spectra with a precursor m/z within the +#' provided m/z range. See examples for details on selecting spectra with +#' a precursor m/z for a target m/z accepting a small difference in *ppm*. +#' +#' - `filterPrecursorMzValues()`: retains spectra with precursor m/z matching +#' any of the provided m/z values (given `ppm` and `tolerance`). Spectra with +#' missing precursor m/z value (e.g. MS1 spectra) are dropped. +#' +#' - `filterPrecursorScan()`: retains parent (e.g. MS1) and children scans (e.g. +#' MS2) of acquisition number `acquisitionNum`. Returns the filtered +#' `Spectra` (with spectra in their original order). Parameter `f` allows to +#' define which spectra belong to the same sample or original data file ( +#' defaults to `f = dataOrigin(object)`). +#' +#' - `filterRanges()`: allows filtering of the `Spectra` object based on user +#' defined *numeric* ranges (parameter `ranges`) for one or more available +#' spectra variables in object (spectra variable names can be specified with +#' parameter `spectraVariables`). Spectra for which the value of a spectra +#' variable is within it's defined range are retained. If multiple +#' ranges/spectra variables are defined, the `match` parameter can be used +#' to specify whether all conditions (`match = "all"`; the default) or if +#' any of the conditions must match (`match = "any"`; all spectra for which +#' values are within any of the provided ranges are retained). +#' +#' - `filterRt()`: retains spectra of MS level `msLevel` with retention +#' times (in seconds) within (`>=`) `rt[1]` and (`<=`) +#' `rt[2]`. Returns the filtered `Spectra` (with spectra in their +#' original order). 
+#' +#' - `filterValues()`: allows filtering of the `Spectra` object based on +#' similarities of *numeric* values of one or more `spectraVariables(object)` +#' (parameter `spectraVariables`) to provided values (parameter `values`) +#' given acceptable differences (parameters tolerance and ppm). If multiple +#' values/spectra variables are defined, the `match` parameter can be used +#' to specify whether all conditions (`match = "all"`; the default) or if +#' any of the conditions must match (`match = "any"`; all spectra for which +#' values are within any of the provided ranges are retained). +#' +#' +#' @section Filter content of `spectraData()`: +#' +#' The functions described in this section filter the content from a +#' `Spectra`'s spectra data, i.e. affect values of, or complete, spectra +#' variables. None of these functions reduces the object's number of spectra. +#' +#' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the +#' object's `spectraData` that contain only missing values (`NA`). Note that +#' while columns with only `NA`s are removed, a `spectraData()` call after +#' `dropNaSpectraVariables()` might still show columns containing `NA` values +#' for *core* spectra variables. The total number of spectra is not changed +#' by this function. +#' +#' - `selectSpectraVariables()`: reduces the information within the object to +#' the selected spectra variables: all data for variables not specified will +#' be dropped. For mandatory columns (i.e., those listed by +#' [coreSpectraVariables()], such as *msLevel*, *rtime* ...) only +#' the values will be dropped but not the variable itself. Additional (or +#' user defined) spectra variables will be completely removed. +#' Returns the filtered `Spectra`. +#' +#' +#' @section Filter content of `peaksData()`: +#' +#' The functions described in this section filter the content of the +#' `Spectra`'s peaks data, i.e. either the number or the values (*m/z* or +#' intensity values) of the mass peaks. Also, the actual operation is only +#' executed once peaks data is accessed (through `peaksData()`, +#' `mz()` or `intensity()`) or `applyProcessing()` is called. +#' These operations don't affect the number of spectra in the `Spectra` object. +#' +#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the +#' monoisotopic peak for groups of isotopologues. Isotopologues are +#' estimated using the [isotopologues()] function from the +#' *MetaboCoreUtils* package. Note that +#' the default parameters for isotope prediction/detection have been +#' determined using data from the Human Metabolome Database (HMDB) and +#' isotopes for elements other than CHNOPS might not be detected. See +#' parameter `substDefinition` in the documentation of [isotopologues()] for +#' more information. The approach and code to define the parameters for +#' isotope prediction is described +#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). +#' +#' - `filterFourierTransformArtefacts()`: removes (Orbitrap) fast fourier +#' artefact peaks from spectra (see examples below). The function iterates +#' through all intensity ordered peaks in a spectrum and removes all peaks +#' with an m/z within +/- `halfWindowSize` of the current peak if their +#' intensity is lower than `threshold` times the current peak's intensity. 
+#' Additional parameters `keepIsotopes`, `maxCharge` and `isotopeTolerance` +#' allow to avoid removing of potential `[13]C` isotope peaks (`maxCharge` +#' being the maximum charge that should be considered and `isotopeTolerance` +#' the absolute acceptable tolerance for matching their m/z). +#' See [filterFourierTransformArtefacts()] for details and background and +#' `deisitopeSpectra()` for an alternative. +#' +#' - `filterIntensity()`: filters mass peaks in each spectrum keeping only +#' those with intensities that are within the provided range or match the +#' criteria of the provided function. For the former, parameter `intensity` +#' has to be a `numeric` defining the intensity range, for the latter a +#' `function` that takes the intensity values of the spectrum and returns +#' a `logical` whether the peak should be retained or not (see examples +#' below for details) - additional parameters to the function can be passed +#' with `...`. +#' To remove only peaks with intensities below a certain threshold, say +#' 100, use `intensity = c(100, Inf)`. Note: also a single value can be +#' passed with the `intensity` parameter in which case an upper limit of +#' `Inf` is used. +#' Note that this function removes also peaks with missing intensities +#' (i.e. an intensity of `NA`). Parameter `msLevel.` allows to restrict the +#' filtering to spectra of the specified MS level(s). +#' +#' - `filterMzRange()`: filters mass peaks in the object keeping or removing +#' those in each spectrum that are within the provided m/z range. Whether +#' peaks are retained or removed can be configured with parameter `keep` +#' (default `keep = TRUE`). +#' +#' - `filterMzValues()`: filters mass peaks in the object keeping all +#' peaks in each spectrum that match the provided m/z value(s) (for +#' `keep = TRUE`, the default) or removing all of them (for `keep = FALSE`). +#' The m/z matching considers also the absolute `tolerance` and m/z-relative +#' `ppm` values. `tolerance` and `ppm` have to be of length 1. +#' +#' - `filterPeaksRanges()`: filters mass peaks of a `Spectra` object using any +#' set of range-based filters on numeric spectra or peaks variables. See +#' [filterPeaksRanges()] for more information. +#' +#' - `filterPrecursorPeaks()`: removes peaks from each spectrum in `object` with +#' an m/z equal or larger than the m/z of the precursor, depending on the +#' value of parameter `mz`: for `mz = ==" (the default) peaks with matching +#' m/z (considering an absolute and relative acceptable difference depending +#' on `tolerance` and `ppm`, respectively) are removed. For `mz = ">="` all +#' peaks with an m/z larger or equal to the precursor m/z (minus `tolerance` +#' and the `ppm` of the precursor m/z) are removed. Parameter `msLevel.` +#' allows to restrict the filter to certain MS levels (by default the filter +#' is applied to all MS levels). Note that no peaks are removed if the +#' precursor m/z is `NA` (e.g. typically for MS1 spectra). +#' +#' - `reduceSpectra()`: keeps for groups of peaks with similar m/z values in +#' (given `ppm` and `tolerance`) in each spectrum only the mass peak with the +#' highest intensity removing all other peaks hence *reducing* each +#' spectrum to the highest intensity peaks per *peak group*. +#' Peak groups are defined using the [group()] function from the +#' *MsCoreUtils* package. See also the [combinePeaks()] function for an +#' alternative function to combine peaks within each spectrum. 
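+#'
+#' To illustrate the difference between the two filter categories, a minimal
+#' sketch (using, as in the examples below, the `PestMix1_DDA.mzML` file from
+#' the *msdata* package; variable names are arbitrary):
+#'
+#' ```r
+#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML",
+#'     package = "msdata")
+#' sps <- Spectra(fl)
+#'
+#' ## Subsetting filters reduce the number of spectra: keep only MS2
+#' ## spectra acquired between 200 and 300 seconds.
+#' sps_sub <- filterRt(filterMsLevel(sps, 2L), rt = c(200, 300))
+#' length(sps)
+#' length(sps_sub)
+#'
+#' ## Peaks data filters keep the number of spectra constant and only
+#' ## reduce the mass peaks within each spectrum; the operation is cached
+#' ## in the processing queue and applied on-the-fly upon data access.
+#' sps_filt <- filterIntensity(sps_sub, intensity = c(100, Inf))
+#' length(sps_filt) == length(sps_sub)
+#' lengths(sps_filt) |> head()
+#' ```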
+#' +#' @param acquisitionNum for `filterPrecursorScan()`: `integer` with the +#' acquisition number of the spectra to which the object should be +#' subsetted. +#' +#' @param charge For `deisotopeSpectra()`: expected charge of the ionized +#' compounds. See [isotopologues()] for details. +#' +#' @param dataOrigin For `filterDataOrigin()`: `character` to define which +#' spectra to keep. +#' For `filterAcquisitionNum()`: optionally specify if filtering should +#' occurr only for spectra of selected `dataOrigin`. +#' +#' @param dataStorage For `filterDataStorage()`: `character` to define which +#' spectra to keep. +#' For `filterAcquisitionNum()`: optionally specify if filtering should +#' occur only for spectra of selected `dataStorage`. +#' +#' @param drop For `[`: not considered. +#' +#' @param f For `filterPrecursorScan()`: defining which spectra +#' belong to the same original data file (sample): Defaults to +#' `f = dataOrigin(x)`. +#' +#' @param halfWindowSize For `filterFourierTransformArtefacts()`: `numeric(1)` +#' defining the m/z window left and right of a peak where to remove +#' fourier transform artefacts. +#' +#' @param i For `[`: `integer`, `logical` or `character` to subset the +#' object. +#' +#' @param intensity For `filterIntensity()`: `numeric` of length 1 or 2 +#' defining either the lower or the lower and upper intensity limit for the +#' filtering, or a `function` that takes the intensities as input and +#' returns a `logical` (same length then peaks in the spectrum) whether the +#' peak should be retained or not. Defaults to `intensity = c(0, Inf)` thus +#' only peaks with `NA` intensity are removed. +#' +#' @param isotopeTolerance For `filterFourierTransformArtefacts()`: the m/z +#' `tolerance` to be used to define whether peaks might be isotopes of +#' the current tested peak. +#' +#' @param j For `[`: not supported. +#' +#' @param keep For `filterMzValues()` and `filterMzRange()`: `logical(1)` +#' whether the matching peaks should be retained (`keep = TRUE`, the +#' default) or dropped (`keep = FALSE`). +#' +#' @param keepIsotopes For `filterFourierTransformArtefacts()`: whether isotope +#' peaks should not be removed as fourier artefacts. +#' +#' @param match For `filterRanges()` and `filterValues()`: `character(1) ` +#' defining whether the condition has to match for all provided +#' `ranges`/`values` (`match = "all"`; the default), or for any of them +#' (`match = "any"`) for spectra to be retained. +#' +#' @param maxCharge For `filterFourierTransformArtefacts()`: the maximum charge +#' to be considered for isotopes. +#' +#' @param msLevel. `integer` defining the MS level(s) of the spectra to which +#' the function should be applied (defaults to all MS levels of `object`. +#' For `filterMsLevel()`: the MS level to which `object` should be +#' subsetted. +#' +#' @param mz For `filterIsolationWindow()`: `numeric(1)` with the m/z value to +#' filter the object. For `filterPrecursorMz()` and `filterMzRange()`: +#' `numeric(2)` defining the lower and upper m/z boundary. +#' For `filterMzValues()` and `filterPrecursorMzValues()`: `numeric` with +#' the m/z values to match peaks or precursor m/z against. +#' For `filterPrecursorPeaks()`: `character(1)` defining whether mass peaks +#' with an m/z matching the spectrum's precursor m/z (`mz = "=="`, +#' the default) or mass peaks with a m/z that is equal or larger +#' (`mz = ">="`) should be removed. +#' +#' @param n for `filterAcquisitionNum()`: `integer` with the acquisition +#' numbers to filter for. 
+#' +#' @param object `Spectra` object. +#' +#' @param polarity for `filterPolarity()`: `integer` specifying the polarity to +#' to subset `object`. +#' +#' @param ppm For `filterMzValues()` and `reduceSpectra()`: `numeric(1)` +#' defining a relative, m/z-dependent, maximal accepted difference between +#' m/z values for peaks to be matched (or grouped). +#' For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the relative +#' maximal accepted difference of precursor m/z values of spectra for +#' grouping them into *precursor groups*. For `filterPrecursorIsotopes()`: +#' passed directly to the [isotopologues()] function. +#' For `filterValues()`: `numeric` of any length allowing to define +#' a maximal accepted difference between user input `values` and the +#' `spectraVariables` values. If it is not equal to the length of the +#' value provided with parameter `spectraVariables`, `ppm[1]` will be +#' recycled. +#' +#' @param ranges for `filterRanges()`: A `numeric` vector of paired values +#' (upper and lower boundary) that define the ranges to filter the `object`. +#' These paired values need to be in the same order as the +#' `spectraVariables` parameter (see below). +#' +#' @param rt for `filterRt()`: `numeric(2)` defining the retention time range to +#' be used to subset/filter `object`. +#' +#' @param spectraVariables For `selectSpectraVariables()`: `character` with the +#' names of the spectra variables to which the backend should be +#' subsetted. For `filterRanges()` and `filterValues()`: `character` +#' vector specifying the column(s) from `spectraData(object)` on which +#' to filter the data and that correspond to the the names of the +#' spectra variables that should be used for the filtering. +#' +#' @param substDefinition For `deisotopeSpectra()` and +#' `filterPrecursorIsotopes()`: `matrix` or `data.frame` with definitions +#' of isotopic substitutions. Uses by default isotopic substitutions +#' defined from all compounds in the Human Metabolome Database (HMDB). See +#' [isotopologues()] or [isotopicSubstitutionMatrix()] in the +#' *MetaboCoreUtils* for details. +#' +#' @param threshold For `filterFourierTransformArtefacts()`: the relative +#' intensity (to a peak) below which peaks are considered fourier +#' artefacts. Defaults to `threshold = 0.2` hence removing peaks that +#' have an intensity below 0.2 times the intensity of the tested peak +#' (within the selected `halfWindowSize`). +#' +#' @param tolerance For `filterMzValues()` and `reduceSpectra()`: +#' `numeric(1)` allowing to define a constant maximal accepted difference +#' between m/z values for peaks to be matched (or grouped). For +#' `containsMz()` it can also be of length equal `mz` to specify a different +#' tolerance for each m/z value. +#' For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the +#' (constant) maximal accepted difference of precursor m/z values of +#' spectra for grouping them into *precursor groups*. For +#' `filterPrecursorIsotopes()`: passed directly to the [isotopologues()] +#' function. For `filterValues()`: `numeric` of any length allowing to +#' define a maximal accepted difference between user input `values` and the +#' `spectraVariables` values. If it is not equal to the length of the +#' value provided with parameter `spectraVariables`, `tolerance[1]` will be +#' recycled. Default is `tolerance = 0`. +#' +#' @param values for `filterValues()`: A `numeric` vector that define the +#' values to filter the Spectra data. 
These values need to be in the same +#' order as the `spectraVariables` parameter. +#' +#' @param x `Spectra` object. +#' +#' @param z For `filterPrecursorCharge()`: `integer()` with the precursor +#' charges to be used as filter. +#' +#' @param ... Additional arguments. +#' +#' @seealso +#' +#' - [combineSpectra()] for functions to combine or aggregate `Spectra`. +#' +#' - [combinePeaks()] for functions to combine or aggregate a `Spectra`'s +#' `peaksData()` +#' +#' @md +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf +#' +#' @examples +#' +#' ## Load a `Spectra` object with LC-MS/MS data. +#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", +#' package = "msdata") +#' sps_dda <- Spectra(fl) +#' sps_dda +#' +#' +#' ## -------- SUBSET SPECTRA -------- +#' +#' ## Subset to the first 3 spectra +#' tmp <- sps_dda[1:3] +#' tmp +#' length(tmp) +#' +#' ## Subset to all MS2 spectra; this could be done with [, or, more +#' ## efficiently, with the `filterMsLevel` function: +#' sps_dda[msLevel(sps_dda) == 2L] +#' filterMsLevel(sps_dda, 2L) +#' +#' ## Filter the object keeping only MS2 spectra with an precursor m/z value +#' ## between a specified range: +#' filterPrecursorMzRange(sps_dda, c(80, 90)) +#' +#' ## Filter the object to MS2 spectra with an precursor m/z matching a +#' ## pre-defined value (given ppm and tolerance) +#' filterPrecursorMzValues(sps_dda, 85, ppm = 5, tolerance = 0.1) +#' +#' ## The `filterRanges()` function allows to filter a `Spectra` based on +#' ## numerical ranges of any of its (numerical) spectra variables. +#' ## First, determine the variable(s) on which to base the filtering: +#' sv <- c("rtime", "precursorMz", "peaksCount") +#' ## Note that ANY variables can be chosen here, and as many as wanted. +#' +#' ## Define the ranges (pairs of values with lower and upper boundary) to be +#' ## used for the individual spectra variables. The first two values will be +#' ## used for the first spectra variable (e.g., `"rtime"` here), the next two +#' ## for the second (e.g. `"precursorMz"` here) and so on: +#' ranges <- c(30, 350, 200, 500, 350, 600) +#' +#' ## Input the parameters within the filterRanges function: +#' filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, +#' ranges = ranges) +#' filt_spectra +#' +#' ## `filterRanges()` can also be used to filter a `Spectra` object with +#' ## multiple ranges for the same `spectraVariable` (e.g, here `"rtime"`) +#' sv <- c("rtime", "rtime") +#' ranges <- c(30, 100, 200, 300) +#' filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, +#' ranges = ranges, match = "any") +#' filt_spectra +#' +#' ## While `filterRanges()` filtered on numeric ranges, `filterValues()` +#' ## allows to filter an object matching spectra variable values to user +#' ## provided values (allowing to configure allowed differences using the +#' ## `ppm` and `tolerance` parameters). +#' ## First determine the variable(s) on which to base the filtering: +#' sv <- c("rtime", "precursorMz") +#' ## Note that ANY variables can be chosen here, and as many as wanted. +#' +#' ## Define the values that will be used to filter the spectra based on their +#' ## similarities to their respective `spectraVariables`. +#' ## The first values in the parameters values, tolerance and ppm will be +#' ## used for the first spectra variable (e.g. `"rtime"` here), the next for +#' ## the second (e.g. 
`"precursorMz"` here) and so on: +#' values <- c(350, 80) +#' tolerance <- c(100, 0.1) +#' ppm <- c(0, 50) +#' +#' ## Input the parameters within the `filterValues()` function: +#' filt_spectra <- filterValues(sps_dda, spectraVariables = sv, +#' values = values, tolerance = tolerance, ppm = ppm) +#' filt_spectra +#' +#' +#' ## -------- FILTER SPECTRA DATA -------- +#' +#' ## Remove spectra variables without content (i.e. with only missing values) +#' sps_noNA <- dropNaSpectraVariables(sps_dda) +#' +#' ## This reduced the size of the object slightly +#' print(object.size(sps_dda), unit = "MB") +#' print(object.size(sps_noNA), unit = "MB") +#' +#' ## With the `selectSpectraVariables()` function it is in addition possible +#' ## to subset the data of a `Spectra` to the selected columns/variables, +#' ## keeping only their data: +#' tmp <- selectSpectraVariables(sps_dda, c("msLevel", "mz", "intensity", +#' "scanIndex")) +#' print(object.size(tmp), units = "MB") +#' +#' ## Except the selected variables, all data is now removed. Accessing +#' ## core spectra variables still works, but returns only NA +#' rtime(tmp) |> head() +#' +#' +#' ## -------- FILTER PEAKS DATA -------- +#' +#' ## `filterMzValues()` filters the mass peaks data of a `Spectra` retaining +#' ## only those mass peaks with an m/z value matching the provided value(s). +#' sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), tolerance = 0.3) +#' +#' ## The filtered `Spectra` has the same length +#' length(sps_dda) +#' length(sps_sub) +#' +#' ## But the number of mass peaks changed +#' lengths(sps_dda) |> head() +#' lengths(sps_sub) |> head() +#' +#' ## This function can also be used to remove specific peaks from a spectrum +#' ## by setting `keep = FALSE`. +#' sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), +#' tolerance = 0.3, keep = FALSE) +#' lengths(sps_sub) |> head() +#' +#' ## With the `filterMzRange()` function it is possible to keep (or remove) +#' ## mass peaks with m/z values within a specified numeric range. +#' sps_sub <- filterMzRange(sps_dda, mz = c(100, 150)) +#' lengths(sps_sub) |> head() +#' +#' ## See also the `filterPeaksRanges()` function for a more flexible framework +#' ## to filter mass peaks +#' +#' +#' ## Removing fourier transform artefacts seen in Orbitra data. +#' +#' ## Loading an Orbitrap spectrum with artefacts. +#' data(fft_spectrum) +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) +#' +#' fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) +#' fft_spectrum +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) +#' +#' ## Using a few examples peaks in your data you can optimize the parameters +#' fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, +#' halfWindowSize = 0.2, +#' threshold = 0.005, +#' keepIsotopes = TRUE, +#' maxCharge = 5, +#' isotopeTolerance = 0.005 +#' ) +#' +#' fft_spectrum_filtered +#' length(mz(fft_spectrum_filtered)[[1]]) +#' plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) +#' +#' +#' ## *Reducing* a `Spectra` keeping for groups of mass peaks (characterized +#' ## by similarity of their m/z values) only one representative peak. This +#' ## function helps cleaning fragment spectra. +#' ## Filter the data set to MS2 spectra +#' ms2 <- filterMsLevel(sps_dda, 2L) +#' +#' ## For groups of fragment peaks with a difference in m/z < 0.1, keep only +#' ## the largest one. 
+#' ms2_red <- reduceSpectra(ms2, ppm = 0, tolerance = 0.1) +#' lengths(ms2) |> tail() +#' lengths(ms2_red) |> tail() +NULL + +#' @rdname filterMsLevel +setMethod("dropNaSpectraVariables", "Spectra", function(object) { + object@backend <- dropNaSpectraVariables(object@backend) + object +}) + +#' @rdname filterMsLevel +setMethod( + "selectSpectraVariables", "Spectra", + function(object, spectraVariables = union(spectraVariables(object), + peaksVariables(object))) { + spectraVariables <- union(spectraVariables, "dataStorage") + object@backend <- selectSpectraVariables( + object@backend, spectraVariables = spectraVariables) + object + }) + +#' @rdname filterMsLevel +#' +#' @export +setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { + if (!missing(j)) + stop("Subsetting 'Spectra' by columns is not (yet) supported") + if (missing(i)) + return(x) + slot(x, "backend", check = FALSE) <- x@backend[i = i] + x +}) + +#' @rdname filterMsLevel setMethod("filterAcquisitionNum", "Spectra", function(object, n = integer(), dataStorage = character(), dataOrigin = character()) { @@ -1431,7 +2433,7 @@ setMethod("filterAcquisitionNum", "Spectra", function(object, n = integer(), object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterEmptySpectra", "Spectra", function(object) { object@backend <- object@backend[as.logical(lengths(object))] object@processing <- .logging(object@processing, @@ -1439,7 +2441,7 @@ setMethod("filterEmptySpectra", "Spectra", function(object) { object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterDataOrigin", "Spectra", function(object, dataOrigin = character()) { if (length(dataOrigin) && !is.character(dataOrigin)) @@ -1451,7 +2453,7 @@ setMethod("filterDataOrigin", "Spectra", function(object, object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterDataStorage", "Spectra", function(object, dataStorage = character()) { if (length(dataStorage) && !is.character(dataStorage)) @@ -1463,7 +2465,7 @@ setMethod("filterDataStorage", "Spectra", function(object, object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @exportMethod filterFourierTransformArtefacts setMethod("filterFourierTransformArtefacts", "Spectra", @@ -1481,7 +2483,7 @@ setMethod("filterFourierTransformArtefacts", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @importMethodsFrom ProtGenerics filterIntensity #' @@ -1525,7 +2527,7 @@ setMethod("filterIntensity", "Spectra", }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterIsolationWindow", "Spectra", function(object, mz = numeric()) { object@backend <- filterIsolationWindow(object@backend, mz = mz) object@processing <- .logging(object@processing, @@ -1534,7 +2536,7 @@ setMethod("filterIsolationWindow", "Spectra", function(object, mz = numeric()) { object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterMsLevel", "Spectra", function(object, msLevel. = integer()) { object@backend <- filterMsLevel(object@backend, msLevel = msLevel.) object@processing <- .logging(object@processing, @@ -1543,7 +2545,7 @@ setMethod("filterMsLevel", "Spectra", function(object, msLevel. 
= integer()) { object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @importMethodsFrom ProtGenerics filterMzRange #' @@ -1566,7 +2568,7 @@ setMethod("filterMzRange", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @importMethodsFrom ProtGenerics filterMzValues #' @@ -1605,7 +2607,7 @@ setMethod("filterMzValues", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPolarity", "Spectra", function(object, polarity = integer()) { object@backend <- filterPolarity(object@backend, polarity = polarity) object@processing <- .logging(object@processing, @@ -1614,7 +2616,7 @@ setMethod("filterPolarity", "Spectra", function(object, polarity = integer()) { object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @export setMethod("filterPrecursorMz", "Spectra", @@ -1630,7 +2632,7 @@ setMethod("filterPrecursorMz", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPrecursorMzRange", "Spectra", function(object, mz = numeric()) { object@backend <- filterPrecursorMzRange(object@backend, mz) @@ -1641,7 +2643,7 @@ setMethod("filterPrecursorMzRange", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPrecursorMzValues", "Spectra", function(object, mz = numeric(), ppm = 20, tolerance = 0) { object@backend <- filterPrecursorMzValues( @@ -1653,7 +2655,7 @@ setMethod("filterPrecursorMzValues", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPrecursorCharge", "Spectra", function(object, z = integer()) { z <- unique(z) @@ -1665,7 +2667,7 @@ setMethod("filterPrecursorCharge", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPrecursorScan", "Spectra", function(object, acquisitionNum = integer(), f = dataOrigin(object)) { if (!all(f %in% unique(dataOrigin(object)))) @@ -1681,7 +2683,7 @@ setMethod("filterPrecursorScan", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterRt", "Spectra", function(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) { if (!is.numeric(msLevel.)) @@ -1700,7 +2702,7 @@ setMethod("filterRt", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterRanges", "Spectra", function(object, spectraVariables = character(), ranges = numeric(), match = c("all", "any")){ @@ -1715,7 +2717,7 @@ setMethod("filterRanges", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterValues", "Spectra", function(object, spectraVariables = character(), values = numeric(), ppm = 0, tolerance = 0, match = c("all", "any")){ @@ -1737,7 +2739,469 @@ setMethod("filterValues", "Spectra", #' @title Data manipulation and analysis methods #' -#' `reset()` to clean the lazy processing queue. +#' @name addProcessing +#' +#' @aliases addProcessing +#' @aliases applyProcessing +#' @aliases bin +#' @aliases containsMz +#' @aliases containsNeutralLoss +#' @aliases entropy +#' @aliases pickPeaks +#' @aliases replaceIntensitiesBelow +#' @aliases reset +#' @aliases smooth +#' @aliases spectrapply +#' +#' @description +#' +#' Various data analysis functions are available for `Spectra` objects. These +#' can be categorized into functions that either return a `Spectra` object +#' (with the manipulated data) and functions that directly return the +#' result from the calculation. 
For the former category, the data manipulations +#' are cached in the result object's *processing queue* and only exectuted +#' on-the-fly when the respective data gets extracted from the `Spectra` (see +#' section *The processing queue* for more information). +#' +#' For the second category, the calculations are directly executed and the +#' result, usually one value per spectrum, returned. Generally, to reduce +#' memory demand, a chunk-wise processing of the data is performed. +#' +#' +#' @section Data analysis methods returning a `Spectra`: +#' +#' The methods listed here return a `Spectra` object as a result. +#' +#' - `addProcessing()`: adds an arbitrary function that should be applied to the +#' peaks matrix of every spectrum in `object`. The function (can be passed +#' with parameter `FUN`) is expected to take a peaks matrix as input and to +#' return a peaks matrix. A peaks matrix is a numeric matrix with two columns, +#' the first containing the m/z values of the peaks and the second the +#' corresponding intensities. The function has to have `...` in its +#' definition. Additional arguments can be passed with `...`. With parameter +#' `spectraVariables` it is possible to define additional spectra variables +#' from `object` that should be passed to the function `FUN`. These will be +#' passed by their name (e.g. specifying `spectraVariables = "precursorMz"` +#' will pass the spectra's precursor m/z as a parameter named `precursorMz` +#' to the function. The only exception is the spectra's MS level, these will +#' be passed to the function as a parameter called `spectrumMsLevel` (i.e. +#' with `spectraVariables = "msLevel"` the MS levels of each spectrum will be +#' submitted to the function as a parameter called `spectrumMsLevel`). +#' Examples are provided in the package vignette. +#' +#' - `bin()`: aggregates individual spectra into discrete (m/z) bins. Binning is +#' performed only on spectra of the specified MS level(s) (parameter +#' `msLevel`, by default all MS levels of `x`). The bins can be defined with +#' parameter `breaks` which by default are equally sized bins, with size +#' being defined by parameter `binSize`, from the minimal to the maximal m/z +#' of all spectra (of MS level `msLevel`) within `x`. The same bins are used +#' for all spectra in `x`. All intensity values for peaks falling into the +#' same bin are aggregated using the function provided with parameter `FUN` +#' (defaults to `FUN = sum`, i.e. all intensities are summed up). Note that +#' the binning operation is applied to the peak data on-the-fly upon data +#' access and it is possible to *revert* the operation with the `reset()` +#' function (see description of `reset()` below). +#' +#' - `countIdentifications`: counts the number of identifications each scan has +#' led to. See [countIdentifications()] for more details. +#' +#' - `pickPeaks()`: picks peaks on individual spectra using a moving +#' window-based approach (window size = `2 * halfWindowSize`). For noisy +#' spectra there are currently two different noise estimators available, +#' the *M*edian *A*bsolute *D*eviation (`method = "MAD"`) and +#' Friedman's Super Smoother (`method = "SuperSmoother"`), +#' as implemented in the [`MsCoreUtils::noise()`]. +#' The method supports also to optionally *refine* the m/z value of +#' the identified centroids by considering data points that belong (most +#' likely) to the same mass peak. Therefore the m/z value is calculated as an +#' intensity weighted average of the m/z values within the peak region. 
+#' The peak region is defined as the m/z values (and their respective +#' intensities) of the `2 * k` closest signals to the centroid or the closest +#' valleys (`descending = TRUE`) in the `2 * k` region. For the latter the `k` +#' has to be chosen general larger. See [`MsCoreUtils::refineCentroids()`] for +#' details. +#' If the ratio of the signal to the highest intensity of the peak is below +#' `threshold` it will be ignored for the weighted average. +#' +#' - `replaceIntensitiesBelow()`: replaces intensities below a specified +#' threshold with the provided `value`. Parameter `threshold` can be either +#' a single numeric value or a function which is applied to all non-`NA` +#' intensities of each spectrum to determine a threshold value for each +#' spectrum. The default is `threshold = min` which replaces all values +#' which are <= the minimum intensity in a spectrum with `value` (the +#' default for `value` is `0`). Note that the function specified with +#' `threshold` is expected to have a parameter `na.rm` since `na.rm = TRUE` +#' will be passed to the function. If the spectrum is in profile mode, +#' ranges of successive non-0 peaks <= `threshold` are set to 0. +#' Parameter `msLevel.` allows to apply this to only spectra of certain MS +#' level(s). +#' +#' - `scalePeaks()`: scales intensities of peaks within each spectrum depending +#' on parameter `by`. With `by = sum` (the default) peak intensities are +#' divided by the sum of peak intensities within each spectrum. The sum of +#' intensities is thus 1 for each spectrum after scaling. Parameter +#' `msLevel.` allows to apply the scaling of spectra of a certain MS level. +#' By default (`msLevel. = uniqueMsLevels(x)`) intensities for all +#' spectra will be scaled. +#' +#' - `smooth()`: smooths individual spectra using a moving window-based approach +#' (window size = `2 * halfWindowSize`). Currently, the +#' Moving-Average- (`method = "MovingAverage"`), +#' Weighted-Moving-Average- (`method = "WeightedMovingAverage")`, +#' weights depending on the distance of the center and calculated +#' `1/2^(-halfWindowSize:halfWindowSize)`) and +#' Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported. +#' For details how to choose the correct `halfWindowSize` please see +#' [`MsCoreUtils::smooth()`]. +#' +#' +#' @section Data analysis methods returning the result from the calculation: +#' +#' The functions listed in this section return immediately the result from the +#' calculation. To reduce memory demand (and allow parallel processing) the +#' calculations a chunk-wise processing is generally performed. +#' +#' - `chunkapply()`: apply an arbitrary function to chunks of spectra. See +#' [chunkapply()] for details and examples. +#' +#' - `containsMz()`: checks for each of the spectra whether they contain mass +#' peaks with an m/z equal to `mz` (given acceptable difference as defined by +#' parameters `tolerance` and `ppm` - see [common()] for details). Parameter +#' `which` allows to define whether any (`which = "any"`, the default) or +#' all (`which = "all"`) of the `mz` have to match. The function returns +#' `NA` if `mz` is of length 0 or is `NA`. +#' +#' - `containsNeutralLoss()`: checks for each spectrum in `object` if it has a +#' peak with an m/z value equal to its precursor m/z - `neutralLoss` (given +#' acceptable difference as defined by parameters `tolerance` and `ppm`). +#' Returns `NA` for MS1 spectra (or spectra without a precursor m/z). 
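+#'
+#' A brief illustration of the two functions above (a minimal sketch; the
+#' m/z values are arbitrary and `sps_dda` is the `Spectra` object created
+#' in the examples below):
+#'
+#' ```r
+#' ## For each spectrum: is there a mass peak matching any of the two m/z
+#' ## values (accepting a 10 ppm difference)?
+#' containsMz(sps_dda, mz = c(123.0441, 303.0504), ppm = 10) |> head()
+#'
+#' ## For each MS2 spectrum: is there a mass peak with an m/z equal to the
+#' ## precursor m/z minus 18.0106 (neutral loss of water)?
+#' containsNeutralLoss(sps_dda, neutralLoss = 18.0106) |> head()
+#' ```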
+#' +#' - `entropy()`: calculates the entropy of each spectra based on the metrics +#' suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). +#' See also [nentropy()] in the *MsCoreUtils* package for details. +#' +#' - `estimatePrecursorIntensity()`: defines the precursor intensities for MS2 +#' spectra using the intensity of the matching MS1 peak from the +#' closest MS1 spectrum (i.e. the last MS1 spectrum measured before the +#' respective MS2 spectrum). With `method = "interpolation"` it is also +#' possible to calculate the precursor intensity based on an interpolation of +#' intensity values (and retention times) of the matching MS1 peaks from the +#' previous and next MS1 spectrum. See [estimatePrecursorIntensity()] for +#' examples and more details. +#' +#' - `estimatePrecursorMz()`: **for DDA data**: allows to estimate a fragment +#' spectra's precursor m/z based on the reported precursor m/z and the data +#' from the previous MS1 spectrum. See [estimatePrecursorMz()] for details. +#' +#' - `neutralLoss()`: calculates neutral loss spectra for fragment spectra. See +#' [neutralLoss()] for detailed documentation. +#' +#' - `spectrapply()`: applies a given function to each individual spectrum or +#' sets of a `Spectra` object. By default, the `Spectra` is split into +#' individual spectra (i.e. `Spectra` of length 1) and the function `FUN` +#' is applied to each of them. An alternative splitting can be defined with +#' parameter `f`. Parameters for `FUN` can be passed using `...`. +#' The returned result and its order depend on the function `FUN` and how +#' `object` is split (hence on `f`, if provided). Parallel processing is +#' supported and can be configured with parameter `BPPARAM`, is however only +#' suggested for computational intense `FUN`. +#' As an alternative to the (eventual parallel) processing of the full +#' `Spectra`, `spectrapply()` supports also a chunk-wise processing. For this, +#' parameter `chunkSize` needs to be specified. `object` is then split into +#' chunks of size `chunkSize` which are then (stepwise) processed by `FUN`. +#' This guarantees a lower memory demand (especially for on-disk backends) +#' since only the data for one chunk needs to be loaded into memory in each +#' iteration. Note that by specifying `chunkSize`, parameters `f` and +#' `BPPARAM` will be ignored. +#' See also `chunkapply()` above or examples below for details on chunk-wise +#' processing. +#' +#' +#' @section The processing queue: +#' +#' Operations that modify mass peak data, i.e. the m/z and intensity values of +#' a `Spectra` are generally not applied immediately to the data but are +#' *cached* within the object's *processing queue*. These operations are then +#' applied to the data only upon request, for example when m/z and/or +#' intensity values are extracted. This lazy execution guarantees that the +#' same functionality can be applied to any `Spectra` object, regardless of +#' the type of backend that is used. Thus, data manipulation operations can +#' also be applied to data that is *read only*. As a side effect, this enables +#' also to *undo* operations using the `reset()` function. +#' +#' Functions related to the processing queue are: +#' +#' - `applyProcessing()`: for `Spectra` objects that use a **writeable** backend +#' only: apply all steps from the lazy processing queue to the peak data and +#' write it back to the data storage. Parameter `f` allows to specify how +#' `object` should be split for parallel processing. 
This should either be +#' equal to the `dataStorage`, or `f = rep(1, length(object))` to disable +#' parallel processing alltogether. Other partitionings might result in +#' errors (especially if a `MsBackendHdf5Peaks` backend is used). +#' +#' - `processingLog()`: returns a `character` vector with the processing log +#' messages. +#' +#' - `reset()`: restores the data to its original state (as much as possible): +#' removes any processing steps from the lazy processing queue and calls +#' `reset()` on the backend which, depending on the backend, can also undo +#' e.g. data filtering operations. Note that a `reset*(` call after +#' `applyProcessing()` will not have any effect. See examples below for more +#' information. +#' +#' @param binSize For `bin()`: `numeric(1)` defining the size for the m/z bins. +#' Defaults to `binSize = 1`. +#' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. See also [processingChunkSize()] for +#' additional information on parallel processing. +#' +#' @param breaks For `bin()`: `numeric` defining the m/z breakpoints between +#' bins. +#' +#' @param by For `scalePeaks()`: function to calculate a single `numeric` from +#' intensity values of a spectrum by which all intensities (of +#' that spectrum) should be divided by. The default `by = sum` will +#' divide intensities of each spectrum by the sum of intensities of that +#' spectrum. +#' +#' @param chunkSize For `spectrapply()`: size of the chunks into which the +#' `Spectra` should be split. This parameter overrides parameters +#' `f` and `BPPARAM`. +#' +#' @param descending For `pickPeaks()`: `logical`, if `TRUE` just values +#' betwee the nearest valleys around the peak centroids are used. +# +#' @param f For `spectrapply()` and `applyProcessing()`: `factor` defining +#' how `object` should be splitted for eventual parallel processing. +#' Defaults to `factor()` for `spectrapply()` hence the object is not +#' splitted while it defaults to `f = processingChunkSize(object)` for +#' `applyProcessing()` splitting thus the object by default into chunks +#' depending on [processingChunkSize()]. +#' +#' @param FUN For `addProcessing()`: function to be applied to the peak matrix +#' of each spectrum in `object`. +#' For `bin()`: function to aggregate intensity values of peaks falling +#' into the same bin. Defaults to `FUN = sum` thus summing up intensities. +#' For `spectrapply()` and `chunkapply()`: function to be applied to +#' each individual or each chunk of `Spectra`. +#' +#' @param halfWindowSize For `pickPeaks()`: `integer(1)`, used in the +#' identification of the mass peaks: a local maximum has to be the +#' maximum in the window from `(i - halfWindowSize):(i + halfWindowSize)`. +#' For `smooth()`: `integer(1)`, used in the smoothing algorithm, the +#' window reaches from `(i - halfWindowSize):(i + halfWindowSize)`. +#' +#' @param k For `pickPeaks()`: `integer(1)`, number of values left and right of +#' the peak that should be considered in the weighted mean calculation. +#' +#' @param method For `pickPeaks()`: `character(1)`, the noise estimators that +#' should be used, currently the the *M*edian *A*bsolute *D*eviation +#' (`method = "MAD"`) and Friedman's Super Smoother +#' (`method = "SuperSmoother"`) are supported. 
+#' For `smooth()`: `character(1)`, the smoothing function that should be +#' used, currently, the Moving-Average- (`method = "MovingAverage"`), +#' Weighted-Moving-Average- (`method = "WeightedMovingAverage")`, +#' Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported. +#' +#' @param msLevel. `integer` defining the MS level(s) of the spectra to which +#' the function should be applied (defaults to all MS levels of `object`. +#' +#' @param mz For `containsMz()`: `numeric` with the m/z value(s) of the mass +#' peaks to check. +#' +#' @param neutralLoss for `containsNeutralLoss()`: `numeric(1)` defining the +#' value which should be subtracted from the spectrum's precursor m/z. +#' +#' @param normalized for `entropy()`: `logical(1)` whether the normalized +#' entropy should be calculated (default). See also [nentropy()] for +#' details. +#' +#' @param object A `Spectra` object. +#' +#' @param ppm For `containsMz()` and `neutralLoss()`: `numeric(1)` defining a +#' relative, m/z-dependent, maximal accepted difference between m/z values +#' for peaks to be matched. +#' +#' @param snr For `pickPeaks()`: `double(1)` defining the +#' *S*ignal-to-*N*oise-*R*atio. The intensity of a local maximum has to be +#' higher than `snr * noise` to be considered as peak. +#' +#' @param spectraVariables For `addProcessing()`: `character` with additional +#' spectra variables that should be passed along to the function defined +#' with `FUN`. See function description for details. +#' +#' @param threshold For `pickPeaks()`: a `numeric(1)` defining the proportion +#' of the maximal peak intensity. Only values above the threshold are +#' used for the weighted mean calculation. +#' For `replaceIntensitiesBelow()`: a `numeric(1)` defining the threshold +#' or a `function` to calculate the threshold for each spectrum on its +#' intensity values. Defaults to `threshold = min`. +#' +#' @param tolerance For `containsMz()` and `neutralLoss()`: +#' `numeric(1)` allowing to define a constant maximal accepted difference +#' between m/z values for peaks to be matched. +#' +#' @param value For `replaceIntensitiesBelow()`: `numeric(1)` defining the +#' value with which intensities should be replaced with. +#' +#' @param which For `containsMz()`: either `"any"` or `"all"` defining whether +#' any (the default) or all provided `mz` have to be present in the +#' spectrum. +#' +#' @param x A `Spectra`. +#' +#' @param zero.rm For `bin()`: `logical(1)` indicating whether to remove bins +#' with zero intensity. Defaults to `TRUE`, meaning the function will +#' discard bins created with an intensity of 0 to enhance memory +#' efficiency. +#' +#' @param ... Additional arguments passed to internal and downstream functions. +#' +#' @return +#' +#' See the documentation of the individual functions for a description of the +#' return value. +#' +#' @md +#' +#' @seealso +#' +#' - [compareSpectra()] for calculation of spectra similarity scores. +#' +#' - [processingChunkSize()] for information on parallel and chunk-wise data +#' processing. +#' +#' - [Spectra] for a general description of the `Spectra` object. +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf, Mar Garcia-Aloy +#' +#' @examples +#' +#' ## Load a `Spectra` object with LC-MS/MS data. 
+#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", +#' package = "msdata") +#' sps_dda <- Spectra(fl) +#' sps_dda +#' +#' +#' ## -------- FUNCTIONS RETURNING A SPECTRA -------- +#' +#' ## Replace peak intensities below 40 with a value of 1 +#' sps_mod <- replaceIntensitiesBelow(sps_dda, threshold = 20, value = 1) +#' sps_mod +#' +#' ## Get the intensities of the first spectrum before and after the +#' ## operation +#' intensity(sps_dda[1]) +#' intensity(sps_mod[1]) +#' +#' ## Remove all peaks with an intensity below 5. +#' sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf)) +#' +#' intensity(sps_mod) +#' +#' ## In addition it is possible to pass a function to `filterIntensity()`: in +#' ## the example below we want to keep only peaks that have an intensity which +#' ## is larger than one third of the maximal peak intensity in that spectrum. +#' keep_peaks <- function(x, prop = 3) { +#' x > max(x, na.rm = TRUE) / prop +#' } +#' sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks) +#' intensity(sps_mod) +#' +#' ## We can also change the proportion by simply passing the `prop` parameter +#' ## to the function. To keep only peaks that have an intensity which is +#' ## larger than half of the maximum intensity: +#' sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks, prop = 2) +#' intensity(sps_mod) +#' +#' ## With the `scalePeaks()` function we can alternatively scale the +#' ## intensities of mass peaks per spectrum to relative intensities. This +#' ## is specifically useful for fragment (MS2) spectra. We below thus +#' ## scale the intensities per spectrum by the total sum of intensities +#' ## (such that the sum of all intensities per spectrum is 1). +#' ## Below we scale the intensities of all MS2 spectra in our data set. +#' sps_mod <- scalePeaks(sps_dda, msLevel = 2L) +#' +#' ## MS1 spectra were not affected +#' sps_mod |> +#' filterMsLevel(1L) |> +#' intensity() +#' +#' ## Intensities of MS2 spectra were scaled +#' sps_mod |> +#' filterMsLevel(2L) |> +#' intensity() +#' +#' ## Since data manipulation operations are by default not directly applied to +#' ## the data but only cached in the internal processing queue, it is also +#' ## possible to remove these data manipulations with the `reset()` function: +#' tmp <- reset(sps_mod) +#' tmp +#' lengths(sps_dda) |> head() +#' lengths(sps_mod) |> head() +#' lengths(tmp) |> head() +#' +#' ## Data manipulation operations cached in the processing queue can also be +#' ## applied to the mass peaks data with the `applyProcessing()` function, if +#' ## the `Spectra` uses a backend that supports that (i.e. allows replacing +#' ## the mass peaks data). Below we first change the backend to a +#' ## `MsBackendMemory()` and then use the `applyProcessing()` to modify the +#' ## mass peaks data +#' sps_dda <- setBackend(sps_dda, MsBackendMemory()) +#' sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf)) +#' sps_mod <- applyProcessing(sps_mod) +#' sps_mod +#' +#' ## While we can't *undo* this filtering operation now using the `reset()` +#' ## function, accessing the data would now be faster, because the operation +#' ## does no longer to be applied to the original data before returning to the +#' ## user. +#' +#' +#' ## -------- FUNCTIONS RETURNING THE RESULT -------- +#' +#' ## With the `spectrapply()` function it is possible to apply an +#' ## arbitrary function to each spectrum in a Spectra. +#' ## In the example below we calculate the mean intensity for each spectrum +#' ## in a subset of the sciex_im data. 
Note that we can access all variables +#' ## of each individual spectrum either with the `$` operator or the +#' ## corresponding method. +#' res <- spectrapply(sps_dda[1:20], FUN = function(x) mean(x$intensity[[1]])) +#' head(res) +#' +#' ## As an alternative, applying a function `FUN` to a `Spectra` can be +#' ## performed *chunk-wise*. The advantage of this is, that only the data for +#' ## one chunk at a time needs to be loaded into memory reducing the memory +#' ## demand. This type of processing can be performed by specifying the size +#' ## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` +#' ## parameter +#' spectrapply(sps_dda[1:20], lengths, chunkSize = 5L) +#' +#' ## Precursor intensity estimation. Some manufacturers don't report the +#' ## precursor intensity for MS2 spectra: +#' sps_dda |> +#' filterMsLevel(2L) |> +#' precursorIntensity() +#' +#' ## This intensity can however be estimated from the previously measured +#' ## MS1 scan with the `estimatePrecursorIntensity()` function: +#' pi <- estimatePrecursorIntensity(sps_dda) +#' +#' ## This function returned the result as a `numeric` vector with one +#' ## value per spectrum: +#' pi +#' +#' ## We can replace the precursor intensity values of the originating +#' ## object: +#' sps_dda$precursorIntensity <- pi +#' sps_dda |> +#' filterMsLevel(2L) |> +#' precursorIntensity() +#' +NULL #' @exportMethod addProcessing #' @@ -1751,7 +3215,7 @@ setMethod("filterValues", "Spectra", #' #' @importFrom BiocGenerics updateObject #' -#' @rdname Spectra +#' @rdname addProcessing setMethod("addProcessing", "Spectra", function(object, FUN, ..., spectraVariables = character()) { if (missing(FUN)) @@ -1766,12 +3230,7 @@ setMethod("addProcessing", "Spectra", function(object, FUN, ..., object }) -#' @rdname Spectra -setMethod("backendBpparam", "Spectra", function(object, BPPARAM = bpparam()) { - backendBpparam(object@backend, BPPARAM) -}) - -#' @rdname Spectra +#' @rdname addProcessing #' #' @importMethodsFrom ProtGenerics bin #' @@ -1799,7 +3258,7 @@ setMethod("bin", "Spectra", function(x, binSize = 1L, breaks = NULL, x }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod containsMz setMethod("containsMz", "Spectra", function(object, mz = numeric(), @@ -1825,7 +3284,7 @@ setMethod("containsMz", "Spectra", function(object, mz = numeric(), } }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod containsNeutralLoss setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, @@ -1848,7 +3307,7 @@ setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, } }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @importFrom MsCoreUtils entropy nentropy #' @@ -1863,12 +3322,12 @@ setMethod("entropy", "Spectra", function(object, normalized = TRUE) { ) } else numeric() }) -#' @rdname Spectra +#' @rdname addProcessing setMethod("entropy", "ANY", function(object, ...) { MsCoreUtils::entropy(object) }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod pickPeaks setMethod("pickPeaks", "Spectra", @@ -1910,7 +3369,7 @@ setMethod("pickPeaks", "Spectra", object }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod replaceIntensitiesBelow setMethod("replaceIntensitiesBelow", "Spectra", @@ -1937,7 +3396,7 @@ setMethod("replaceIntensitiesBelow", "Spectra", object }) -#' @rdname Spectra +#' @rdname addProcessing setMethod("reset", "Spectra", function(object, ...) 
{ object@backend <- reset(object@backend) object@processingQueue <- list() @@ -1948,7 +3407,7 @@ setMethod("reset", "Spectra", function(object, ...) { object }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @importFrom ProtGenerics smooth #' @importFrom MsCoreUtils coefMA coefWMA coefSG @@ -1979,7 +3438,7 @@ setMethod("smooth", "Spectra", x }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @importMethodsFrom ProtGenerics spectrapply #' @@ -2089,16 +3548,119 @@ setMethod( ################################################################################ #' @title Spectra similarity calculations - -#' @rdname Spectra #' -#' @exportMethod compareSpectra +#' @name compareSpectra +#' +#' @aliases compareSpectra +#' +#' @description +#' +#' `compareSpectra()` compares each spectrum in `x` with each spectrum in `y` +#' using the function provided with `FUN` (defaults to [ndotproduct()]). If +#' `y` is missing, each spectrum in `x` is compared with each other spectrum +#' in `x`. +#' The matching/mapping of peaks between the compared spectra is done with the +#' `MAPFUN` function. The default [joinPeaks()] matches peaks of both spectra +#' and allows to keep all peaks from the first spectrum (`type = "left"`), +#' from the second (`type = "right"`), from both (`type = "outer"`) and to +#' keep only matching peaks (`type = "inner"`); see [joinPeaks()] for more +#' information and examples). The `MAPFUN` function should have parameters +#' `x`, `y`, `xPrecursorMz` and `yPrecursorMz` as these values are passed to +#' the function. +#' +#' In addition to `joinPeaks()` also [joinPeaksGnps()] is supported for +#' GNPS-like similarity score calculations. Note that `joinPeaksGnps()` should +#' only be used in combination with `FUN = MsCoreUtils::gnps` +#' (see [joinPeaksGnps()] for more information and details). Use +#' `MAPFUN = joinPeaksNone` to disable internal peak matching/mapping if a +#' similarity scoring function is used that performs the matching internally. +#' +#' `FUN` is supposed to be a function to compare intensities of (matched) +#' peaks of the two spectra that are compared. The function needs to take two +#' matrices with columns `"mz"` and `"intensity"` as input and is supposed +#' to return a single numeric as result. In addition to the two peak matrices +#' the spectra's precursor m/z values are passed to the function as parameters +#' `xPrecursorMz` (precursor m/z of the `x` peak matrix) and `yPrecursorMz` +#' (precursor m/z of the `y` peak matrix). Additional parameters to functions +#' `FUN` and `MAPFUN` can be passed with `...`. Parameters `ppm` and +#' `tolerance` are passed to both `MAPFUN` and `FUN`. +#' The function returns a `matrix` with the results of `FUN` for each +#' comparison, number of rows equal to `length(x)` and number of columns +#' equal `length(y)` (i.e. element in row 2 and column 3 is the result from +#' the comparison of `x[2]` with `y[3]`). If `SIMPLIFY = TRUE` the `matrix` +#' is *simplified* to a `numeric` if length of `x` or `y` is one. See also +#' the vignette for additional examples, such as using spectral entropy +#' similarity in the scoring. +#' +#' @param FUN function to compare intensities of peaks between two spectra. +#' Defaults to [ndotproduct()]. +#' +#' @param MAPFUN For `compareSpectra()`: function to map/match peaks between +#' the two compared spectra. See [joinPeaks()] for more information and +#' possible functions. Defaults to [joinPeaks()]. 
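+#'
+#' For a GNPS-style score, for example, the mapping function and the
+#' comparison function have to be changed together (a minimal sketch,
+#' assuming `sps_ms2` is the MS2 `Spectra` created in the examples below):
+#'
+#' ```r
+#' compareSpectra(sps_ms2[2:3], sps_ms2[10:20],
+#'     MAPFUN = joinPeaksGnps, FUN = MsCoreUtils::gnps)
+#' ```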
+#' +#' @param ppm `numeric(1)` defining a relative, m/z-dependent, maximal +#' accepted difference between m/z values for peaks to be matched. This +#' parameter is directly passed to `MAPFUN`. +#' +#' @param tolerance `numeric(1)` allowing to define a constant maximal +#' accepted difference between m/z values for peaks to be matched. This +#' parameter is directly passed to `MAPFUN`. +#' +#' @param x A `Spectra` object. +#' +#' @param y A `Spectra` object. +#' +#' @param SIMPLIFY `logical(1)` defining whether the result matrix should be +#' *simplified* to a `numeric` if possible (i.e. if either `x` or `y` is +#' of length 1). +#' +#' @param ... Additional arguments passed to the internal functions. #' #' @importFrom MsCoreUtils ndotproduct #' #' @importMethodsFrom ProtGenerics compareSpectra #' #' @exportMethod compareSpectra +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' +#' @examples +#' +#' ## Load a `Spectra` object with LC-MS/MS data. +#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", +#' package = "msdata") +#' sps_dda <- Spectra(fl) +#' sps_dda +#' +#' ## Restrict to MS2 (fragment) spectra: +#' sps_ms2 <- filterMsLevel(sps_dda, msLevel = 2L) +#' +#' ## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using +#' ## the normalized dotproduct method. +#' res <- compareSpectra(sps_ms2[2:3], sps_ms2[10:20]) +#' ## first row contains comparisons of spectrum 2 with spectra 10 to 20 and +#' ## the second row comparisons of spectrum 3 with spectra 10 to 20 +#' res +#' +#' ## We next calculate the pairwise similarity for the first 10 spectra +#' compareSpectra(sps_ms2[1:10]) +#' +#' ## Use compareSpectra to determine the number of common (matching) peaks +#' ## with a ppm of 10: +#' ## type = "inner" uses an *inner join* to match peaks, i.e. keeps only +#' ## peaks that can be mapped between both spectra. The provided FUN returns +#' ## simply the number of matching peaks. +#' compareSpectra(sps_ms2[2:3], sps_ms2[10:20], ppm = 10, type = "inner", +#' FUN = function(x, y, ...) nrow(x)) +#' +#' ## We repeat this calculation between all pairwise combinations +#' ## of the first 20 spectra +#' compareSpectra(sps_ms2[1:20], ppm = 10, type = "inner", +#' FUN = function(x, y, ...)
nrow(x)) +NULL + +#' @rdname compareSpectra setMethod("compareSpectra", signature(x = "Spectra", y = "Spectra"), function(x, y, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, FUN = ndotproduct, ..., SIMPLIFY = TRUE) { @@ -2109,7 +3671,7 @@ setMethod("compareSpectra", signature(x = "Spectra", y = "Spectra"), mat <- as.vector(mat) mat }) -#' @rdname Spectra +#' @rdname compareSpectra setMethod("compareSpectra", signature(x = "Spectra", y = "missing"), function(x, y = NULL, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, FUN = ndotproduct, ..., SIMPLIFY = TRUE) { @@ -2125,3 +3687,15 @@ setMethod("compareSpectra", signature(x = "Spectra", y = "missing"), mat <- as.vector(mat) mat }) + + +################################################################################ +## +## methods with documentation in Spectra-functions.R +## +################################################################################ + +#' @rdname processingChunkSize +setMethod("backendBpparam", "Spectra", function(object, BPPARAM = bpparam()) { + backendBpparam(object@backend, BPPARAM) +}) diff --git a/R/countIdentifications.R b/R/countIdentifications.R index b7ddb687..2f3e8c15 100644 --- a/R/countIdentifications.R +++ b/R/countIdentifications.R @@ -40,6 +40,10 @@ #' spectra variable `countIdentifications` with the number of #' identification for each scan. #' +#' @seealso +#' +#' [addProcessing()] for other data analysis functions. +#' #' @author Laurent Gatto #' #' @export diff --git a/R/peaks-functions.R b/R/peaks-functions.R index 7639538a..f34adde9 100644 --- a/R/peaks-functions.R +++ b/R/peaks-functions.R @@ -308,7 +308,13 @@ NULL #' #' @author Johannes Rainer, Michael Witting #' -#' @seealso [gnps()] +#' @seealso +#' +#' - [compareSpectra()] for the function to calculate similarities between +#' spectra. +#' +#' - [gnps()] in the *MsCoreUtils* package for more information on the GNPS +#' similarity score. #' #' @importFrom MsCoreUtils join ppm #' diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index 19bd8b7c..0bf98b0a 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -938,7 +938,7 @@ This backend provides an \code{export()} method to export data from a \code{Spec The parameters are: \itemize{ \item \code{object}: an instance of the \code{MsBackendMzR} class. -\item \code{x}: the \linkS4class{Spectra} object to be exported. +\item \code{x}: the \link{Spectra} object to be exported. \item \code{file}: \code{character} with the (full) output file name(s). Should be of length 1 or equal \code{length(x)}. If a single file is specified, all spectra are exported to that file. Alternatively it is possible to specify @@ -952,7 +952,7 @@ backend and if \code{dataOrigin(x)} contains the original MS data file names. \item \code{BPPARAM}: parallel processing settings. } -See examples in \linkS4class{Spectra} or the vignette for more details and +See examples in \link{Spectra} or the vignette for more details and examples. 
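The export workflow summarized above can be sketched as follows (a hedged illustration; `sps` is assumed to be an existing `Spectra` object and writing mzML requires the mzR package):

library(Spectra)
out_file <- tempfile(fileext = ".mzML")
## Export all spectra of `sps` to a single mzML file through the MsBackendMzR
## backend; with one file name per spectrum the data would be split instead.
## export(sps, backend = MsBackendMzR(), file = out_file)
## Re-importing the exported file should yield an equivalent object:
## sps_in <- Spectra(out_file, source = MsBackendMzR())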
The \code{MsBackendMzR} ignores parameter \code{columns} of the \code{peaksData()} diff --git a/man/Spectra.Rd b/man/Spectra.Rd index b4f87b54..1116f60c 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -1,169 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Spectra-functions.R, R/Spectra.R -\name{applyProcessing} -\alias{applyProcessing} -\alias{concatenateSpectra} -\alias{combineSpectra} -\alias{joinSpectraData} -\alias{processingLog} -\alias{deisotopeSpectra} -\alias{reduceSpectra} -\alias{filterPrecursorMaxIntensity} -\alias{filterPrecursorIsotopes} -\alias{scalePeaks} -\alias{filterPrecursorPeaks} +% Please edit documentation in R/Spectra.R +\name{Spectra} \alias{Spectra} \alias{Spectra-class} -\alias{[,Spectra-method} -\alias{uniqueMsLevels} -\alias{uniqueMsLevels,Spectra-method} -\alias{combinePeaks} +\alias{setBackend} +\alias{export} \alias{Spectra,missing-method} \alias{Spectra,MsBackend-method} \alias{Spectra,character-method} \alias{Spectra,ANY-method} \alias{setBackend,Spectra,MsBackend-method} -\alias{c,Spectra-method} -\alias{split,Spectra,ANY-method} \alias{export,Spectra-method} -\alias{acquisitionNum,Spectra-method} -\alias{peaksData,Spectra-method} -\alias{peaksVariables,Spectra-method} -\alias{centroided,Spectra-method} -\alias{centroided<-,Spectra-method} -\alias{collisionEnergy,Spectra-method} -\alias{collisionEnergy<-,Spectra-method} -\alias{dataOrigin,Spectra-method} -\alias{dataOrigin<-,Spectra-method} -\alias{dataStorage,Spectra-method} -\alias{dropNaSpectraVariables,Spectra-method} -\alias{intensity,Spectra-method} -\alias{ionCount,Spectra-method} -\alias{isCentroided,Spectra-method} -\alias{isEmpty,Spectra-method} -\alias{isolationWindowLowerMz,Spectra-method} -\alias{isolationWindowLowerMz<-,Spectra-method} -\alias{isolationWindowTargetMz,Spectra-method} -\alias{isolationWindowTargetMz<-,Spectra-method} -\alias{isolationWindowUpperMz,Spectra-method} -\alias{isolationWindowUpperMz<-,Spectra-method} -\alias{containsMz,Spectra-method} -\alias{containsNeutralLoss,Spectra-method} -\alias{spectrapply,Spectra-method} -\alias{length,Spectra-method} -\alias{msLevel,Spectra-method} -\alias{mz,Spectra-method} -\alias{lengths,Spectra-method} -\alias{polarity,Spectra-method} -\alias{polarity<-,Spectra-method} -\alias{precScanNum,Spectra-method} -\alias{precursorCharge,Spectra-method} -\alias{precursorIntensity,Spectra-method} -\alias{precursorMz,Spectra-method} -\alias{rtime,Spectra-method} -\alias{rtime<-,Spectra-method} -\alias{scanIndex,Spectra-method} -\alias{selectSpectraVariables,Spectra-method} -\alias{smoothed,Spectra-method} -\alias{smoothed<-,Spectra-method} -\alias{spectraData,Spectra-method} -\alias{spectraData<-,Spectra-method} -\alias{spectraNames,Spectra-method} -\alias{spectraNames<-,Spectra-method} -\alias{spectraVariables,Spectra-method} -\alias{tic,Spectra-method} -\alias{$,Spectra-method} -\alias{$<-,Spectra-method} -\alias{[[,Spectra-method} -\alias{[[<-,Spectra-method} -\alias{filterAcquisitionNum,Spectra-method} -\alias{filterEmptySpectra,Spectra-method} -\alias{filterDataOrigin,Spectra-method} -\alias{filterDataStorage,Spectra-method} -\alias{filterFourierTransformArtefacts,Spectra-method} -\alias{filterIntensity,Spectra-method} -\alias{filterIsolationWindow,Spectra-method} -\alias{filterMsLevel,Spectra-method} -\alias{filterMzRange,Spectra-method} -\alias{filterMzValues,Spectra-method} -\alias{filterPolarity,Spectra-method} -\alias{filterPrecursorMz,Spectra-method} 
-\alias{filterPrecursorMzRange,Spectra-method} -\alias{filterPrecursorMzValues,Spectra-method} -\alias{filterPrecursorCharge,Spectra-method} -\alias{filterPrecursorScan,Spectra-method} -\alias{filterRt,Spectra-method} -\alias{reset,Spectra-method} -\alias{filterRanges,Spectra-method} -\alias{filterValues,Spectra-method} -\alias{bin,Spectra-method} -\alias{compareSpectra,Spectra,Spectra-method} -\alias{compareSpectra,Spectra,missing-method} -\alias{pickPeaks,Spectra-method} -\alias{replaceIntensitiesBelow,Spectra-method} -\alias{smooth,Spectra-method} -\alias{addProcessing,Spectra-method} -\alias{coreSpectraVariables} -\alias{backendBpparam,Spectra-method} -\alias{combinePeaks,Spectra-method} -\alias{entropy,Spectra-method} -\alias{entropy,ANY-method} \alias{dataStorageBasePath,Spectra-method} \alias{dataStorageBasePath<-,Spectra-method} -\alias{asDataFrame} \title{The Spectra class to manage and access MS data} \usage{ -applyProcessing( - object, - f = processingChunkFactor(object), - BPPARAM = bpparam(), - ... -) - -concatenateSpectra(x, ...) - -combineSpectra( - x, - f = x$dataStorage, - p = x$dataStorage, - FUN = combinePeaksData, - ..., - BPPARAM = bpparam() -) - -joinSpectraData(x, y, by.x = "spectrumId", by.y, suffix.y = ".y") - -processingLog(x) - -deisotopeSpectra( - x, - substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"), - tolerance = 0, - ppm = 20, - charge = 1 -) - -reduceSpectra(x, tolerance = 0, ppm = 20) - -filterPrecursorMaxIntensity(x, tolerance = 0, ppm = 20) - -filterPrecursorIsotopes( - x, - tolerance = 0, - ppm = 20, - substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL") -) - -scalePeaks(x, by = sum, msLevel. = uniqueMsLevels(x)) - -filterPrecursorPeaks( - object, - tolerance = 0, - ppm = 20, - mz = c("==", ">="), - msLevel. = uniqueMsLevels(object) -) - \S4method{Spectra}{missing}( object, processingQueue = list(), @@ -209,431 +60,24 @@ filterPrecursorPeaks( BPPARAM = bpparam() ) -\S4method{c}{Spectra}(x, ...) - -\S4method{split}{Spectra,ANY}(x, f, drop = FALSE, ...) - \S4method{export}{Spectra}(object, backend, ...) -\S4method{acquisitionNum}{Spectra}(object) - -\S4method{peaksData}{Spectra}( - object, - columns = c("mz", "intensity"), - f = processingChunkFactor(object), - ..., - BPPARAM = bpparam() -) - -\S4method{peaksVariables}{Spectra}(object) - -\S4method{centroided}{Spectra}(object) - -\S4method{centroided}{Spectra}(object) <- value - -\S4method{collisionEnergy}{Spectra}(object) - -\S4method{collisionEnergy}{Spectra}(object) <- value - -\S4method{dataOrigin}{Spectra}(object) - -\S4method{dataOrigin}{Spectra}(object) <- value - -\S4method{dataStorage}{Spectra}(object) - -\S4method{dropNaSpectraVariables}{Spectra}(object) - -\S4method{intensity}{Spectra}(object, f = processingChunkFactor(object), ...) - -\S4method{ionCount}{Spectra}(object) - -\S4method{isCentroided}{Spectra}(object, ...) 
- -\S4method{isEmpty}{Spectra}(x) - -\S4method{isolationWindowLowerMz}{Spectra}(object) - -\S4method{isolationWindowLowerMz}{Spectra}(object) <- value - -\S4method{isolationWindowTargetMz}{Spectra}(object) - -\S4method{isolationWindowTargetMz}{Spectra}(object) <- value - -\S4method{isolationWindowUpperMz}{Spectra}(object) - -\S4method{isolationWindowUpperMz}{Spectra}(object) <- value - -\S4method{containsMz}{Spectra}( - object, - mz = numeric(), - tolerance = 0, - ppm = 20, - which = c("any", "all"), - BPPARAM = bpparam() -) - -\S4method{containsNeutralLoss}{Spectra}( - object, - neutralLoss = 0, - tolerance = 0, - ppm = 20, - BPPARAM = bpparam() -) - -\S4method{spectrapply}{Spectra}( - object, - FUN, - ..., - chunkSize = integer(), - f = factor(), - BPPARAM = SerialParam() -) - -\S4method{length}{Spectra}(x) - -\S4method{msLevel}{Spectra}(object) - -\S4method{mz}{Spectra}(object, f = processingChunkFactor(object), ...) - -\S4method{lengths}{Spectra}(x, use.names = FALSE) - -\S4method{polarity}{Spectra}(object) - -\S4method{polarity}{Spectra}(object) <- value - -\S4method{precScanNum}{Spectra}(object) - -\S4method{precursorCharge}{Spectra}(object) - -\S4method{precursorIntensity}{Spectra}(object) - -\S4method{precursorMz}{Spectra}(object) - -\S4method{rtime}{Spectra}(object) - -\S4method{rtime}{Spectra}(object) <- value - -\S4method{scanIndex}{Spectra}(object) - -\S4method{selectSpectraVariables}{Spectra}( - object, - spectraVariables = union(spectraVariables(object), peaksVariables(object)) -) - -\S4method{smoothed}{Spectra}(object) - -\S4method{smoothed}{Spectra}(object) <- value - -\S4method{spectraData}{Spectra}(object, columns = spectraVariables(object)) - -\S4method{spectraData}{Spectra}(object) <- value - -\S4method{spectraNames}{Spectra}(object) - -\S4method{spectraNames}{Spectra}(object) <- value - -\S4method{spectraVariables}{Spectra}(object) - -\S4method{tic}{Spectra}(object, initial = TRUE) - -\S4method{$}{Spectra}(x, name) - -\S4method{$}{Spectra}(x, name) <- value - -\S4method{[[}{Spectra}(x, i, j, ...) - -\S4method{[[}{Spectra}(x, i, j, ...) <- value - -\S4method{[}{Spectra}(x, i, j, ..., drop = FALSE) - -\S4method{filterAcquisitionNum}{Spectra}( - object, - n = integer(), - dataStorage = character(), - dataOrigin = character() -) - -\S4method{filterEmptySpectra}{Spectra}(object) - -\S4method{filterDataOrigin}{Spectra}(object, dataOrigin = character()) - -\S4method{filterDataStorage}{Spectra}(object, dataStorage = character()) - -\S4method{filterFourierTransformArtefacts}{Spectra}( - object, - halfWindowSize = 0.05, - threshold = 0.2, - keepIsotopes = TRUE, - maxCharge = 5, - isotopeTolerance = 0.005 -) - -\S4method{filterIntensity}{Spectra}( - object, - intensity = c(0, Inf), - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{filterIsolationWindow}{Spectra}(object, mz = numeric()) - -\S4method{filterMsLevel}{Spectra}(object, msLevel. = integer()) - -\S4method{filterMzRange}{Spectra}( - object, - mz = numeric(), - msLevel. = uniqueMsLevels(object), - keep = TRUE -) - -\S4method{filterMzValues}{Spectra}( - object, - mz = numeric(), - tolerance = 0, - ppm = 20, - msLevel. 
= uniqueMsLevels(object), - keep = TRUE -) - -\S4method{filterPolarity}{Spectra}(object, polarity = integer()) - -\S4method{filterPrecursorMz}{Spectra}(object, mz = numeric()) - -\S4method{filterPrecursorMzRange}{Spectra}(object, mz = numeric()) - -\S4method{filterPrecursorMzValues}{Spectra}(object, mz = numeric(), ppm = 20, tolerance = 0) - -\S4method{filterPrecursorCharge}{Spectra}(object, z = integer()) - -\S4method{filterPrecursorScan}{Spectra}(object, acquisitionNum = integer(), f = dataOrigin(object)) - -\S4method{filterRt}{Spectra}(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) - -\S4method{reset}{Spectra}(object, ...) - -\S4method{filterRanges}{Spectra}( - object, - spectraVariables = character(), - ranges = numeric(), - match = c("all", "any") -) - -\S4method{filterValues}{Spectra}( - object, - spectraVariables = character(), - values = numeric(), - ppm = 0, - tolerance = 0, - match = c("all", "any") -) - -\S4method{bin}{Spectra}( - x, - binSize = 1L, - breaks = NULL, - msLevel. = uniqueMsLevels(x), - FUN = sum, - zero.rm = TRUE -) - -\S4method{compareSpectra}{Spectra,Spectra}( - x, - y, - MAPFUN = joinPeaks, - tolerance = 0, - ppm = 20, - FUN = ndotproduct, - ..., - SIMPLIFY = TRUE -) - -\S4method{compareSpectra}{Spectra,missing}( - x, - y = NULL, - MAPFUN = joinPeaks, - tolerance = 0, - ppm = 20, - FUN = ndotproduct, - ..., - SIMPLIFY = TRUE -) - -\S4method{pickPeaks}{Spectra}( - object, - halfWindowSize = 2L, - method = c("MAD", "SuperSmoother"), - snr = 0, - k = 0L, - descending = FALSE, - threshold = 0, - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{replaceIntensitiesBelow}{Spectra}( - object, - threshold = min, - value = 0, - msLevel. = uniqueMsLevels(object) -) - -\S4method{smooth}{Spectra}( - x, - halfWindowSize = 2L, - method = c("MovingAverage", "WeightedMovingAverage", "SavitzkyGolay"), - msLevel. = uniqueMsLevels(x), - ... -) - -\S4method{addProcessing}{Spectra}(object, FUN, ..., spectraVariables = character()) - -coreSpectraVariables() - -\S4method{uniqueMsLevels}{Spectra}(object, ...) - -\S4method{backendBpparam}{Spectra}(object, BPPARAM = bpparam()) - -\S4method{combinePeaks}{Spectra}( - object, - tolerance = 0, - ppm = 20, - intensityFun = base::mean, - mzFun = base::mean, - weighted = TRUE, - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{entropy}{Spectra}(object, normalized = TRUE) - -\S4method{entropy}{ANY}(object, ...) - \S4method{dataStorageBasePath}{Spectra}(object) \S4method{dataStorageBasePath}{Spectra}(object) <- value - -asDataFrame( - object, - i = seq_along(object), - spectraVars = spectraVariables(object) -) } \arguments{ -\item{object}{For \code{Spectra()}: either a \code{DataFrame} or \code{missing}. See -section on creation of \code{Spectra} objects for details. For all other -methods a \code{Spectra} object.} - -\item{f}{For \code{split()}: factor defining how to split \code{x}. See \code{\link[base:split]{base::split()}} -for details. For \code{setBackend()}: factor defining how to split the data -for parallelized copying of the spectra data to the new backend. For some -backends changing this parameter can lead to errors. -For \code{combineSpectra()}: \code{factor} defining the grouping of the spectra -that should be combined. For \code{spectrapply()}: \code{factor} how \code{object} -should be splitted. For \code{filterPrecursorScan()}: defining which spectra -belong to the same original data file (sample): Defaults to -\code{f = dataOrigin(x)}. 
-For \code{intensity()}, \code{mz()} and \code{peaksData()}: factor defining how data -should be chunk-wise loaded an processed. Defaults to -\code{\link[=processingChunkFactor]{processingChunkFactor()}}.} - -\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more -information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method -of the \linkS4class{MsBackend}.} - -\item{...}{Additional arguments.} - -\item{x}{A \code{Spectra} object.} - -\item{p}{For \code{combineSpectra()}: \code{factor} defining how to split the input -\code{Spectra} for parallel processing. Defaults to \code{x$dataStorage}, i.e., -depending on the used backend, per-file parallel processing will be -performed.} - -\item{FUN}{For \code{addProcessing()}: function to be applied to the peak matrix -of each spectrum in \code{object}. For \code{compareSpectra()}: function to compare -intensities of peaks between two spectra with each other. -For \code{combineSpectra()}: function to combine the (peak matrices) of the -spectra. See section \emph{Data manipulations} and examples below for more -details. -For \code{bin()}: function to aggregate intensity values of peaks falling -into the same bin. Defaults to \code{FUN = sum} thus summing up intensities. -For \code{spectrapply()} and \code{chunkapply()}: function to be applied to -\code{Spectra}.} - -\item{y}{A \code{Spectra} object. A \code{DataFrame} for \code{joinSpectraData()}.} - -\item{by.x}{A \code{character(1)} specifying the spectra variable used -for merging. Default is \code{"spectrumId"}.} - -\item{by.y}{A \code{character(1)} specifying the column used for -merging. Set to \code{by.x} if missing.} - -\item{suffix.y}{A \code{character(1)} specifying the suffix to be used -for making the names of columns in the merged spectra variables -unique. This suffix will be used to amend \code{names(y)}, while -\code{spectraVariables(x)} will remain unchanged.} - -\item{substDefinition}{For \code{deisotopeSpectra()} and -\code{filterPrecursorIsotopes()}: \code{matrix} or \code{data.frame} with definitions -of isotopic substitutions. Uses by default isotopic substitutions -defined from all compounds in the Human Metabolome Database (HMDB). See -\code{\link[=isotopologues]{isotopologues()}} or \code{\link[=isotopicSubstitutionMatrix]{isotopicSubstitutionMatrix()}} for details.} - -\item{tolerance}{For \code{compareSpectra()}, \code{containsMz()}, -\code{deisotopeSpectra()}, \code{filterMzValues()} and \code{reduceSpectra()}: -\code{numeric(1)} allowing to define a constant maximal accepted difference -between m/z values for peaks to be matched (or grouped). For -\code{containsMz()} it can also be of length equal \code{mz} to specify a different -tolerance for each m/z value. -For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the -(constant) maximal accepted difference of precursor m/z values of -spectra for grouping them into \emph{precursor groups}. For -\code{filterPrecursorIsotopes()}: passed directly to the \code{\link[=isotopologues]{isotopologues()}} -function. For \code{filterValues()}: \code{numeric} of any length allowing to -define a maximal accepted difference between user input \code{values} and the -\code{spectraVariables} values. If it is not equal to the length of the -value provided with parameter \code{spectraVariables}, \code{tolerance[1]} will be -recycled. 
Default is \code{tolerance = 0}} - -\item{ppm}{For \code{compareSpectra()}, \code{containsMz()}, \code{deisotopeSpectra()}, -\code{filterMzValues()} and \code{reduceSpectra()}: \code{numeric(1)} -defining a relative, m/z-dependent, maximal accepted difference between -m/z values for peaks to be matched (or grouped). -For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the relative -maximal accepted difference of precursor m/z values of spectra for -grouping them into \emph{precursor groups}. For \code{filterPrecursorIsotopes()}: -passed directly to the \code{\link[=isotopologues]{isotopologues()}} function. -For \code{filterValues()}: \code{numeric} of any length allowing to define -a maximal accepted difference between user input \code{values} and the -\code{spectraVariables} values. If it is not equal to the length of the -value provided with parameter \code{spectraVariables}, \code{ppm[1]} will be -recycled.} - -\item{charge}{For \code{deisotopeSpectra()}: expected charge of the ionized -compounds. See \code{\link[=isotopologues]{isotopologues()}} for details.} - -\item{by}{For \code{scalePeaks()}: function to calculate a single \code{numeric} from -intensity values of a spectrum by which all intensities (of -that spectrum) should be divided by. The default \code{by = sum} will -divide intensities of each spectrum by the sum of intensities of that -spectrum.} - -\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which -the function should be applied (defaults to all MS levels of \code{object}. -For \code{filterMsLevel()}: the MS level to which \code{object} should be -subsetted.} - -\item{mz}{For \code{filterIsolationWindow()}: \code{numeric(1)} with the m/z value to -filter the object. For \code{filterPrecursorMz()} and \code{filterMzRange()}: -\code{numeric(2)} defining the lower and upper m/z boundary. -For \code{filterMzValues()} and \code{filterPrecursorMzValues()}: \code{numeric} with -the m/z values to match peaks or precursor m/z against.} +\item{object}{For \code{Spectra()}: an object to instantiate the \code{Spectra} +object and initialize the with data.. See section on creation of +\code{Spectra} objects for details. For all other methods a \code{Spectra} object.} \item{processingQueue}{For \code{Spectra()}: optional \code{list} of \linkS4class{ProcessingStep} objects.} \item{metadata}{For \code{Spectra()}: optional \code{list} with metadata information.} +\item{...}{Additional arguments.} + \item{backend}{For \code{Spectra()}: \linkS4class{MsBackend} to be used as backend. See section on creation of \code{Spectra} objects for details. For \code{setBackend()}: instance of \linkS4class{MsBackend} that supports \code{setBackend()} (i.e. for @@ -643,244 +87,56 @@ passing the full spectra data to the initialize method. See section on creation of \code{Spectra} objects for details. For \code{export()}: \linkS4class{MsBackend} to be used to export the data.} -\item{source}{For \code{Spectra()}: instance of \linkS4class{MsBackend} that can be used -to import spectrum data from the provided files. See section \emph{Creation -of objects, conversion and changing the backend} for more details.} - -\item{drop}{For \code{[}, \code{split()}: not considered.} - -\item{columns}{For \code{spectraData()} accessor: optional \code{character} with -column names (spectra variables) that should be included in the -returned \code{DataFrame}. By default, all columns are returned. 
-For \code{peaksData()} accessor: optional \code{character} with requested columns -in the individual \code{matrix} of the returned \code{list}. Defaults to -\code{c("mz", "value")} but any values returned by \code{peaksVariables(object)} -with \code{object} being the \code{Spectra} object are supported.} - -\item{value}{replacement value for \verb{<-} methods. See individual -method description or expected data type.} - -\item{which}{for \code{containsMz()}: either \code{"any"} or \code{"all"} defining whether -any (the default) or all provided \code{mz} have to be present in the -spectrum.} - -\item{neutralLoss}{for \code{containsNeutralLoss()}: \code{numeric(1)} defining the -value which should be subtracted from the spectrum's precursor m/z.} - -\item{chunkSize}{For \code{spectrapply()}: size of the chunks into which \code{Spectra} -should be split. This parameter overrides parameters \code{f} and \code{BPPARAM}.} - -\item{use.names}{For \code{lengths()}: ignored.} - -\item{spectraVariables}{\itemize{ -\item For \code{selectSpectraVariables()}: \code{character} with the -names of the spectra variables to which the backend should be -subsetted. -\itemize{ -\item For \code{addProcessing()}: \code{character} with additional spectra variables -that should be passed along to the function defined with \code{FUN}. See -function description for details. -\item For \code{filterRanges()} and \code{filterValues()}: \code{character} vector -specifying the column(s) from \code{spectraData(object)} on which to filter -the data and that correspond to the the names of the spectra variables -that should be used for the filtering. -} -}} - -\item{initial}{For \code{tic()}: \code{logical(1)} whether the initially -reported total ion current should be reported, or whether the -total ion current should be (re)calculated on the actual data -(\code{initial = FALSE}, same as \code{ionCount()}).} - -\item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return -or set.} - -\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the -object. For \code{asDataFrame()} an \code{numeric} indicating which scans to coerce -to a \code{DataFrame} (default is \code{seq_along(object)}).} - -\item{j}{For \code{[}: not supported.} - -\item{n}{for \code{filterAcquisitionNum()}: \code{integer} with the acquisition -numbers to filter for.} - -\item{dataStorage}{For \code{filterDataStorage()}: \code{character} to define which -spectra to keep. -For \code{filterAcquisitionNum()}: optionally specify if filtering should -occur only for spectra of selected \code{dataStorage}.} +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method +of the \linkS4class{MsBackend}.} -\item{dataOrigin}{For \code{filterDataOrigin()}: \code{character} to define which -spectra to keep. -For \code{filterAcquisitionNum()}: optionally specify if filtering should -occurr only for spectra of selected \code{dataOrigin}.} +\item{source}{For \code{Spectra()}: instance of \linkS4class{MsBackend} that can be +used to import spectrum data from the provided files. See section +\emph{Creation of objects} for more details.} -\item{halfWindowSize}{\itemize{ -\item For \code{pickPeaks()}: \code{integer(1)}, used in the -identification of the mass peaks: a local maximum has to be the maximum -in the window from \code{(i - halfWindowSize):(i + halfWindowSize)}. 
-\itemize{ -\item For \code{smooth()}: \code{integer(1)}, used in the smoothing algorithm, the -window reaches from \code{(i - halfWindowSize):(i + halfWindowSize)}. -\item For \code{filterFourierTransformArtefacts()}: \code{numeric(1)} defining the m/z -window left and right of a peak where to remove fourier transform -artefacts. -} -}} +\item{f}{For \code{setBackend()}: factor defining how to split the data +for parallelized copying of the spectra data to the new backend. For +some backends changing this parameter can lead to errors. Defaults to +\code{\link[=processingChunkFactor]{processingChunkFactor()}}.} -\item{threshold}{\itemize{ -\item For \code{pickPeaks()}: a \code{double(1)} defining the proportion of the maximal -peak intensity. Just values above are used for the weighted mean -calculation. -\itemize{ -\item For \code{replaceIntensitiesBelow()}: a \code{numeric(1)} defining the threshold -or a \code{function} to calculate the threshold for each spectrum on its -intensity values. Defaults to \code{threshold = min}. -\item For \code{filterFourierTransformArtefacts()}: the relative intensity (to a -peak) below which peaks are considered fourier artefacts. Defaults to -\code{threshold = 0.2} hence removing peaks that have an intensity below 0.2 -times the intensity of the tested peak (within the selected -\code{halfWindowSize}). +\item{value}{For \code{dataStorageBasePath()}: A \code{character} vector that defines +the base directory where the data storage files can be found.} } -}} - -\item{keepIsotopes}{For \code{filterFourierTransformArtefacts()}: whether isotope -peaks should not be removed as fourier artefacts.} - -\item{maxCharge}{For \code{filterFourierTransformArtefacts()}: the maximum charge -to be considered for isotopes.} - -\item{isotopeTolerance}{For \code{filterFourierTransformArtefacts()}: the m/z -\code{tolerance} to be used to define whether peaks might be isotopes of -the current tested peak.} - -\item{intensity}{For \code{filterIntensity()}: \code{numeric} of length 1 or 2 -defining either the lower or the lower and upper intensity limit for the -filtering, or a \code{function} that takes the intensities as input and -returns a \code{logical} (same length then peaks in the spectrum) whether the -peak should be retained or not. Defaults to \code{intensity = c(0, Inf)} thus -only peaks with \code{NA} intensity are removed.} - -\item{keep}{For \code{filterMzValues()} and \code{filterMzRange()}: \code{logical(1)} -whether the matching peaks should be retained (\code{keep = TRUE}, the -default) or dropped (\code{keep = FALSE}).} - -\item{polarity}{for \code{filterPolarity()}: \code{integer} specifying the polarity to -to subset \code{object}.} - -\item{z}{For \code{filterPrecursorCharge()}: \code{integer()} with the precursor -charges to be used as filter.} - -\item{acquisitionNum}{for \code{filterPrecursorScan()}: \code{integer} with the -acquisition number of the spectra to which the object should be -subsetted.} - -\item{rt}{for \code{filterRt()}: \code{numeric(2)} defining the retention time range to -be used to subset/filter \code{object}.} - -\item{ranges}{for \code{filterRanges()}: A \code{numeric} vector of paired values -(upper and lower boundary) that define the ranges to filter the \code{object}. 
-These paired values need to be in the same order as the -\code{spectraVariables} parameter (see below).} - -\item{match}{For \code{filterRanges()} and \code{filterValues()}: \code{character(1) } -defining whether the condition has to match for all provided -\code{ranges}/\code{values} (\code{match = "all"}; the default), or for any of them -(\code{match = "any"}) for spectra to be retained.} - -\item{values}{for \code{filterValues()}: A \code{numeric} vector that define the -values to filter the Spectra data. These values need to be in the same -order as the \code{spectraVariables} parameter.} - -\item{binSize}{For \code{bin()}: \code{numeric(1)} defining the size for the m/z bins. -Defaults to \code{binSize = 1}.} - -\item{breaks}{For \code{bin()}: \code{numeric} defining the m/z breakpoints between -bins.} - -\item{zero.rm}{\code{logical}. For \code{bin()}: indicating whether to remove bins -with zero intensity. Defaults to \code{TRUE}, meaning the function will -discard bins created with an intensity of 0 to enhance memory efficiency.} - -\item{MAPFUN}{For \code{compareSpectra()}: function to map/match peaks between the -two compared spectra. See \code{\link[=joinPeaks]{joinPeaks()}} for more information and possible -functions.} - -\item{SIMPLIFY}{For \code{compareSpectra()} whether the result matrix should be -\emph{simplified} to a \code{numeric} if possible (i.e. if either \code{x} or \code{y} is -of length 1).} - -\item{method}{\itemize{ -\item For \code{pickPeaks()}: \code{character(1)}, the noise estimators that -should be used, currently the the \emph{M}edian \emph{A}bsolute \emph{D}eviation -(\code{method = "MAD"}) and Friedman's Super Smoother -(\code{method = "SuperSmoother"}) are supported. +\description{ +The \code{Spectra} class encapsules spectral mass spectrometry (MS) data and +related metadata. The MS data is represented by a \emph{backend} extending the +virual \link{MsBackend} class which provides the data to the \code{Spectra} object. +The \code{Spectra} class implements only data accessor, filtering and analysis +methods for the MS data and relies on its \emph{backend} to provide the MS data. +This allows to change data representations of a \code{Spectra} object depending +on the user's needs and properties of the data. Different backends and +their properties are explained in the \link{MsBackend} documentation. + +Documentation on other topics and functionality of \code{Spectra}can be found in: \itemize{ -\item For \code{smooth()}: \code{character(1)}, the smoothing function that should be -used, currently, the Moving-Average- (\code{method = "MovingAverage"}), -Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")}, -Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported. -} -}} - -\item{snr}{For \code{pickPeaks()}: \code{double(1)} defining the -\emph{S}ignal-to-\emph{N}oise-\emph{R}atio. 
The intensity of a local maximum has to be -higher than \code{snr * noise} to be considered as peak.} - -\item{k}{For \code{pickPeaks()}: \code{integer(1)}, number of values left and right of -the peak that should be considered in the weighted mean calculation.} - -\item{descending}{For \code{pickPeaks()}: \code{logical}, if \code{TRUE} just values between -the nearest valleys around the peak centroids are used.} - -\item{intensityFun}{For \code{combinePeaks()}: function to be used to aggregate -intensities for all peaks in each peak group into a single intensity -value.} - -\item{mzFun}{For \code{combinePeaks()}: function to aggregate m/z values for all -peaks within each peak group into a single m/z value. This parameter -is ignored if \code{weighted = TRUE} (the default).} - -\item{weighted}{For \code{combinePeaks()}: \code{logical(1)} whether m/z values of -peaks within each peak group should be aggregated into a single m/z -value using an intensity-weighted mean. Defaults to \code{weighted = TRUE}.} - -\item{normalized}{for \code{entropy()}: \code{logical(1)} whether the normalized -entropy should be calculated (default). See also \code{\link[=nentropy]{nentropy()}} for -details.} - -\item{spectraVars}{\code{character()} indicating what spectra variables to add to -the \code{DataFrame}. Default is \code{spectraVariables(object)}, i.e. all -available variables.} -} -\value{ -See individual method description for the return value. +\description{ +The \code{Spectra} class encapsulates spectral mass spectrometry (MS) data and +related metadata. The MS data is represented by a \emph{backend} extending the +virtual \link{MsBackend} class which provides the data to the \code{Spectra} object. +The \code{Spectra} class implements only data accessor, filtering and analysis +methods for the MS data and relies on its \emph{backend} to provide the MS data. +This allows to change data representations of a \code{Spectra} object depending +on the user's needs and properties of the data. Different backends and +their properties are explained in the \link{MsBackend} documentation. + +Documentation on other topics and functionality of \code{Spectra} can be found in: \itemize{ +\item \code{\link[=spectraData]{spectraData()}} for accessing and using MS data through \code{Spectra} objects. +\item \code{\link[=filterMsLevel]{filterMsLevel()}} to subset and filter \code{Spectra} objects. +\item \code{\link[=plotSpectra]{plotSpectra()}} for visualization of \code{Spectra} objects. +\item \code{\link[=processingChunkSize]{processingChunkSize()}} for information on parallel and chunk-wise data +processing. +\item \code{\link[=combineSpectra]{combineSpectra()}} for merging, aggregating and splitting of \code{Spectra} +objects. +\item \code{\link[=combinePeaks]{combinePeaks()}} for merging and aggregating \code{Spectra}'s mass peaks data. +\item \code{\link[=addProcessing]{addProcessing()}} for data analysis functions. +\item \code{\link[=compareSpectra]{compareSpectra()}} for spectra similarity calculations. } -\description{ -The \code{Spectra} class encapsules spectral mass spectrometry data and -related metadata. - -It supports multiple data backends, e.g. in-memory (\link{MsBackendMemory}, -\code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}}), on-disk as mzML (\code{\link[=MsBackendMzR]{MsBackendMzR()}}) or HDF5 -(\code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}). } \details{ The \code{Spectra} class uses by default a lazy data manipulation strategy, i.e. data manipulations such as performed with \code{replaceIntensitiesBelow()} are not applied immediately to the data, but applied on-the-fly to the -spectrum data once it is retrieved. For some backends that allow to write -data back to the data storage (such as the \code{\link[=MsBackendMemory]{MsBackendMemory()}}, -\code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} and \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) it is possible to apply -to queue with the \code{applyProcessing} function. See the *Data manipulation and -analysis \emph{methods} section below for more details. - -For more information on parallel or chunk-wise processing (especially -helpful for very large data sets) see \code{\link[=processingChunkSize]{processingChunkSize()}}.
- -To apply arbitrary functions to a \code{Spectra} use the \code{spectrapply()} function -(or directly \code{\link[=chunkapply]{chunkapply()}} for chunk-wise processing). See description of -the \code{spectrapply()} function below for details. - -For details on plotting spectra, see \code{\link[=plotSpectra]{plotSpectra()}}. +spectrum data once it is retrieved. This enables data manipulation +operations also for \emph{read-only} data representations. For some backends that +allow to write data back to the data storage (such as the +\code{\link[=MsBackendMemory]{MsBackendMemory()}}, \code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} and \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) it +is possible to apply the queue with the \code{\link[=applyProcessing]{applyProcessing()}} function (see +the \code{\link[=applyProcessing]{applyProcessing()}} function for details). Clarifications regarding scan/acquisition numbers and indices: \itemize{ @@ -897,15 +153,33 @@ the \code{acquisitionNum}) See also \href{https://github.com/lgatto/MSnbase/issues/525}{this issue}. } -\section{Creation of objects, conversion, changing the backend and export}{ + +\section{Data stored in a \code{Spectra} object}{ + + +The \code{Spectra} object is a container for MS data that includes mass peak +data (\emph{m/z} and related intensity values, also referred to as \emph{peaks data} +in the context of \code{Spectra}) and metadata of individual spectra (so called +\emph{spectra variables}). While a core set of spectra variables (the +\code{coreSpectraVariables()}) is guaranteed to be provided by a +\code{Spectra}, it is possible to add arbitrary additional spectra variables to +a \code{Spectra} object. + +The \code{Spectra} object is designed to contain MS data of a (large) set of mass +spectra. The data is organized \emph{linearly} and can be thought of as a list of +mass spectra, i.e. each element in the \code{Spectra} is one spectrum. +} + +\section{Creation of objects}{ \code{Spectra} classes can be created with the \code{Spectra()} constructor function which supports the following formats: \itemize{ \item parameter \code{object} is a \code{data.frame} or \code{DataFrame} containing the -spectrum data. The provided \code{backend} (by default a -\linkS4class{MsBackendMemory}) will be initialized with that data. +full spectrum data (spectra variables in columns as well as columns +with the individual MS peak data, \emph{m/z} and intensity). The provided +\code{backend} (by default a \linkS4class{MsBackendMemory}) will be initialized +with that data. \item parameter \code{object} is a \linkS4class{MsBackend} (assumed to be already initialized). \item parameter \code{object} is missing, in which case it is supposed that the data @@ -920,41 +194,79 @@ which allows to import spectra data from mzML, mzXML or CDF files. With \code{...} additional arguments can be passed to the backend's \code{\link[=backendInitialize]{backendInitialize()}} method. Parameter \code{backend} allows to specify which -\linkS4class{MsBackend} should be used for data storage. +\linkS4class{MsBackend} should be used for data representation and storage. +} + +\section{Data representation of a \code{Spectra}}{ + + +The MS data which can be accessed through the \code{Spectra} object is +\emph{represented} by its backend, which means that this backend defines how +and where the data is stored (e.g. in memory or on disk). The \code{Spectra} +object relies on the backend to provide the MS data whenever it needs it +for data processing.
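The construction variants described in the *Creation of objects* section above can be sketched as follows (an illustrative example with made-up values; the mzML file names are placeholders):

library(Spectra)
library(S4Vectors)

## (1) From a DataFrame with spectra variables and the peak data in list
##     columns "mz" and "intensity"; the default MsBackendMemory is used.
spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.2, 2.4))
spd$mz <- list(c(100.1, 200.2, 300.3), c(50.5, 150.5))
spd$intensity <- list(c(10, 20, 30), c(5, 15))
sps_mem <- Spectra(spd)

## (2) From raw data files with the on-disk MsBackendMzR backend (requires
##     the mzR package); `fls` is a placeholder for existing mzML files.
## fls <- c("file1.mzML", "file2.mzML")
## sps_mzr <- Spectra(fls, source = MsBackendMzR())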
+Different backends with different properties, such as minimal memory +requirement or fast data access, are defined in the \emph{Spectra} package or +one of the MsBackend* packages. More information on backends and their +properties is provided in the documentation of \link{MsBackend}. + +On-disk backends keep only a limited amount of data in memory retrieving +most of the data (usually the MS peak data) upon request on-the-fly from +their on-disk data representations. Moving the on-disk data storage of such +a backend or a serialized object to a different location in the file +system will cause data corruption. The \code{dataStorageBasePath()} and +\verb{dataStorageBasePath<-} functions allow in such cases (and if the backend +classes support this operation), to get or change the \emph{base} +path to the directory of the backend's data storage. In-memory backends +such as \link{MsBackendMemory} or \link{MsBackendDataFrame} keeping all MS data in +memory don't support (and don't need) this function, but for \link{MsBackendMzR} this +function can be used to update/adapt the path to the directory containing +the original data files. Thus, for \code{Spectra} objects (using this backend) +that were moved to another file system or computer, these functions allow to +adjust/adapt the base file path. +} + +\section{Changing data representation of a \code{Spectra}}{ - -The backend of a \code{Spectra} object can be changed with the \code{setBackend()} -method that takes an instance of the new backend as second parameter -\code{backend}. A call to \code{setBackend(sps, backend = MsBackendDataFrame())} + +The data representation, i.e. the backend of a \code{Spectra} object can be +changed with the \code{setBackend()} method that takes an instance of the new +backend as second parameter \code{backend}. A call to +\code{setBackend(sps, backend = MsBackendDataFrame())} would for example change the backend of \code{sps} to the \emph{in-memory} \code{MsBackendDataFrame}. Changing to a backend is only supported if that backend has a \code{data} parameter in its \code{backendInitialize()} method and if \code{supportsSetBackend()} returns \code{TRUE} for that backend. \code{setBackend()} will -transfer the full spectra data from the originating backend as a -\code{DataFrame} to the new backend. -Most \emph{read-only} backends do not support \code{setBackend()}. It is for example -not possible to change the backend to a \emph{read-only} backend (such as -the \code{\link[=MsBackendMzR]{MsBackendMzR()}} backend). +transfer the full spectra data from the originating backend as a \code{DataFrame} +to the new backend. + +Generally, it is not possible to change \strong{to} a read-only backend such as +the \code{\link[=MsBackendMzR]{MsBackendMzR()}} backend. The definition of the function is: \code{setBackend(object, backend, ..., f = dataStorage(object), BPPARAM = bpparam())} and its parameters are: \itemize{ -\item parameter \code{object}: the \code{Spectra} object. -\item parameter \code{backend}: an instance of the new backend, e.g. -\verb{[MsBackendMemory()]}. -\item parameter \code{f}: factor allowing to parallelize the change of the backends.
By +default the process of copying the spectra data from the original to the new backend is performed separately (and in parallel) for each file. Users are advised to use the default setting. -\item parameter \code{...}: optional additional arguments passed to the -\code{\link[=backendInitialize]{backendInitialize()}} method of the new \code{backend}. -\item parameter \code{BPPARAM}: setup for the parallel processing. See \code{\link[=bpparam]{bpparam()}} for +\item \code{...}: optional additional arguments passed to the \code{\link[=backendInitialize]{backendInitialize()}} +method of the new \code{backend}. +\item \code{BPPARAM}: setup for the parallel processing. See \code{\link[=bpparam]{bpparam()}} for details. } +} + +\section{Exporting data from a \code{Spectra} object}{ + Data from a \code{Spectra} object can be \strong{exported} to a file with the -\code{export()} function. The actual export of the data has to be performed by +\code{export()} function. The actual export of the data is performed by the \code{export} method of the \link{MsBackend} class defined with the mandatory -parameter \code{backend}. Note however that not all backend classes support +parameter \code{backend} which defines also the format in which the data +is exported. Note however that not all backend classes support export of data. From the \code{MsBackend} classes in the \code{Spectra} package currently only the \code{MsBackendMzR} backend supports data export (to mzML/mzXML file(s)); see the help page of the \linkS4class{MsBackend} for @@ -971,604 +283,12 @@ of the data (i.e. which has a defined \code{export} method). \item \code{...}: additional parameters specific for the \code{MsBackend} passed with parameter \code{backend}. } - -The \code{dataStorageBasePath()} and \verb{dataStorageBasePath<-} functions allow, for -backend classes that support this operation, to get or change the \emph{base} -path to the directory where the backend stores the data. In-memory backends -such as \link{MsBackendMemory} or \link{MsBackendDataFrame} keeping all MS data in -memory don't support, and need, this function, but for \link{MsBackendMzR} this -function can be used to update/adapt the path to the directory containing -the original data files. Thus, for \code{Spectra} objects (using this backend) -that were moved to another file system or computer, these functions allow to -adjust/adapt the base file path. -} - -\section{Accessing spectra data}{ - -\itemize{ -\item \code{$}, \verb{$<-}: gets (or sets) a spectra variable for all spectra in \code{object}. -See examples for details. Note that replacing values of a peaks variable -is not supported with a non-empty processing queue, i.e. if any filtering -or data manipulations on the peaks data was performed. In these cases -\code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all cached data -operations. -\item \code{[[}, \verb{[[<-}: access or set/add a single spectrum variable (column) in the -backend. -\item \code{acquisitionNum()}: returns the acquisition number of each -spectrum. Returns an \code{integer} of length equal to the number of -spectra (with \code{NA_integer_} if not available). -\item \code{centroided()}, \verb{centroided<-}: gets or sets the centroiding -information of the spectra. \code{centroided()} returns a \code{logical} -vector of length equal to the number of spectra with \code{TRUE} if a -spectrum is centroided, \code{FALSE} if it is in profile mode and \code{NA} -if it is undefined. 
See also \code{isCentroided()} for estimating from -the spectrum data whether the spectrum is centroided. \code{value} -for \verb{centroided<-} is either a single \code{logical} or a \code{logical} of -length equal to the number of spectra in \code{object}. -\item \code{collisionEnergy()}, \verb{collisionEnergy<-}: gets or sets the -collision energy for all spectra in \code{object}. \code{collisionEnergy()} -returns a \code{numeric} with length equal to the number of spectra -(\code{NA_real_} if not present/defined), \verb{collisionEnergy<-} takes a -\code{numeric} of length equal to the number of spectra in \code{object}. -\item \code{coreSpectraVariables()}: returns the \emph{core} spectra variables along with -their expected data type. -\item \code{dataOrigin()}, \verb{dataOrigin<-}: gets or sets the \emph{data origin} for each -spectrum. \code{dataOrigin()} returns a \code{character} vector (same length than -\code{object}) with the origin of the spectra. \verb{dataOrigin<-} expects a -\code{character} vector (same length than \code{object}) with the replacement -values for the data origin of each spectrum. -\item \code{dataStorage()}: returns a \code{character} vector (same length than \code{object}) -with the data storage location of each spectrum. -\item \code{intensity()}: gets the intensity values from the spectra. Returns -a \code{\link[=NumericList]{NumericList()}} of \code{numeric} vectors (intensity values for each -spectrum). The length of the list is equal to the number of -\code{spectra} in \code{object}. -\item \code{ionCount()}: returns a \code{numeric} with the sum of intensities for -each spectrum. If the spectrum is empty (see \code{isEmpty()}), -\code{NA_real_} is returned. -\item \code{isCentroided()}: a heuristic approach assessing if the spectra in -\code{object} are in profile or centroided mode. The function takes -the \code{qtl}th quantile top peaks, then calculates the difference -between adjacent m/z value and returns \code{TRUE} if the first -quartile is greater than \code{k}. (See \code{Spectra:::.isCentroided()} for -the code.) -\item \code{isEmpty()}: checks whether a spectrum in \code{object} is empty -(i.e. does not contain any peaks). Returns a \code{logical} vector of -length equal number of spectra. -\item \code{isolationWindowLowerMz()}, \verb{isolationWindowLowerMz<-}: gets or sets the -lower m/z boundary of the isolation window. -\item \code{isolationWindowTargetMz()}, \verb{isolationWindowTargetMz<-}: gets or sets the -target m/z of the isolation window. -\item \code{isolationWindowUpperMz()}, \verb{isolationWindowUpperMz<-}: gets or sets the -upper m/z boundary of the isolation window. -\item \code{containsMz()}: checks for each of the spectra whether they contain mass -peaks with an m/z equal to \code{mz} (given acceptable difference as defined by -parameters \code{tolerance} and \code{ppm} - see \code{\link[=common]{common()}} for details). Parameter -\code{which} allows to define whether any (\code{which = "any"}, the default) or -all (\code{which = "all"}) of the \code{mz} have to match. The function returns -\code{NA} if \code{mz} is of length 0 or is \code{NA}. -\item \code{containsNeutralLoss()}: checks for each spectrum in \code{object} if it has a -peak with an m/z value equal to its precursor m/z - \code{neutralLoss} (given -acceptable difference as defined by parameters \code{tolerance} and \code{ppm}). -Returns \code{NA} for MS1 spectra (or spectra without a precursor m/z). 
-\item \code{length()}: gets the number of spectra in the object. -\item \code{lengths()}: gets the number of peaks (m/z-intensity values) per -spectrum. Returns an \code{integer} vector (length equal to the -number of spectra). For empty spectra, \code{0} is returned. -\item \code{msLevel()}: gets the spectra's MS level. Returns an integer vector (names -being spectrum names, length equal to the number of spectra) with the MS -level for each spectrum. -\item \code{mz()}: gets the mass-to-charge ratios (m/z) from the -spectra. Returns a \code{\link[=NumericList]{NumericList()}} or length equal to the number of -spectra, each element a \code{numeric} vector with the m/z values of -one spectrum. -\item \code{peaksData()}: gets the \emph{peaks} data for all spectra in \code{object}. Peaks -data consist of the m/z and intensity values as well as possible additional -annotations (variables) of all peaks of each spectrum. The function -returns a \code{\link[=SimpleList]{SimpleList()}} of two dimensional arrays (either \code{matrix} or -\code{data.frame}), with each array providing the values for the requested -\emph{peak variables} (by default \code{"mz"} and \code{"intensity"}). Optional parameter -\code{columns} is passed to the backend's \code{peaksData()} function to allow -the selection of specific (or additional) peaks variables (columns) that -should be extracted (if available). Importantly, -it is \strong{not} guaranteed that each backend supports this parameter (while -each backend must support extraction of \code{"mz"} and \code{"intensity"} columns). -Parameter \code{columns} defaults to \code{c("mz", "intensity")} but any value -returned by \code{peaksVariables(object)} is supported. -Note also that it is possible to extract the peak data with -\code{as(x, "list")} and \code{as(x, "SimpleList")} as a \code{list} and \code{SimpleList}, -respectively. Note however that, in contrast to \code{peaksData()}, \code{as()} -does not support the parameter \code{columns}. -\item \code{peaksVariables()}: lists the available variables for mass peaks provided -by the backend. Default peak variables are \code{"mz"} and \code{"intensity"} (which -all backends need to support and provide), but some backends might provide -additional variables. -These variables correspond to the column names of the peak data array -returned by \code{peaksData()}. -\item \code{polarity()}, \verb{polarity<-}: gets or sets the polarity for each -spectrum. \code{polarity()} returns an \code{integer} vector (length equal -to the number of spectra), with \code{0} and \code{1} representing negative -and positive polarities, respectively. \verb{polarity<-} expects an -\code{integer} vector of length 1 or equal to the number of spectra. -\item \code{precursorCharge()}, \code{precursorIntensity()}, \code{precursorMz()}, -\code{precScanNum()}, \code{precAcquisitionNum()}: gets the charge (\code{integer}), -intensity (\code{numeric}), m/z (\code{numeric}), scan index (\code{integer}) -and acquisition number (\code{interger}) of the precursor for MS level > -2 spectra from the object. Returns a vector of length equal to -the number of spectra in \code{object}. \code{NA} are reported for MS1 -spectra of if no precursor information is available. -\item \code{rtime()}, \verb{rtime<-}: gets or sets the retention times (in seconds) -for each spectrum. \code{rtime()} returns a \code{numeric} vector (length -equal to the number of spectra) with the retention time for each -spectrum. 
\verb{rtime<-} expects a numeric vector with length equal -to the number of spectra. -\item \code{scanIndex()}: returns an \code{integer} vector with the \emph{scan index} -for each spectrum. This represents the relative index of the -spectrum within each file. Note that this can be different to the -\code{acquisitionNum} of the spectrum which represents the index of the -spectrum during acquisition/measurement (as reported in the mzML file). -\item \code{smoothed()},\verb{smoothed<-}: gets or sets whether a spectrum is -\emph{smoothed}. \code{smoothed()} returns a \code{logical} vector of length equal -to the number of spectra. \verb{smoothed<-} takes a \code{logical} vector -of length 1 or equal to the number of spectra in \code{object}. -\item \code{spectraData()}: gets general spectrum metadata (annotation, also called -header). \code{spectraData()} returns a \code{DataFrame}. Note that this -method does by default \strong{not} return m/z or intensity values. -\item \verb{spectraData<-}: \strong{replaces} the full spectra data of the \code{Spectra} -object with the one provided with \code{value}. The \verb{spectraData<-} function -expects a \code{DataFrame} to be passed as value with the same number of rows -as there a spectra in \code{object}. Note that replacing values of -peaks variables is not supported with a non-empty processing queue, i.e. -if any filtering or data manipulations on the peaks data was performed. -In these cases \code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all -cached data operations and empty the processing queue. -\item \code{spectraNames()}, \verb{spectraNames<-}: gets or sets the spectra names. -\item \code{spectraVariables()}: returns a \code{character} vector with the -available spectra variables (columns, fields or attributes of each -spectrum) available in \code{object}. Note that \code{spectraVariables()} does not -list the \emph{peak variables} (\code{"mz"}, \code{"intensity"} and eventual additional -annotations for each MS peak). Peak variables are returned by -\code{peaksVariables()}. -\item \code{tic()}: gets the total ion current/count (sum of signal of a -spectrum) for all spectra in \code{object}. By default, the value -reported in the original raw data file is returned. For an empty -spectrum, \code{0} is returned. -\item \code{uniqueMsLevels()}: get the unique MS levels available in \code{object}. This -function is supposed to be more efficient than \code{unique(msLevel(object))}. -} -} - -\section{Filter spectra data}{ - - -Filter a \code{Spectra} object based on the spectra data. This includes subset -operations that immediately reduce the number of spectra in the object as -well as filters that reduce the \emph{content} of the \code{Spectra} object. -See section \emph{Filter peaks data} below for functions that filter the peaks -data of a \code{Spectra}. -\itemize{ -\item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method -\strong{always} returns a \code{Spectra} object. -\item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the -object's \code{spectraData} that contain only missing values (\code{NA}). Note that -while columns with only \code{NA}s are removed, a \code{spectraData()} call after -\code{dropNaSpectraVariables()} might still show columns containing \code{NA} values -for \emph{core} spectra variables. The total number of spectra is not changed -by this function. 
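A brief sketch of the subsetting and cleaning operations listed above (toy data; the all-`NA` spectra variable name is made up):

library(Spectra)
library(S4Vectors)

spd <- DataFrame(msLevel = c(1L, 2L, 2L), rtime = c(1.1, 2.2, 3.3),
                 annotation = NA_character_)    ## user variable with only NAs
spd$mz <- list(c(100, 200), c(100, 200, 300), c(100))
spd$intensity <- list(c(10, 20), c(1, 2, 3), c(5))
sps <- Spectra(spd)

sps[msLevel(sps) == 2L]                        ## keep only the two MS2 spectra
spectraVariables(dropNaSpectraVariables(sps))  ## "annotation" should be gone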
-\item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching -the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or -\code{dataStorage} is also provided, \code{object} is subsetted to the spectra with -an acquisition number equal to \code{n} \strong{in spectra with matching dataOrigin -or dataStorage values} retaining all other spectra. -Returns the filtered \code{Spectra}. -\item \code{filterDataOrigin()}: filters the object retaining spectra matching the -provided \code{dataOrigin}. Parameter \code{dataOrigin} has to be of type -\code{character} and needs to match exactly the data origin value of the -spectra to subset. -Returns the filtered \code{Spectra} object (with spectra ordered according to -the provided \code{dataOrigin} parameter). -\item \code{filterDataStorage()}: filters the object retaining spectra stored in the -specified \code{dataStorage}. Parameter \code{dataStorage} has to be of type -\code{character} and needs to match exactly the data storage value of the -spectra to subset. -Returns the filtered \code{Spectra} object (with spectra ordered according to -the provided \code{dataStorage} parameter). -\item \code{filterEmptySpectra()}: removes empty spectra (i.e. spectra without peaks). -Returns the filtered \code{Spectra} object (with spectra in their -original order). -\item \code{filterIsolationWindow()}: retains spectra that contain \code{mz} in their -isolation window m/z range (i.e. with an \code{isolationWindowLowerMz} <= \code{mz} -and \code{isolationWindowUpperMz} >= \code{mz}. Returns the filtered \code{Spectra} -object (with spectra in their original order). -\item \code{filterMsLevel()}: filters object by MS level keeping only spectra matching -the MS level specified with argument \code{msLevel}. Returns the filtered -\code{Spectra} (with spectra in their original order). -\item \code{filterPolarity()}: filters the object keeping only spectra matching the -provided polarity. Returns the filtered \code{Spectra} (with spectra in their -original order). -\item \code{filterPrecursorCharge()}: retains spectra with the defined precursor -charge(s). -\item \code{filterPrecursorIsotopes()}: groups MS2 spectra based on their precursor -m/z and precursor intensity into predicted isotope groups and keep for each -only the spectrum representing the monoisotopic precursor. MS1 spectra -are returned as is. See documentation for \code{deisotopeSpectra()} below for -details on isotope prediction and parameter description. -\item \code{filterPrecursorMaxIntensity()}: filters the \code{Spectra} keeping for groups -of (MS2) spectra with similar precursor m/z values (given parameters -\code{ppm} and \code{tolerance}) the one with the highest precursor intensity. The -function filters only MS2 spectra and returns all MS1 spectra. If -precursor intensities are \code{NA} for all spectra within a spectra group, the -first spectrum of that groups is returned. -Note: some manufacturers don't provide precursor intensities. These can -however also be estimated with \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}}. -\item \code{filterPrecursorMzRange()} (previously \code{filterPrecursorMz()} which is now -deprecated): retains spectra with a precursor m/z within the -provided m/z range. See examples for details on selecting spectra with -a precursor m/z for a target m/z accepting a small difference in \emph{ppm}. 
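A short sketch of how the filters above could be chained (assuming \code{sps} is an existing \code{Spectra} with MS1 and MS2 spectra, for instance one created as in the earlier sketch; the precursor m/z window is arbitrary):

## keep MS2 spectra with a precursor m/z within an arbitrary window and
## drop any spectra left without peaks
sps_ms2 <- filterMsLevel(sps, msLevel = 2L)
sps_ms2 <- filterPrecursorMzRange(sps_ms2, mz = c(200.0, 200.1))
sps_ms2 <- filterEmptySpectra(sps_ms2)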
-\item \code{filterPrecursorMzValues()}: retains spectra with precursor m/z matching -any of the provided m/z values (given \code{ppm} and \code{tolerance}). Spectra with -missing precursor m/z value (e.g. MS1 spectra) are dropped. -\item \code{filterPrecursorScan()}: retains parent (e.g. MS1) and children scans (e.g. -MS2) of acquisition number \code{acquisitionNum}. Returns the filtered -\code{Spectra} (with spectra in their original order). Parameter \code{f} allows to -define which spectra belong to the same sample or original data file ( -defaults to \code{f = dataOrigin(object)}). -\item \code{filterRanges()}: allows filtering of the \code{Spectra} object based on user -defined \emph{numeric} ranges (parameter \code{ranges}) for one or more available -spectra variables in object (spectra variable names can be specified with -parameter \code{spectraVariables}). Spectra for which the value of a spectra -variable is within it's defined range are retained. If multiple -ranges/spectra variables are defined, the \code{match} parameter can be used -to specify whether all conditions (\code{match = "all"}; the default) or if -any of the conditions must match (\code{match = "any"}; all spectra for which -values are within any of the provided ranges are retained). -\item \code{filterRt()}: retains spectra of MS level \code{msLevel} with retention -times (in seconds) within (\code{>=}) \code{rt[1]} and (\code{<=}) -\code{rt[2]}. Returns the filtered \code{Spectra} (with spectra in their -original order). -\item \code{filterValues()}: allows filtering of the \code{Spectra} object based on -similarities of \emph{numeric} values of one or more \code{spectraVariables(object)} -(parameter \code{spectraVariables}) to provided values (parameter \code{values}) -given acceptable differences (parameters tolerance and ppm). If multiple -values/spectra variables are defined, the \code{match} parameter can be used -to specify whether all conditions (\code{match = "all"}; the default) or if -any of the conditions must match (\code{match = "any"}; all spectra for which -values are within any of the provided ranges are retained). -\item \code{selectSpectraVariables()}: reduces the information within the object to -the selected spectra variables: all data for variables not specified will -be dropped. For mandatory columns (i.e., those listed by -\code{\link[=coreSpectraVariables]{coreSpectraVariables()}}, such as \emph{msLevel}, \emph{rtime} ...) only -the values will be dropped but not the variable itself. Additional (or -user defined) spectra variables will be completely removed. -Returns the filtered \code{Spectra}. -} -} - -\section{Filter or aggregate mass peak data}{ - - -Operations that filter or aggregate the mass peak data from each spectrum -without changing the number of spectra in a \code{Spectra} object. Also, the -actual subsetting/aggregation operation is only executed once peaks data is -accessed (through \code{peaksData()}, \code{mz()} or \code{intensity()}) or -\code{applyProcessing()} is called. -\itemize{ -\item \code{combinePeaks()}: combines mass peaks \strong{within each spectrum} with a -difference in their m/z values that is smaller than the maximal -acceptable difference defined by \code{ppm} and \code{tolerance}. Parameters -\code{intensityFun} and \code{mzFun} allow to define functions to aggregate the -intensity and m/z values for each such group of peaks. 
With
-\code{weighted = TRUE} (the default), the m/z value of the combined peak is
-calculated using an intensity-weighted mean and parameter \code{mzFun} is
-ignored. The \code{\link[MsCoreUtils:group]{MsCoreUtils::group()}} function is used for the grouping of
-mass peaks. Parameter \code{msLevel.} allows to define selected MS levels for
-which peaks should be combined. This function returns a \code{Spectra} with
-the same number of spectra as the input object, but with possibly
-combined peaks within each spectrum. Additional peak variables (other than
-\code{"mz"} and \code{"intensity"}) are
-dropped (i.e. their values are replaced with \code{NA}) for combined peaks
-unless they are constant across the combined peaks. See also
-\code{reduceSpectra()} for a function to select a single \emph{representative}
-mass peak for each peak group.
-\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the
-monoisotopic peak for groups of isotopologues. Isotopologues are
-estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the
-\emph{MetaboCoreUtils} package. Note that
-the default parameters for isotope prediction/detection have been
-determined using data from the Human Metabolome Database (HMDB) and
-isotopes for elements other than CHNOPS might not be detected. See
-parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for
-more information. The approach and code to define the parameters for
-isotope prediction is described
-\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}.
-\item \code{filterFourierTransformArtefacts()}: removes (Orbitrap) fast Fourier
-transform artefact peaks from spectra (see examples below). The function iterates
-through all intensity ordered peaks in a spectrum and removes all peaks
-with an m/z within +/- \code{halfWindowSize} of the current peak if their
-intensity is lower than \code{threshold} times the current peak's intensity.
-Additional parameters \code{keepIsotopes}, \code{maxCharge} and \code{isotopeTolerance}
-allow to avoid removing potential \verb{[13]C} isotope peaks (\code{maxCharge}
-being the maximum charge that should be considered and \code{isotopeTolerance}
-the absolute acceptable tolerance for matching their m/z).
-See \code{\link[=filterFourierTransformArtefacts]{filterFourierTransformArtefacts()}} for details and background and
-\code{deisotopeSpectra()} for an alternative.
-\item \code{filterIntensity()}: filters mass peaks in each spectrum keeping only
-those with intensities that are within the provided range or match the
-criteria of the provided function. For the former, parameter \code{intensity}
-has to be a \code{numeric} defining the intensity range, for the latter a
-\code{function} that takes the intensity values of the spectrum and returns
-a \code{logical} whether the peak should be retained or not (see examples
-below for details) - additional parameters to the function can be passed
-with \code{...}.
-To remove only peaks with intensities below a certain threshold, say
-100, use \code{intensity = c(100, Inf)}. Note: also a single value can be
-passed with the \code{intensity} parameter in which case an upper limit of
-\code{Inf} is used.
-Note that this function removes also peaks with missing intensities
-(i.e. an intensity of \code{NA}). Parameter \code{msLevel.} allows to restrict the
-filtering to spectra of the specified MS level(s).
-\item \code{filterMzRange()}: filters mass peaks in the object keeping or removing
-those in each spectrum that are within the provided m/z range.
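A minimal sketch of the \code{combinePeaks()} and \code{deisotopeSpectra()} calls described above (assuming \code{sps} is an existing \code{Spectra}; the \code{tolerance} and \code{ppm} values are arbitrary):

## combine mass peaks within each spectrum that differ by less than 10 ppm
sps_comb <- combinePeaks(sps, tolerance = 0, ppm = 10)
## keep only the monoisotopic peak of each predicted isotope group
sps_mono <- deisotopeSpectra(sps)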
Whether
-peaks are retained or removed can be configured with parameter \code{keep}
-(default \code{keep = TRUE}).
-\item \code{filterMzValues()}: filters mass peaks in the object keeping all
-peaks in each spectrum that match the provided m/z value(s) (for
-\code{keep = TRUE}, the default) or removing all of them (for \code{keep = FALSE}).
-The m/z matching considers also the absolute \code{tolerance} and m/z-relative
-\code{ppm} values. \code{tolerance} and \code{ppm} have to be of length 1.
-\item \code{filterPeaksRanges()}: filters mass peaks of a \code{Spectra} object using any
-set of range-based filters on numeric spectra or peaks variables. See
-\code{\link[=filterPeaksRanges]{filterPeaksRanges()}} for more information.
-\item \code{filterPrecursorPeaks()}: removes peaks from each spectrum in \code{object} with
-an m/z equal or larger than the m/z of the precursor, depending on the
-value of parameter \code{mz}: for \code{mz = "=="} (the default) peaks with matching
-m/z (considering an absolute and relative acceptable difference depending
-on \code{tolerance} and \code{ppm}, respectively) are removed. For \code{mz = ">="} all
-peaks with an m/z larger or equal to the precursor m/z (minus \code{tolerance}
-and the \code{ppm} of the precursor m/z) are removed. Parameter \code{msLevel.}
-allows to restrict the filter to certain MS levels (by default the filter
-is applied to all MS levels). Note that no peaks are removed if the
-precursor m/z is \code{NA} (e.g. typically for MS1 spectra).
-\item \code{reduceSpectra()}: keeps for groups of peaks with similar m/z values
-(given \code{ppm} and \code{tolerance}) in each spectrum only the peak with the
-highest intensity removing all other peaks hence \emph{reducing} each
-spectrum to the highest intensity peaks per \emph{peak group}.
-Peak groups are defined using the \code{\link[=group]{group()}} function from the
-\emph{MsCoreUtils} package. See also the \code{combinePeaks()} function for an
-alternative function to combine peaks within each spectrum.
-}
-}
-
-\section{Merging, aggregating and splitting}{
-
-
-Several \code{Spectra} objects can be concatenated into a single object with the
-\code{c()} or the \code{concatenateSpectra()} function. Concatenation will fail if the
-processing queue of any of the \code{Spectra} objects is not empty or if
-different backends are used in the \code{Spectra} objects. Thus, in these cases,
-prior to merging \code{Spectra} objects it is suggested to change the backend to
-a \code{MsBackendMemory} using the \code{setBackend()} function, and to \emph{apply} all
-data processing steps using \code{applyProcessing()}. The spectra variables
-of the resulting \code{Spectra} object are the union of the spectra variables of
-the individual \code{Spectra} objects.
-\itemize{
-\item \code{combineSpectra()}: combines MS data (i.e. mass peaks) from sets of
-spectra into a single spectrum per set (in contrast to \code{combinePeaks()}
-or \code{reduceSpectra()} that combine mass peaks \strong{within each spectrum}).
-For each spectrum group (set), spectra variables from the first spectrum
-are used and the peak matrices are combined using the function specified
-with \code{FUN}, which defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}. Please refer to the
-\code{\link[=combinePeaksData]{combinePeaksData()}} help page for details and options of the actual
-combination of peaks across the sets of spectra and to the package
-vignette for examples and alternative ways to aggregate spectra.
-The sets of spectra can be specified with parameter \code{f}.
-In addition it is possible to define, with parameter \code{p}, if and how to
-split the input data for parallel processing.
-This defaults to \code{p = x$dataStorage} and hence a per-file parallel
-processing is applied for \code{Spectra} with file-based backends (such as the
-\code{\link[=MsBackendMzR]{MsBackendMzR()}}).
-Prior to combining the spectra, all processing steps queued in the lazy
-evaluation queue are applied. Be aware that calling \code{combineSpectra()} on a
-\code{Spectra} object with certain backends that allow modifications might
-\strong{overwrite} the original data. This does not happen with a
-\code{MsBackendMemory} or \code{MsBackendDataFrame} backend, but with a
-\code{MsBackendHdf5Peaks} backend the m/z and intensity values in the original
-hdf5 file(s) will be overwritten.
-The function returns a \code{Spectra} of length equal to the unique levels
-of \code{f}.
-\item \code{joinSpectraData()}: Individual spectra variables can be directly
-added with the \verb{$<-} or \verb{[[<-} syntax. The \code{joinSpectraData()}
-function allows to merge a \code{DataFrame} to the existing spectra
-data. This function diverges from the \code{\link[=merge]{merge()}} method in two
-main ways:
-\itemize{
-\item The \code{by.x} and \code{by.y} column names must be of length 1.
-\item If variable names are shared in \code{x} and \code{y}, the spectra
-variables of \code{x} are not modified. It's only the \code{y}
-variables that are appended with the suffix defined in
-\code{suffix.y}. This is to avoid modifying any core spectra
-variables that would lead to an invalid object.
-\item Duplicated Spectra keys (i.e. \code{x[[by.x]]}) are not
-allowed. Duplicated keys in the \code{DataFrame} (i.e. \code{y[[by.y]]})
-throw a warning and only the last occurrence is kept. These
-should be explored and ideally be removed using
-\code{QFeatures::reduceDataFrame()}, \code{PMS::reducePSMs()} or similar
-functions.
-}
-\item \code{split()}: splits the \code{Spectra} object based on parameter \code{f} into a \code{list}
-of \code{Spectra} objects.
-}
-}
-
-\section{Data manipulation and analysis methods}{
-
-
-Many data manipulation operations, such as those listed in this section, are
-not applied immediately to the spectra, but added to a
-\emph{lazy processing/manipulation queue}. Operations stored in this queue are
-applied on-the-fly to spectra data each time it is accessed. This lazy
-execution guarantees the same functionality for \code{Spectra} objects with
-any backend, i.e. backends supporting to save changes to spectrum data
-(\code{\link[=MsBackendMemory]{MsBackendMemory()}}, \code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} or \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) as
-well as read-only backends (such as the \code{\link[=MsBackendMzR]{MsBackendMzR()}}).
-Note that for the former it is possible to apply the processing queue and
-write the modified peak data back to the data storage with the
-\code{applyProcessing()} function.
-\itemize{
-\item \code{addProcessing()}: adds an arbitrary function that should be applied to the
-peaks matrix of every spectrum in \code{object}. The function (which can be passed
-with parameter \code{FUN}) is expected to take a peaks matrix as input and to
-return a peaks matrix. A peaks matrix is a numeric matrix with two columns,
-the first containing the m/z values of the peaks and the second the
-corresponding intensities. The function has to have \code{...} in its
-definition.
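To make the requirement above concrete, a minimal sketch of a function that could be passed to \code{addProcessing()} (assuming \code{sps} is an existing \code{Spectra}; \code{halve_intensities} is a made-up name, not part of the package):

halve_intensities <- function(x, ...) {
    ## x is the peaks matrix with columns "mz" and "intensity"
    x[, "intensity"] <- x[, "intensity"] / 2
    x
}
sps_mod <- addProcessing(sps, halve_intensities)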
Additional arguments can be passed with \code{...}. With parameter -\code{spectraVariables} it is possible to define additional spectra variables -from \code{object} that should be passed to the function \code{FUN}. These will be -passed by their name (e.g. specifying \code{spectraVariables = "precursorMz"} -will pass the spectra's precursor m/z as a parameter named \code{precursorMz} -to the function. The only exception is the spectra's MS level, these will -be passed to the function as a parameter called \code{spectrumMsLevel} (i.e. -with \code{spectraVariables = "msLevel"} the MS levels of each spectrum will be -submitted to the function as a parameter called \code{spectrumMsLevel}). -Examples are provided in the package vignette. -\item \code{applyProcessing()}: for \code{Spectra} objects that use a \strong{writeable} backend -only: apply all steps from the lazy processing queue to the peak data and -write it back to the data storage. Parameter \code{f} allows to specify how -\code{object} should be split for parallel processing. This should either be -equal to the \code{dataStorage}, or \code{f = rep(1, length(object))} to disable -parallel processing alltogether. Other partitionings might result in -errors (especially if a \code{MsBackendHdf5Peaks} backend is used). -\item \code{bin()}: aggregates individual spectra into discrete (m/z) bins. Binning is -performed only on spectra of the specified MS level(s) (parameter -\code{msLevel}, by default all MS levels of \code{x}). The bins can be defined with -parameter \code{breaks} which by default are equally sized bins, with size -being defined by parameter \code{binSize}, from the minimal to the maximal m/z -of all spectra (of MS level \code{msLevel}) within \code{x}. The same bins are used -for all spectra in \code{x}. All intensity values for peaks falling into the -same bin are aggregated using the function provided with parameter \code{FUN} -(defaults to \code{FUN = sum}, i.e. all intensities are summed up). Note that -the binning operation is applied to the peak data on-the-fly upon data -access and it is possible to \emph{revert} the operation with the \code{reset()} -function (see description of \code{reset()} above). -\item \code{compareSpectra()}: compares each spectrum in \code{x} with each spectrum in \code{y} -using the function provided with \code{FUN} (defaults to \code{\link[=ndotproduct]{ndotproduct()}}). If -\code{y} is missing, each spectrum in \code{x} is compared with each other spectrum -in \code{x}. -The matching/mapping of peaks between the compared spectra is done with the -\code{MAPFUN} function. The default \code{\link[=joinPeaks]{joinPeaks()}} matches peaks of both spectra -and allows to keep all peaks from the first spectrum (\code{type = "left"}), -from the second (\code{type = "right"}), from both (\code{type = "outer"}) and to -keep only matching peaks (\code{type = "inner"}); see \code{\link[=joinPeaks]{joinPeaks()}} for more -information and examples). The \code{MAPFUN} function should have parameters -\code{x}, \code{y}, \code{xPrecursorMz} and \code{yPrecursorMz} as these values are passed to -the function. In addition to \code{joinPeaks()} also \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} is -supported for GNPS-like similarity score calculations. Note that -\code{joinPeaksGnps()} should only be used in combination with -\code{FUN = MsCoreUtils::gnps} (see \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} for more information and -details). 
Use \code{MAPFUN = joinPeaksNone} to disable internal peak
-matching/mapping if a similarity scoring function is used that performs
-the matching internally.
-\code{FUN} is supposed to be a function to compare intensities of (matched)
-peaks of the two spectra that are compared. The function needs to take two
-matrices with columns \code{"mz"} and \code{"intensity"} as input and is supposed
-to return a single numeric as result. In addition to the two peak matrices
-the spectra's precursor m/z values are passed to the function as parameters
-\code{xPrecursorMz} (precursor m/z of the \code{x} peak matrix) and \code{yPrecursorMz}
-(precursor m/z of the \code{y} peak matrix). Additional parameters to functions
-\code{FUN} and \code{MAPFUN} can be passed with \code{...}. Parameters \code{ppm} and
-\code{tolerance} are passed to both \code{MAPFUN} and \code{FUN}.
-The function returns a \code{matrix} with the results of \code{FUN} for each
-comparison, number of rows equal to \code{length(x)} and number of columns
-equal to \code{length(y)} (i.e. element in row 2 and column 3 is the result from
-the comparison of \code{x[2]} with \code{y[3]}). If \code{SIMPLIFY = TRUE} the \code{matrix}
-is \emph{simplified} to a \code{numeric} if length of \code{x} or \code{y} is one. See also
-the vignette for additional examples, such as using spectral entropy
-similarity in the scoring.
-\item \code{entropy()}: calculates the entropy of each spectrum based on the metrics
-suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z).
-See also \code{\link[=nentropy]{nentropy()}} in the \emph{MsCoreUtils} package for details.
-\item \code{estimatePrecursorIntensity()}: defines the precursor intensities for MS2
-spectra using the intensity of the matching MS1 peak from the
-closest MS1 spectrum (i.e. the last MS1 spectrum measured before the
-respective MS2 spectrum). With \code{method = "interpolation"} it is also
-possible to calculate the precursor intensity based on an interpolation of
-intensity values (and retention times) of the matching MS1 peaks from the
-previous and next MS1 spectrum. See \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}} for
-examples and more details.
-\item \code{estimatePrecursorMz()}: \strong{for DDA data}: allows to estimate a fragment
-spectra's precursor m/z based on the reported precursor m/z and the data
-from the previous MS1 spectrum. See \code{\link[=estimatePrecursorMz]{estimatePrecursorMz()}} for details.
-\item \code{neutralLoss()}: calculates neutral loss spectra for fragment spectra. See
-\code{\link[=neutralLoss]{neutralLoss()}} for detailed documentation.
-\item \code{processingLog()}: returns a \code{character} vector with the processing log
-messages.
-\item \code{reset()}: restores the data to its original state (as much as possible):
-removes any processing steps from the lazy processing queue and calls
-\code{reset()} on the backend which, depending on the backend, can also undo
-e.g. data filtering operations. Note that a \code{reset()} call after
-\code{applyProcessing()} will not have any effect. See examples below for more
-information.
-\item \code{scalePeaks()}: scales intensities of peaks within each spectrum depending
-on parameter \code{by}. With \code{by = sum} (the default) peak intensities are
-divided by the sum of peak intensities within each spectrum. The sum of
-intensities is thus 1 for each spectrum after scaling. Parameter
-\code{msLevel.} allows to apply the scaling to spectra of a certain MS level.
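As a small, hedged sketch of the \code{neutralLoss()} call mentioned above (assuming \code{sps} contains fragment (MS2) spectra; \code{PrecursorMzParam()} is used with its defaults):

## neutral loss spectra, peaks represent precursor m/z minus fragment m/z
nl <- neutralLoss(sps, param = PrecursorMzParam())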
-By default (\code{msLevel. = uniqueMsLevels(x)}) intensities for all -spectra will be scaled. -\item \code{spectrapply()}: applies a given function to each individual spectrum or -sets of a \code{Spectra} object. By default, the \code{Spectra} is split into -individual spectra (i.e. \code{Spectra} of length 1) and the function \code{FUN} -is applied to each of them. An alternative splitting can be defined with -parameter \code{f}. Parameters for \code{FUN} can be passed using \code{...}. -The returned result and its order depend on the function \code{FUN} and how -\code{object} is split (hence on \code{f}, if provided). Parallel processing is -supported and can be configured with parameter \code{BPPARAM}, is however only -suggested for computational intense \code{FUN}. -As an alternative to the (eventual parallel) processing of the full -\code{Spectra}, \code{spectrapply()} supports also a chunk-wise processing. For this, -parameter \code{chunkSize} needs to be specified. \code{object} is then split into -chunks of size \code{chunkSize} which are then (stepwise) processed by \code{FUN}. -This guarantees a lower memory demand (especially for on-disk backends) -since only the data for one chunk needs to be loaded into memory in each -iteration. Note that by specifying \code{chunkSize}, parameters \code{f} and -\code{BPPARAM} will be ignored. -See also \code{\link[=chunkapply]{chunkapply()}} or examples below for details on chunk-wise -processing. -\item \code{smooth()}: smooths individual spectra using a moving window-based approach -(window size = \code{2 * halfWindowSize}). Currently, the -Moving-Average- (\code{method = "MovingAverage"}), -Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")}, -weights depending on the distance of the center and calculated -\code{1/2^(-halfWindowSize:halfWindowSize)}) and -Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported. -For details how to choose the correct \code{halfWindowSize} please see -\code{\link[MsCoreUtils:smooth]{MsCoreUtils::smooth()}}. -\item \code{pickPeaks()}: picks peaks on individual spectra using a moving -window-based approach (window size = \code{2 * halfWindowSize}). For noisy -spectra there are currently two different noise estimators available, -the \emph{M}edian \emph{A}bsolute \emph{D}eviation (\code{method = "MAD"}) and -Friedman's Super Smoother (\code{method = "SuperSmoother"}), -as implemented in the \code{\link[MsCoreUtils:noise]{MsCoreUtils::noise()}}. -The method supports also to optionally \emph{refine} the m/z value of -the identified centroids by considering data points that belong (most -likely) to the same mass peak. Therefore the m/z value is calculated as an -intensity weighted average of the m/z values within the peak region. -The peak region is defined as the m/z values (and their respective -intensities) of the \code{2 * k} closest signals to the centroid or the closest -valleys (\code{descending = TRUE}) in the \code{2 * k} region. For the latter the \code{k} -has to be chosen general larger. See \code{\link[MsCoreUtils:refineCentroids]{MsCoreUtils::refineCentroids()}} for -details. -If the ratio of the signal to the highest intensity of the peak is below -\code{threshold} it will be ignored for the weighted average. -\item \code{replaceIntensitiesBelow()}: replaces intensities below a specified -threshold with the provided \code{value}. 
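A brief sketch combining the smoothing and peak picking steps described above (assuming \code{sps} contains profile-mode spectra; the parameter values are arbitrary):

## smooth the raw signal, then centroid it
sps_sm <- smooth(sps, halfWindowSize = 4L, method = "SavitzkyGolay")
sps_ctr <- pickPeaks(sps_sm, halfWindowSize = 2L, snr = 3)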
Parameter \code{threshold} can be either -a single numeric value or a function which is applied to all non-\code{NA} -intensities of each spectrum to determine a threshold value for each -spectrum. The default is \code{threshold = min} which replaces all values -which are <= the minimum intensity in a spectrum with \code{value} (the -default for \code{value} is \code{0}). Note that the function specified with -\code{threshold} is expected to have a parameter \code{na.rm} since \code{na.rm = TRUE} -will be passed to the function. If the spectrum is in profile mode, -ranges of successive non-0 peaks <= \code{threshold} are set to 0. -Parameter \code{msLevel.} allows to apply this to only spectra of certain MS -level(s). -} } \examples{ +## -------- CREATION OF SPECTRA OBJECTS -------- + ## Create a Spectra providing a `DataFrame` containing the spectrum data. spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) @@ -1578,12 +298,6 @@ spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) data <- Spectra(spd) data -## Get the number of spectra -length(data) - -## Get the number of peaks per spectrum -lengths(data) - ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk ## backend. sciex_file <- dir(system.file("sciex", package = "msdata"), @@ -1591,6 +305,9 @@ sciex_file <- dir(system.file("sciex", package = "msdata"), sciex <- Spectra(sciex_file, backend = MsBackendMzR()) sciex + +## -------- CHANGING DATA REPRESENTATIONS -------- + ## The MS data is on disk and will be read into memory on-demand. We can ## however change the backend to a MsBackendMemory backend which will ## keep all of the data in memory. @@ -1626,311 +343,7 @@ head(dataOrigin(sciex)) head(dataOrigin(sciex_im)) -## ---- ACCESSING AND ADDING DATA ---- - -## Get the MS level for each spectrum. -msLevel(data) - -## Alternatively, we could also use $ to access a specific spectra variable. -## This could also be used to add additional spectra variables to the -## object (see further below). -data$msLevel - -## Get the intensity and m/z values. -intensity(data) -mz(data) - -## Determine whether one of the spectra has a specific m/z value -containsMz(data, mz = 120.4) - -## Accessing spectra variables works for all backends: -intensity(sciex) -intensity(sciex_im) - -## Get the m/z for the first spectrum. -mz(data)[[1]] - -## Get the peak data (m/z and intensity values). -pks <- peaksData(data) -pks -pks[[1]] -pks[[2]] - -## Note that we could get the same resulb by coercing the `Spectra` to -## a `list` or `SimpleList`: -as(data, "list") -as(data, "SimpleList") - -## List all available spectra variables (i.e. spectrum data and metadata). -spectraVariables(data) - -## For all *core* spectrum variables accessor functions are available. These -## return NA if the variable was not set. -centroided(data) -dataStorage(data) -rtime(data) -precursorMz(data) - -## The core spectra variables are: -coreSpectraVariables() - -## Add an additional metadata column. -data$spectrum_id <- c("sp_1", "sp_2") - -## List spectra variables, "spectrum_id" is now also listed -spectraVariables(data) - -## Get the values for the new spectra variable -data$spectrum_id - -## Extract specific spectra variables. -spectraData(data, columns = c("spectrum_id", "msLevel")) - -## Drop spectra variable data and/or columns. -res <- selectSpectraVariables(data, c("mz", "intensity")) - -## This removed the additional columns "spectrum_id" and deleted all values -## for all spectra variables, except "mz" and "intensity". 
-spectraData(res) - -## Compared to the data before selectSpectraVariables. -spectraData(data) - - -## ---- SUBSETTING, FILTERING AND COMBINING - -## Subset to all MS2 spectra. -data[msLevel(data) == 2] - -## Same with the filterMsLevel function -filterMsLevel(data, 2) - -## Below we combine the `data` and `sciex_im` objects into a single one. -data_comb <- c(data, sciex_im) - -## The combined Spectra contains a union of all spectra variables: -head(data_comb$spectrum_id) -head(data_comb$rtime) -head(data_comb$dataStorage) -head(data_comb$dataOrigin) - -## Filter a Spectra for a target precursor m/z with a tolerance of 10ppm -spd$precursorMz <- c(323.4, 543.2302) -data_filt <- Spectra(spd) -filterPrecursorMzRange(data_filt, mz = 543.23 + ppm(c(-543.23, 543.23), 10)) - -## Filter a Spectra keeping only peaks matching certain m/z values -sps_sub <- filterMzValues(data, mz = c(103, 104), tolerance = 0.3) -mz(sps_sub) - -## This function can also be used to remove specific peaks from a spectrum -## by setting `keep = FALSE`. -sps_sub <- filterMzValues(data, mz = c(103, 104), - tolerance = 0.3, keep = FALSE) -mz(sps_sub) - -## Note that `filterMzValues()` keeps or removes all peaks with a matching -## m/z given the provided `ppm` and `tolerance` parameters. - -## Filter a Spectra keeping only peaks within a m/z range -sps_sub <- filterMzRange(data, mz = c(100, 300)) -mz(sps_sub) - -## Remove empty spectra variables -sciex_noNA <- dropNaSpectraVariables(sciex) - -## Available spectra variables before and after `dropNaSpectraVariables()` -spectraVariables(sciex) -spectraVariables(sciex_noNA) - - -## Adding new spectra variables -sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) -spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging - var1 = rnorm(10), - var2 = sample(letters, 10)) -spv - -sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") - -spectraVariables(sciex2) -spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] - -## Removing fourier transform artefacts seen in Orbitra data. - -## Loading an Orbitrap spectrum with artefacts. -data(fft_spectrum) -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) -fft_spectrum -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -## Using a few examples peaks in your data you can optimize the parameters -fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, - halfWindowSize = 0.2, - threshold = 0.005, - keepIsotopes = TRUE, - maxCharge = 5, - isotopeTolerance = 0.005 - ) - -fft_spectrum_filtered -length(mz(fft_spectrum_filtered)[[1]]) -plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -## Using filterRanges to filter spectra object based on variables available -## in `spectraData`. -## First, determine the variable(s) on which to base the filtering: -sv <- c("rtime", "precursorMz", "peaksCount") -## Note that ANY variables can be chosen here, and as many as wanted. - -## Define the ranges (pairs of values with lower and upper boundary) to be -## used for the individual spectra variables. The first two values will be -## used for the first spectra variable (e.g., rtime here), the next two for -## the second (e.g. 
precursorMz here) and so on: -ranges <- c(30, 350, 200,500, 350, 600) - -## Input the parameters within the filterRanges function: -filt_spectra <- filterRanges(sciex, spectraVariables = sv, - ranges = ranges) - -## Using `filterRanges()` to filter spectra object with multiple ranges for -## the same `spectraVariable` (e.g, here rtime) -sv <- c("rtime", "rtime") -ranges <- c(30, 100, 200, 300) -filt_spectra <- filterRanges(sciex, spectraVariables = sv, - ranges = ranges, match = "any") - -## Using filterValues in a similar way to a filter spectra object based on -## variables available in `spectraData`. However, this time not based on -## ranges but similarities to user input single values with given -## tolerance/ppm -## First determine the variable(s) on which to base the filtering: -sv <- c("rtime", "precursorMz") -## Note that ANY variables can be chosen here, and as many as wanted. - -## Define the values that will be used to filter the spectra based on their -## similarities to their respective spectraVariables. -## The first values in the parameters values, tolerance and ppm will be -## used for the first spectra variable (e.g. rtime here), the next for the -## second (e.g. precursorMz here) and so on: -values <- c(350, 400) -tolerance <- c(100, 0) -ppm <- c(0,50) - -## Input the parameters within the `filterValues()` function: -filt_spectra <- filterValues(sciex, spectraVariables = sv, - values = values, tolerance = tolerance, ppm = ppm) - -## ---- DATA MANIPULATIONS AND OTHER OPERATIONS ---- - -## Set the data to be centroided -centroided(data) <- TRUE - -## Replace peak intensities below 40 with 3. -res <- replaceIntensitiesBelow(data, threshold = 40, value = 3) -res - -## Get the intensities of the first and second spectrum. -intensity(res)[[1]] -intensity(res)[[2]] - -## Remove all peaks with an intensity below 40. -res <- filterIntensity(res, intensity = c(40, Inf)) - -## Get the intensities of the first and second spectrum. -intensity(res)[[1]] -intensity(res)[[2]] - -## Lengths of spectra is now different -lengths(mz(res)) -lengths(mz(data)) - -## In addition it is possible to pass a function to `filterIntensity()`: in -## the example below we want to keep only peaks that have an intensity which -## is larger than one third of the maximal peak intensity in that spectrum. -keep_peaks <- function(x, prop = 3) { - x > max(x, na.rm = TRUE) / prop -} -res2 <- filterIntensity(data, intensity = keep_peaks) -intensity(res2)[[1L]] -intensity(data)[[1L]] - -## We can also change the proportion by simply passing the `prop` parameter -## to the function. To keep only peaks that have an intensity which is -## larger than half of the maximum intensity: -res2 <- filterIntensity(data, intensity = keep_peaks, prop = 2) -intensity(res2)[[1L]] -intensity(data)[[1L]] - -## Since data manipulation operations are by default not directly applied to -## the data but only added to the internal lazy evaluation queue, it is also -## possible to remove these data manipulations with the `reset()` function: -res_rest <- reset(res) -res_rest -lengths(mz(res_rest)) -lengths(mz(res)) -lengths(mz(data)) - -## `reset()` after a `applyProcessing()` can not restore the data, because -## the data in the backend was changed. Similarly, `reset()` after any -## filter operations can not restore data for a `Spectra` with a -## `MsBackendMemory` or `MsBackendDataFrame`. 
-res_2 <- applyProcessing(res) -res_rest <- reset(res_2) -lengths(mz(res)) -lengths(mz(res_rest)) - - -## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using -## the normalized dotproduct method. -res <- compareSpectra(sciex_im[2:3], sciex_im[10:20]) -## first row contains comparisons of spectrum 2 with spectra 10 to 20 and -## the second row comparisons of spectrum 3 with spectra 10 to 20 -res - -## To use a simple Pearson correlation instead we can define a function -## that takes the two peak matrices and calculates the correlation for -## their second columns (containing the intensity values). -correlateSpectra <- function(x, y, use = "pairwise.complete.obs", ...) { - cor(x[, 2], y[, 2], use = use) -} -res <- compareSpectra(sciex_im[2:3], sciex_im[10:20], - FUN = correlateSpectra) -res - -## Use compareSpectra to determine the number of common (matching) peaks -## with a ppm of 10: -## type = "inner" uses a *inner join* to match peaks, i.e. keeps only -## peaks that can be mapped betwen both spectra. The provided FUN returns -## simply the number of matching peaks. -compareSpectra(sciex_im[2:3], sciex_im[10:20], ppm = 10, type = "inner", - FUN = function(x, y, ...) nrow(x)) - -## Apply an arbitrary function to each spectrum in a Spectra. -## In the example below we calculate the mean intensity for each spectrum -## in a subset of the sciex_im data. Note that we can access all variables -## of each individual spectrum either with the `$` operator or the -## corresponding method. -res <- spectrapply(sciex_im[1:20], FUN = function(x) mean(x$intensity[[1]])) -head(res) - -## It is however important to note that dedicated methods to access the -## data (such as `intensity`) are much more efficient than using `lapply()`: -res <- lapply(intensity(sciex_im[1:20]), mean) -head(res) - -## As an alternative, applying a function `FUN` to a `Spectra` can be -## performed *chunk-wise*. The advantage of this is, that only the data for -## one chunk at a time needs to be loaded into memory reducing the memory -## demand. This type of processing can be performed by specifying the size -## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` -## parameter -spectrapply(sciex_im[1:20], lengths, chunkSize = 5L) - -## ---- DATA EXPORT ---- +## -------- DATA EXPORT -------- ## Some `MsBackend` classes provide an `export()` method to export the data ## to the file format supported by the backend. @@ -1959,45 +372,7 @@ res <- Spectra(backendInitialize(MsBackendMzR(), fls[1])) mz(res) mz(data) - -## ---- PEAKS VARIABLES AND DATA ---- - -## Some `MsBackend` classes provide support for arbitrary peaks variables -## (in addition to the mandatory `"mz"` and `"intensity"` values. Below -## we create a simple data frame with an additional peak variable `"pk_ann"` -## and create a `Spectra` with a `MsBackendMemory` for that data. -## Importantly the number of values (per spectrum) need to be the same -## for all peak variables. - -tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5)) -tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1)) -tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45)) -tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P")) - -## Create the Spectra. With parameter `peaksVariables` we can define -## the columns in `tmp` that contain peaks variables. 
-sps <- Spectra(tmp, source = MsBackendMemory(), - peaksVariables = c("mz", "intensity", "pk_ann")) -peaksVariables(sps) - -## Extract just the m/z and intensity values -peaksData(sps)[[1L]] - -## Extract the full peaks data -peaksData(sps, columns = peaksVariables(sps))[[1L]] - -## Access just the pk_ann variable -sps$pk_ann - -## Convert a subset of the Spectra object to a long DataFrame. -asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) } \author{ -Nir Shahaf, Johannes Rainer - -Nir Shahaf - -Johannes Rainer - Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail } diff --git a/man/countIdentifications.Rd b/man/countIdentifications.Rd index c7904ef6..08afd04b 100644 --- a/man/countIdentifications.Rd +++ b/man/countIdentifications.Rd @@ -109,6 +109,9 @@ sp <- countIdentifications(sp) ## and three PSMs respectively. table(sp$countIdentifications, sp$msLevel) } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis functions. +} \author{ Laurent Gatto } diff --git a/man/estimatePrecursorIntensity.Rd b/man/estimatePrecursorIntensity.Rd index 97a2cde2..8780aab4 100644 --- a/man/estimatePrecursorIntensity.Rd +++ b/man/estimatePrecursorIntensity.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/Spectra.R \name{estimatePrecursorIntensity,Spectra-method} \alias{estimatePrecursorIntensity,Spectra-method} +\alias{estimatePrecursorIntensity} \title{Estimate Precursor Intensities} \usage{ \S4method{estimatePrecursorIntensity}{Spectra}( diff --git a/man/estimatePrecursorMz.Rd b/man/estimatePrecursorMz.Rd index f79bfa24..7bc9e6cd 100644 --- a/man/estimatePrecursorMz.Rd +++ b/man/estimatePrecursorMz.Rd @@ -83,6 +83,9 @@ plot(precursorMz(s), precursorMz(s) - pmz, xlab = "precursor m/z", ## we could then replace the reported precursor m/z values s$precursorMz <- pmz } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis and manipulation functions. +} \author{ Mar Garcia-Aloy, Johannes Rainer } diff --git a/man/hidden_aliases.Rd b/man/hidden_aliases.Rd index a203f8c6..1249a50f 100644 --- a/man/hidden_aliases.Rd +++ b/man/hidden_aliases.Rd @@ -8,15 +8,6 @@ \alias{[,MsBackendDataFrame-method} \alias{ppm} \alias{bin,numeric-method} -\alias{containsMz} -\alias{containsNeutralLoss} -\alias{dropNaSpectraVariables} -\alias{entropy} -\alias{export} -\alias{pickPeaks} -\alias{replaceIntensitiesBelow} -\alias{reset} -\alias{selectSpectraVariables} \alias{show,MsBackendDataFrame-method} \alias{backendMerge,MsBackendDataFrame-method} \alias{acquisitionNum,MsBackendDataFrame-method} @@ -170,24 +161,6 @@ .check = TRUE ) -containsMz(object, ...) - -containsNeutralLoss(object, ...) - -dropNaSpectraVariables(object, ...) - -entropy(object, ...) - -export(object, ...) - -pickPeaks(object, ...) - -replaceIntensitiesBelow(object, threshold = min, ...) - -reset(object, ...) - -selectSpectraVariables(object, ...) - \S4method{show}{MsBackendDataFrame}(object) \S4method{backendMerge}{MsBackendDataFrame}(object, ...) diff --git a/man/joinPeaks.Rd b/man/joinPeaks.Rd index 29cabc8d..bc1fa688 100644 --- a/man/joinPeaks.Rd +++ b/man/joinPeaks.Rd @@ -142,7 +142,12 @@ joinPeaksGnps(x, y, pmz_x, pmz_y) joinPeaksGnps(x, y, pmz_x, yPrecursorMz = NA) } \seealso{ -\code{\link[=gnps]{gnps()}} +\itemize{ +\item \code{\link[=compareSpectra]{compareSpectra()}} for the function to calculate similarities between +spectra. +\item \code{\link[=gnps]{gnps()}} in the \emph{MsCoreUtils} package for more information on the GNPS +similarity score. 
+} } \author{ Johannes Rainer, Michael Witting diff --git a/man/neutralLoss.Rd b/man/neutralLoss.Rd index da1a887e..d27cd3c8 100644 --- a/man/neutralLoss.Rd +++ b/man/neutralLoss.Rd @@ -1,13 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/AllGenerics.R, R/Spectra-neutralLoss.R +% Please edit documentation in R/Spectra-neutralLoss.R \name{neutralLoss} \alias{neutralLoss} \alias{PrecursorMzParam} \alias{neutralLoss,Spectra,PrecursorMzParam-method} \title{Calculate Neutral Loss Spectra} \usage{ -neutralLoss(object, param, ...) - PrecursorMzParam( filterPeaks = c("none", "abovePrecursor", "belowPrecursor", "removePrecursor"), msLevel = c(2L, NA_integer_), @@ -18,13 +16,6 @@ PrecursorMzParam( \S4method{neutralLoss}{Spectra,PrecursorMzParam}(object, param, ...) } \arguments{ -\item{object}{\code{\link[=Spectra]{Spectra()}} object with the fragment spectra for which neutral -loss spectra should be calculated.} - -\item{param}{One of the \emph{parameter} objects discussed below.} - -\item{...}{Currently ignored.} - \item{filterPeaks}{For \code{PrecursorMzParam()}: \code{character(1)} or \code{function} defining if and how fragment peaks should be filtered before calculation. Pre-defined options are: \code{"none"} (keep all peaks), \code{"abovePrecursor"} @@ -47,6 +38,13 @@ for details.} \item{tolerance}{\code{numeric(1)} with absolute acceptable difference in m/z values to filter peaks. Defaults to \code{tolerance = 0}. See function description for details.} + +\item{object}{\code{\link[=Spectra]{Spectra()}} object with the fragment spectra for which neutral +loss spectra should be calculated.} + +\item{param}{One of the \emph{parameter} objects discussed below.} + +\item{...}{Currently ignored.} } \value{ A \code{\link[=Spectra]{Spectra()}} object with calculated neutral loss spectra. @@ -136,6 +134,9 @@ Aisporna A, Benton PH, Chen A, Derks RJE, Galano JM, Giera M and Siuzdak G Analysis in METLIN. Journal of the American Society for Mass Spectrometry. \doi{10.1021/jasms.1c00343} } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis and manipulation functions. +} \author{ Johannes Rainer } diff --git a/man/processingChunkSize.Rd b/man/processingChunkSize.Rd index b47d8c69..a9382611 100644 --- a/man/processingChunkSize.Rd +++ b/man/processingChunkSize.Rd @@ -1,9 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Spectra-functions.R +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R \name{processingChunkSize} \alias{processingChunkSize} \alias{processingChunkSize<-} \alias{processingChunkFactor} +\alias{backendBpparam,Spectra-method} \title{Parallel and chunk-wise processing of \code{Spectra}} \usage{ processingChunkSize(x) @@ -11,11 +12,18 @@ processingChunkSize(x) processingChunkSize(x) <- value processingChunkFactor(x) + +\S4method{backendBpparam}{Spectra}(object, BPPARAM = bpparam()) } \arguments{ \item{x}{\code{Spectra}.} \item{value}{\code{integer(1)} defining the chunk size.} + +\item{object}{\code{Spectra} object.} + +\item{BPPARAM}{Parallel setup configuration. 
See \code{\link[=bpparam]{bpparam()}} for more +information.} } \value{ \code{processingChunkSize()} returns the currently defined processing From 6cd260fd6d0277430ecedde7f4aa5fe2bf09c702 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 25 Sep 2024 08:25:29 +0200 Subject: [PATCH 25/41] docs: restructure documentation --- DESCRIPTION | 4 +- R/Spectra.R | 4 + man/addProcessing.Rd | 547 +++++++++++++++++++++++++++++++++ man/combinePeaks.Rd | 110 +++++++ man/combineSpectra.Rd | 240 +++++++++++++++ man/compareSpectra.Rd | 131 ++++++++ man/filterMsLevel.Rd | 689 ++++++++++++++++++++++++++++++++++++++++++ man/spectraData.Rd | 598 ++++++++++++++++++++++++++++++++++++ 8 files changed, 2321 insertions(+), 2 deletions(-) create mode 100644 man/addProcessing.Rd create mode 100644 man/combinePeaks.Rd create mode 100644 man/combineSpectra.Rd create mode 100644 man/compareSpectra.Rd create mode 100644 man/filterMsLevel.Rd create mode 100644 man/spectraData.Rd diff --git a/DESCRIPTION b/DESCRIPTION index a04e4ac3..0270d5db 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,9 +40,9 @@ Authors@R: c(person(given = "RforMassSpectrometry Package Maintainer", Depends: R (>= 4.0.0), S4Vectors, - BiocParallel, - ProtGenerics (>= 1.37.1) + BiocParallel Imports: + ProtGenerics (>= 1.37.1), methods, IRanges, MsCoreUtils (>= 1.7.5), diff --git a/R/Spectra.R b/R/Spectra.R index 179ee58c..045cf88a 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -1053,6 +1053,8 @@ asDataFrame <- function(object, i = seq_along(object), } #' @rdname spectraData +#' +#' @export setMethod("acquisitionNum", "Spectra", function(object) acquisitionNum(object@backend)) @@ -1195,6 +1197,8 @@ setMethod("mz", "Spectra", function(object, f = processingChunkFactor(object), }) #' @rdname spectraData +#' +#' @export setMethod( "peaksData", "Spectra", function(object, columns = c("mz", "intensity"), diff --git a/man/addProcessing.Rd b/man/addProcessing.Rd new file mode 100644 index 00000000..787aeabe --- /dev/null +++ b/man/addProcessing.Rd @@ -0,0 +1,547 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{applyProcessing} +\alias{applyProcessing} +\alias{processingLog} +\alias{scalePeaks} +\alias{addProcessing} +\alias{bin} +\alias{containsMz} +\alias{containsNeutralLoss} +\alias{entropy} +\alias{pickPeaks} +\alias{replaceIntensitiesBelow} +\alias{reset} +\alias{smooth} +\alias{spectrapply} +\alias{addProcessing,Spectra-method} +\alias{bin,Spectra-method} +\alias{containsMz,Spectra-method} +\alias{containsNeutralLoss,Spectra-method} +\alias{entropy,Spectra-method} +\alias{entropy,ANY-method} +\alias{pickPeaks,Spectra-method} +\alias{replaceIntensitiesBelow,Spectra-method} +\alias{reset,Spectra-method} +\alias{smooth,Spectra-method} +\alias{spectrapply,Spectra-method} +\title{Data manipulation and analysis methods} +\usage{ +applyProcessing( + object, + f = processingChunkFactor(object), + BPPARAM = bpparam(), + ... +) + +processingLog(x) + +scalePeaks(x, by = sum, msLevel. = uniqueMsLevels(x)) + +\S4method{addProcessing}{Spectra}(object, FUN, ..., spectraVariables = character()) + +\S4method{bin}{Spectra}( + x, + binSize = 1L, + breaks = NULL, + msLevel. 
= uniqueMsLevels(x), + FUN = sum, + zero.rm = TRUE +) + +\S4method{containsMz}{Spectra}( + object, + mz = numeric(), + tolerance = 0, + ppm = 20, + which = c("any", "all"), + BPPARAM = bpparam() +) + +\S4method{containsNeutralLoss}{Spectra}( + object, + neutralLoss = 0, + tolerance = 0, + ppm = 20, + BPPARAM = bpparam() +) + +\S4method{entropy}{Spectra}(object, normalized = TRUE) + +\S4method{entropy}{ANY}(object, ...) + +\S4method{pickPeaks}{Spectra}( + object, + halfWindowSize = 2L, + method = c("MAD", "SuperSmoother"), + snr = 0, + k = 0L, + descending = FALSE, + threshold = 0, + msLevel. = uniqueMsLevels(object), + ... +) + +\S4method{replaceIntensitiesBelow}{Spectra}( + object, + threshold = min, + value = 0, + msLevel. = uniqueMsLevels(object) +) + +\S4method{reset}{Spectra}(object, ...) + +\S4method{smooth}{Spectra}( + x, + halfWindowSize = 2L, + method = c("MovingAverage", "WeightedMovingAverage", "SavitzkyGolay"), + msLevel. = uniqueMsLevels(x), + ... +) + +\S4method{spectrapply}{Spectra}( + object, + FUN, + ..., + chunkSize = integer(), + f = factor(), + BPPARAM = SerialParam() +) +} +\arguments{ +\item{object}{A \code{Spectra} object.} + +\item{f}{For \code{spectrapply()} and \code{applyProcessing()}: \code{factor} defining +how \code{object} should be splitted for eventual parallel processing. +Defaults to \code{factor()} for \code{spectrapply()} hence the object is not +splitted while it defaults to \code{f = processingChunkSize(object)} for +\code{applyProcessing()} splitting thus the object by default into chunks +depending on \code{\link[=processingChunkSize]{processingChunkSize()}}.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method +of the \linkS4class{MsBackend}. See also \code{\link[=processingChunkSize]{processingChunkSize()}} for +additional information on parallel processing.} + +\item{...}{Additional arguments passed to internal and downstream functions.} + +\item{x}{A \code{Spectra}.} + +\item{by}{For \code{scalePeaks()}: function to calculate a single \code{numeric} from +intensity values of a spectrum by which all intensities (of +that spectrum) should be divided by. The default \code{by = sum} will +divide intensities of each spectrum by the sum of intensities of that +spectrum.} + +\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which +the function should be applied (defaults to all MS levels of \code{object}.} + +\item{FUN}{For \code{addProcessing()}: function to be applied to the peak matrix +of each spectrum in \code{object}. +For \code{bin()}: function to aggregate intensity values of peaks falling +into the same bin. Defaults to \code{FUN = sum} thus summing up intensities. +For \code{spectrapply()} and \code{chunkapply()}: function to be applied to +each individual or each chunk of \code{Spectra}.} + +\item{spectraVariables}{For \code{addProcessing()}: \code{character} with additional +spectra variables that should be passed along to the function defined +with \code{FUN}. See function description for details.} + +\item{binSize}{For \code{bin()}: \code{numeric(1)} defining the size for the m/z bins. +Defaults to \code{binSize = 1}.} + +\item{breaks}{For \code{bin()}: \code{numeric} defining the m/z breakpoints between +bins.} + +\item{zero.rm}{For \code{bin()}: \code{logical(1)} indicating whether to remove bins +with zero intensity. 
Defaults to \code{TRUE}, meaning the function will +discard bins created with an intensity of 0 to enhance memory +efficiency.} + +\item{mz}{For \code{containsMz()}: \code{numeric} with the m/z value(s) of the mass +peaks to check.} + +\item{tolerance}{For \code{containsMz()} and \code{neutralLoss()}: +\code{numeric(1)} allowing to define a constant maximal accepted difference +between m/z values for peaks to be matched.} + +\item{ppm}{For \code{containsMz()} and \code{neutralLoss()}: \code{numeric(1)} defining a +relative, m/z-dependent, maximal accepted difference between m/z values +for peaks to be matched.} + +\item{which}{For \code{containsMz()}: either \code{"any"} or \code{"all"} defining whether +any (the default) or all provided \code{mz} have to be present in the +spectrum.} + +\item{neutralLoss}{for \code{containsNeutralLoss()}: \code{numeric(1)} defining the +value which should be subtracted from the spectrum's precursor m/z.} + +\item{normalized}{for \code{entropy()}: \code{logical(1)} whether the normalized +entropy should be calculated (default). See also \code{\link[=nentropy]{nentropy()}} for +details.} + +\item{halfWindowSize}{For \code{pickPeaks()}: \code{integer(1)}, used in the +identification of the mass peaks: a local maximum has to be the +maximum in the window from \code{(i - halfWindowSize):(i + halfWindowSize)}. +For \code{smooth()}: \code{integer(1)}, used in the smoothing algorithm, the +window reaches from \code{(i - halfWindowSize):(i + halfWindowSize)}.} + +\item{method}{For \code{pickPeaks()}: \code{character(1)}, the noise estimators that +should be used, currently the the \emph{M}edian \emph{A}bsolute \emph{D}eviation +(\code{method = "MAD"}) and Friedman's Super Smoother +(\code{method = "SuperSmoother"}) are supported. +For \code{smooth()}: \code{character(1)}, the smoothing function that should be +used, currently, the Moving-Average- (\code{method = "MovingAverage"}), +Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")}, +Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported.} + +\item{snr}{For \code{pickPeaks()}: \code{double(1)} defining the +\emph{S}ignal-to-\emph{N}oise-\emph{R}atio. The intensity of a local maximum has to be +higher than \code{snr * noise} to be considered as peak.} + +\item{k}{For \code{pickPeaks()}: \code{integer(1)}, number of values left and right of +the peak that should be considered in the weighted mean calculation.} + +\item{descending}{For \code{pickPeaks()}: \code{logical}, if \code{TRUE} just values +betwee the nearest valleys around the peak centroids are used.} + +\item{threshold}{For \code{pickPeaks()}: a \code{numeric(1)} defining the proportion +of the maximal peak intensity. Only values above the threshold are +used for the weighted mean calculation. +For \code{replaceIntensitiesBelow()}: a \code{numeric(1)} defining the threshold +or a \code{function} to calculate the threshold for each spectrum on its +intensity values. Defaults to \code{threshold = min}.} + +\item{value}{For \code{replaceIntensitiesBelow()}: \code{numeric(1)} defining the +value with which intensities should be replaced with.} + +\item{chunkSize}{For \code{spectrapply()}: size of the chunks into which the +\code{Spectra} should be split. This parameter overrides parameters +\code{f} and \code{BPPARAM}.} +} +\value{ +See the documentation of the individual functions for a description of the +return value. +} +\description{ +Various data analysis functions are available for \code{Spectra} objects. 
These can be categorized into functions that either return a \code{Spectra} object
+(with the manipulated data) or functions that directly return the
+result from the calculation. For the former category, the data manipulations
+are cached in the result object's \emph{processing queue} and only executed
+on-the-fly when the respective data gets extracted from the \code{Spectra} (see
+section \emph{The processing queue} for more information).
+
+For the second category, the calculations are directly executed and the
+result, usually one value per spectrum, returned. Generally, to reduce
+memory demand, a chunk-wise processing of the data is performed.
+}
+\section{Data analysis methods returning a \code{Spectra}}{
+
+
+The methods listed here return a \code{Spectra} object as a result.
+\itemize{
+\item \code{addProcessing()}: adds an arbitrary function that should be applied to the
+peaks matrix of every spectrum in \code{object}. The function (can be passed
+with parameter \code{FUN}) is expected to take a peaks matrix as input and to
+return a peaks matrix. A peaks matrix is a numeric matrix with two columns,
+the first containing the m/z values of the peaks and the second the
+corresponding intensities. The function has to have \code{...} in its
+definition. Additional arguments can be passed with \code{...}. With parameter
+\code{spectraVariables} it is possible to define additional spectra variables
+from \code{object} that should be passed to the function \code{FUN}. These will be
+passed by their name (e.g. specifying \code{spectraVariables = "precursorMz"}
+will pass the spectra's precursor m/z as a parameter named \code{precursorMz}
+to the function). The only exception is the spectra's MS level; these will
+be passed to the function as a parameter called \code{spectrumMsLevel} (i.e.
+with \code{spectraVariables = "msLevel"} the MS levels of each spectrum will be
+submitted to the function as a parameter called \code{spectrumMsLevel}).
+Examples are provided in the package vignette.
+\item \code{bin()}: aggregates individual spectra into discrete (m/z) bins. Binning is
+performed only on spectra of the specified MS level(s) (parameter
+\code{msLevel}, by default all MS levels of \code{x}). The bins can be defined with
+parameter \code{breaks}, which by default defines equally sized bins, with size
+being defined by parameter \code{binSize}, spanning from the minimal to the
+maximal m/z of all spectra (of MS level \code{msLevel}) within \code{x}. The same
+bins are used for all spectra in \code{x}. All intensity values for peaks
+falling into the same bin are aggregated using the function provided with
+parameter \code{FUN}
+(defaults to \code{FUN = sum}, i.e. all intensities are summed up). Note that
+the binning operation is applied to the peak data on-the-fly upon data
+access and it is possible to \emph{revert} the operation with the \code{reset()}
+function (see description of \code{reset()} below).
+\item \code{countIdentifications()}: counts the number of identifications each scan has
+led to. See \code{\link[=countIdentifications]{countIdentifications()}} for more details.
+\item \code{pickPeaks()}: picks peaks on individual spectra using a moving
+window-based approach (window size = \code{2 * halfWindowSize}).
For noisy spectra there are currently two different noise estimators available,
+the \emph{M}edian \emph{A}bsolute \emph{D}eviation (\code{method = "MAD"}) and
+Friedman's Super Smoother (\code{method = "SuperSmoother"}),
+as implemented in \code{\link[MsCoreUtils:noise]{MsCoreUtils::noise()}}.
+The method also allows to optionally \emph{refine} the m/z value of
+the identified centroids by considering data points that belong (most
+likely) to the same mass peak. For this, the m/z value is calculated as an
+intensity-weighted average of the m/z values within the peak region.
+The peak region is defined as the m/z values (and their respective
+intensities) of the \code{2 * k} closest signals to the centroid or the closest
+valleys (\code{descending = TRUE}) in the \code{2 * k} region. For the latter the \code{k}
+has to be chosen generally larger. See \code{\link[MsCoreUtils:refineCentroids]{MsCoreUtils::refineCentroids()}} for
+details.
+If the ratio of the signal to the highest intensity of the peak is below
+\code{threshold}, it will be ignored for the weighted average.
+\item \code{replaceIntensitiesBelow()}: replaces intensities below a specified
+threshold with the provided \code{value}. Parameter \code{threshold} can be either
+a single numeric value or a function which is applied to all non-\code{NA}
+intensities of each spectrum to determine a threshold value for each
+spectrum. The default is \code{threshold = min} which replaces all values
+which are <= the minimum intensity in a spectrum with \code{value} (the
+default for \code{value} is \code{0}). Note that the function specified with
+\code{threshold} is expected to have a parameter \code{na.rm} since \code{na.rm = TRUE}
+will be passed to the function. If the spectrum is in profile mode,
+ranges of successive non-0 peaks <= \code{threshold} are set to 0.
+Parameter \code{msLevel.} allows to apply this to only spectra of certain MS
+level(s).
+\item \code{scalePeaks()}: scales intensities of peaks within each spectrum depending
+on parameter \code{by}. With \code{by = sum} (the default) peak intensities are
+divided by the sum of peak intensities within each spectrum. The sum of
+intensities is thus 1 for each spectrum after scaling. Parameter
+\code{msLevel.} allows to apply the scaling only to spectra of a certain MS level.
+By default (\code{msLevel. = uniqueMsLevels(x)}) intensities for all
+spectra will be scaled.
+\item \code{smooth()}: smooths individual spectra using a moving window-based approach
+(window size = \code{2 * halfWindowSize}). Currently, the
+Moving-Average- (\code{method = "MovingAverage"}),
+Weighted-Moving-Average- (\code{method = "WeightedMovingAverage"}; weights
+depending on the distance from the center, calculated as
+\code{1/2^(-halfWindowSize:halfWindowSize)}) and
+Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported.
+For details on how to choose the correct \code{halfWindowSize} please see
+\code{\link[MsCoreUtils:smooth]{MsCoreUtils::smooth()}}.
+}
+}
+
+\section{Data analysis methods returning the result from the calculation}{
+
+
+The functions listed in this section immediately return the result from the
+calculation. To reduce memory demand (and allow parallel processing), the
+calculations are generally performed chunk-wise.
+\itemize{
+\item \code{chunkapply()}: applies an arbitrary function to chunks of spectra. See
+\code{\link[=chunkapply]{chunkapply()}} for details and examples.
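+The same chunk-wise idea can be sketched with \code{spectrapply()} and its
+\code{chunkSize} parameter (a minimal sketch, assuming \code{sps} is a
+\code{Spectra} object such as the \code{sps_dda} object created in the
+examples below):
+\preformatted{
+## Count the number of peaks per spectrum, processing the data in chunks
+## of 100 spectra so that only one chunk is loaded into memory at a time.
+res <- spectrapply(sps, lengths, chunkSize = 100L)
+head(res)
+}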
+\item \code{containsMz()}: checks for each of the spectra whether they contain mass +peaks with an m/z equal to \code{mz} (given acceptable difference as defined by +parameters \code{tolerance} and \code{ppm} - see \code{\link[=common]{common()}} for details). Parameter +\code{which} allows to define whether any (\code{which = "any"}, the default) or +all (\code{which = "all"}) of the \code{mz} have to match. The function returns +\code{NA} if \code{mz} is of length 0 or is \code{NA}. +\item \code{containsNeutralLoss()}: checks for each spectrum in \code{object} if it has a +peak with an m/z value equal to its precursor m/z - \code{neutralLoss} (given +acceptable difference as defined by parameters \code{tolerance} and \code{ppm}). +Returns \code{NA} for MS1 spectra (or spectra without a precursor m/z). +\item \code{entropy()}: calculates the entropy of each spectra based on the metrics +suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). +See also \code{\link[=nentropy]{nentropy()}} in the \emph{MsCoreUtils} package for details. +\item \code{estimatePrecursorIntensity()}: defines the precursor intensities for MS2 +spectra using the intensity of the matching MS1 peak from the +closest MS1 spectrum (i.e. the last MS1 spectrum measured before the +respective MS2 spectrum). With \code{method = "interpolation"} it is also +possible to calculate the precursor intensity based on an interpolation of +intensity values (and retention times) of the matching MS1 peaks from the +previous and next MS1 spectrum. See \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}} for +examples and more details. +\item \code{estimatePrecursorMz()}: \strong{for DDA data}: allows to estimate a fragment +spectra's precursor m/z based on the reported precursor m/z and the data +from the previous MS1 spectrum. See \code{\link[=estimatePrecursorMz]{estimatePrecursorMz()}} for details. +\item \code{neutralLoss()}: calculates neutral loss spectra for fragment spectra. See +\code{\link[=neutralLoss]{neutralLoss()}} for detailed documentation. +\item \code{spectrapply()}: applies a given function to each individual spectrum or +sets of a \code{Spectra} object. By default, the \code{Spectra} is split into +individual spectra (i.e. \code{Spectra} of length 1) and the function \code{FUN} +is applied to each of them. An alternative splitting can be defined with +parameter \code{f}. Parameters for \code{FUN} can be passed using \code{...}. +The returned result and its order depend on the function \code{FUN} and how +\code{object} is split (hence on \code{f}, if provided). Parallel processing is +supported and can be configured with parameter \code{BPPARAM}, is however only +suggested for computational intense \code{FUN}. +As an alternative to the (eventual parallel) processing of the full +\code{Spectra}, \code{spectrapply()} supports also a chunk-wise processing. For this, +parameter \code{chunkSize} needs to be specified. \code{object} is then split into +chunks of size \code{chunkSize} which are then (stepwise) processed by \code{FUN}. +This guarantees a lower memory demand (especially for on-disk backends) +since only the data for one chunk needs to be loaded into memory in each +iteration. Note that by specifying \code{chunkSize}, parameters \code{f} and +\code{BPPARAM} will be ignored. +See also \code{chunkapply()} above or examples below for details on chunk-wise +processing. +} +} + +\section{The processing queue}{ + + +Operations that modify mass peak data, i.e. 
the m/z and intensity values of +a \code{Spectra} are generally not applied immediately to the data but are +\emph{cached} within the object's \emph{processing queue}. These operations are then +applied to the data only upon request, for example when m/z and/or +intensity values are extracted. This lazy execution guarantees that the +same functionality can be applied to any \code{Spectra} object, regardless of +the type of backend that is used. Thus, data manipulation operations can +also be applied to data that is \emph{read only}. As a side effect, this enables +also to \emph{undo} operations using the \code{reset()} function. + +Functions related to the processing queue are: +\itemize{ +\item \code{applyProcessing()}: for \code{Spectra} objects that use a \strong{writeable} backend +only: apply all steps from the lazy processing queue to the peak data and +write it back to the data storage. Parameter \code{f} allows to specify how +\code{object} should be split for parallel processing. This should either be +equal to the \code{dataStorage}, or \code{f = rep(1, length(object))} to disable +parallel processing alltogether. Other partitionings might result in +errors (especially if a \code{MsBackendHdf5Peaks} backend is used). +\item \code{processingLog()}: returns a \code{character} vector with the processing log +messages. +\item \code{reset()}: restores the data to its original state (as much as possible): +removes any processing steps from the lazy processing queue and calls +\code{reset()} on the backend which, depending on the backend, can also undo +e.g. data filtering operations. Note that a \verb{reset*(} call after +\code{applyProcessing()} will not have any effect. See examples below for more +information. +} +} + +\examples{ + +## Load a `Spectra` object with LC-MS/MS data. +fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", + package = "msdata") +sps_dda <- Spectra(fl) +sps_dda + + +## -------- FUNCTIONS RETURNING A SPECTRA -------- + +## Replace peak intensities below 40 with a value of 1 +sps_mod <- replaceIntensitiesBelow(sps_dda, threshold = 20, value = 1) +sps_mod + +## Get the intensities of the first spectrum before and after the +## operation +intensity(sps_dda[1]) +intensity(sps_mod[1]) + +## Remove all peaks with an intensity below 5. +sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf)) + +intensity(sps_mod) + +## In addition it is possible to pass a function to `filterIntensity()`: in +## the example below we want to keep only peaks that have an intensity which +## is larger than one third of the maximal peak intensity in that spectrum. +keep_peaks <- function(x, prop = 3) { + x > max(x, na.rm = TRUE) / prop +} +sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks) +intensity(sps_mod) + +## We can also change the proportion by simply passing the `prop` parameter +## to the function. To keep only peaks that have an intensity which is +## larger than half of the maximum intensity: +sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks, prop = 2) +intensity(sps_mod) + +## With the `scalePeaks()` function we can alternatively scale the +## intensities of mass peaks per spectrum to relative intensities. This +## is specifically useful for fragment (MS2) spectra. We below thus +## scale the intensities per spectrum by the total sum of intensities +## (such that the sum of all intensities per spectrum is 1). +## Below we scale the intensities of all MS2 spectra in our data set. 
+sps_mod <- scalePeaks(sps_dda, msLevel = 2L) + +## MS1 spectra were not affected +sps_mod |> + filterMsLevel(1L) |> + intensity() + +## Intensities of MS2 spectra were scaled +sps_mod |> + filterMsLevel(2L) |> + intensity() + +## Since data manipulation operations are by default not directly applied to +## the data but only cached in the internal processing queue, it is also +## possible to remove these data manipulations with the `reset()` function: +tmp <- reset(sps_mod) +tmp +lengths(sps_dda) |> head() +lengths(sps_mod) |> head() +lengths(tmp) |> head() + +## Data manipulation operations cached in the processing queue can also be +## applied to the mass peaks data with the `applyProcessing()` function, if +## the `Spectra` uses a backend that supports that (i.e. allows replacing +## the mass peaks data). Below we first change the backend to a +## `MsBackendMemory()` and then use the `applyProcessing()` to modify the +## mass peaks data +sps_dda <- setBackend(sps_dda, MsBackendMemory()) +sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf)) +sps_mod <- applyProcessing(sps_mod) +sps_mod + +## While we can't *undo* this filtering operation now using the `reset()` +## function, accessing the data would now be faster, because the operation +## does no longer to be applied to the original data before returning to the +## user. + + +## -------- FUNCTIONS RETURNING THE RESULT -------- + +## With the `spectrapply()` function it is possible to apply an +## arbitrary function to each spectrum in a Spectra. +## In the example below we calculate the mean intensity for each spectrum +## in a subset of the sciex_im data. Note that we can access all variables +## of each individual spectrum either with the `$` operator or the +## corresponding method. +res <- spectrapply(sps_dda[1:20], FUN = function(x) mean(x$intensity[[1]])) +head(res) + +## As an alternative, applying a function `FUN` to a `Spectra` can be +## performed *chunk-wise*. The advantage of this is, that only the data for +## one chunk at a time needs to be loaded into memory reducing the memory +## demand. This type of processing can be performed by specifying the size +## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` +## parameter +spectrapply(sps_dda[1:20], lengths, chunkSize = 5L) + +## Precursor intensity estimation. Some manufacturers don't report the +## precursor intensity for MS2 spectra: +sps_dda |> + filterMsLevel(2L) |> + precursorIntensity() + +## This intensity can however be estimated from the previously measured +## MS1 scan with the `estimatePrecursorIntensity()` function: +pi <- estimatePrecursorIntensity(sps_dda) + +## This function returned the result as a `numeric` vector with one +## value per spectrum: +pi + +## We can replace the precursor intensity values of the originating +## object: +sps_dda$precursorIntensity <- pi +sps_dda |> + filterMsLevel(2L) |> + precursorIntensity() + +} +\seealso{ +\itemize{ +\item \code{\link[=compareSpectra]{compareSpectra()}} for calculation of spectra similarity scores. +\item \code{\link[=processingChunkSize]{processingChunkSize()}} for information on parallel and chunk-wise data +processing. +\item \link{Spectra} for a general description of the \code{Spectra} object. 
+} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf, Mar Garcia-Aloy +} diff --git a/man/combinePeaks.Rd b/man/combinePeaks.Rd new file mode 100644 index 00000000..a59b8f24 --- /dev/null +++ b/man/combinePeaks.Rd @@ -0,0 +1,110 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{combinePeaks} +\alias{combinePeaks} +\alias{combinePeaks,Spectra-method} +\title{Aggregating and combining mass peaks data} +\usage{ +\S4method{combinePeaks}{Spectra}( + object, + tolerance = 0, + ppm = 20, + intensityFun = base::mean, + mzFun = base::mean, + weighted = TRUE, + msLevel. = uniqueMsLevels(object), + ... +) +} +\arguments{ +\item{object}{A \code{Spectra} object.} + +\item{tolerance}{\code{numeric(1)} allowing to define a constant maximal +accepted difference between m/z values for peaks to be grouped. Default +is \code{tolerance = 0}.} + +\item{ppm}{\code{numeric(1)} defining a relative, m/z-dependent, maximal +accepted difference between m/z values for peaks to be grouped. Default +is \code{ppm = 20}.} + +\item{intensityFun}{Function to aggregate intensities for all peaks in +each peak group into a single intensity value.} + +\item{mzFun}{Function to aggregate m/z values for all mass peaks within +each peak group into a single m/z value. This parameter is ignored if +\code{weighted = TRUE} (the default).} + +\item{weighted}{\code{logical(1)} whether m/z values of peaks within each peak +group should be aggregated into a single m/z value using an +intensity-weighted mean. Defaults to \code{weighted = TRUE}.} + +\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which +the function should be applied (defaults to all MS levels of \code{object}.} + +\item{...}{ignored.} +} +\description{ +In addition to aggregating content of spectra variables (describe in +\code{\link[=combineSpectra]{combineSpectra()}}) it is also possible to aggregate and combine mass peaks +data from individual spectra within a \code{Spectra}. These \code{combinePeaks()} +function combines mass peaks \strong{within each spectrum} with a difference in +their m/z values that is smaller than the maximal acceptable difference +defined by \code{ppm} and \code{tolerance}. Parameters \code{intensityFun} and \code{mzFun} +allow to define functions to aggregate the intensity and m/z values for +each such group of peaks. With \code{weighted = TRUE} (the default), the m/z +value of the combined peak is calculated using an intensity-weighted mean +and parameter \code{mzFun} is ignored. The \code{\link[MsCoreUtils:group]{MsCoreUtils::group()}} function is +used for the grouping of mass peaks. Parameter \code{msLevel.} allows to define +selected MS levels for which peaks should be combined. This function +returns a \code{Spectra} with the same number of spectra than the input object, +but with possibly combined peaks within each spectrum. +Additional peak variables (other than \code{"mz"} and \code{"intensity"}) are +dropped (i.e. their values are replaced with \code{NA}) for combined peaks +unless they are constant across the combined peaks. See also +\code{\link[=reduceSpectra]{reduceSpectra()}} for a function to select a single \emph{representative} +mass peak for each peak group. +} +\examples{ + +## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +## backend. 
+sciex_file <- dir(system.file("sciex", package = "msdata"), + full.names = TRUE) +sciex <- Spectra(sciex_file, backend = MsBackendMzR()) + +## Combine mass peaks per spectrum with a difference in their m/z value +## that is smaller than 20 ppm. The intensity values of such peaks are +## combined by summing their values, while for the m/z values the median +## is reported +sciex_comb <- combinePeaks(sciex, ppm = 20, + intensityFun = sum, mzFun = median) + +## Comparing the number of mass peaks before and after aggregation +lengths(sciex) |> head() +lengths(sciex_comb) |> head() + +## Plotting the first spectrum before and after aggregation +par(mfrow = c(1, 2)) +plotSpectra(sciex[2L]) +plotSpectra(sciex_comb[2L]) + +## Using `reduceSpectra()` to keep for each group of mass peaks with a +## difference in their m/z values < 20ppm the one with the highest intensity. +sciex_red <- reduceSpectra(sciex, ppm = 20) + +## Comparing the number of mass peaks before and after the operation +lengths(sciex) |> head() +lengths(sciex_red) |> head() +} +\seealso{ +\itemize{ +\item \code{\link[=combineSpectra]{combineSpectra()}} for functions to combine or aggregate \code{Spectra}'s +spectra data. +\item \code{\link[=combinePeaksData]{combinePeaksData()}} for the function to combine the mass peaks data. +\item \code{\link[=reduceSpectra]{reduceSpectra()}} and similar functions to filter mass peaks data. +\item \link{Spectra} for a general description of the \code{Spectra} object. +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/combineSpectra.Rd b/man/combineSpectra.Rd new file mode 100644 index 00000000..d4f7bdb0 --- /dev/null +++ b/man/combineSpectra.Rd @@ -0,0 +1,240 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{concatenateSpectra} +\alias{concatenateSpectra} +\alias{combineSpectra} +\alias{joinSpectraData} +\alias{split} +\alias{c,Spectra-method} +\alias{split,Spectra,ANY-method} +\title{Merging, aggregating and splitting Spectra} +\usage{ +concatenateSpectra(x, ...) + +combineSpectra( + x, + f = x$dataStorage, + p = x$dataStorage, + FUN = combinePeaksData, + ..., + BPPARAM = bpparam() +) + +joinSpectraData(x, y, by.x = "spectrumId", by.y, suffix.y = ".y") + +\S4method{c}{Spectra}(x, ...) + +\S4method{split}{Spectra,ANY}(x, f, drop = FALSE, ...) +} +\arguments{ +\item{x}{A \code{Spectra} object.} + +\item{...}{Additional arguments.} + +\item{f}{For \code{split()}: factor defining how to split \code{x}. See \code{\link[base:split]{base::split()}} +for details. +For \code{combineSpectra()}: \code{factor} defining the grouping of the spectra +that should be combined. Defaults to \code{x$dataStorage}.} + +\item{p}{For \code{combineSpectra()}: \code{factor} defining how to split the input +\code{Spectra} for parallel processing. Defaults to \code{x$dataStorage}, i.e., +depending on the used backend, per-file parallel processing will be +performed.} + +\item{FUN}{For \code{combineSpectra()}: function to combine the (peak matrices) +of the spectra. Defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method +of the \linkS4class{MsBackend}.} + +\item{y}{A \code{DataFrame} with the spectra variables to join/add.} + +\item{by.x}{A \code{character(1)} specifying the spectra variable used +for merging. 
Default is \code{"spectrumId"}.} + +\item{by.y}{A \code{character(1)} specifying the column used for +merging. Set to \code{by.x} if missing.} + +\item{suffix.y}{A \code{character(1)} specifying the suffix to be used +for making the names of columns in the merged spectra variables +unique. This suffix will be used to amend \code{names(y)}, while +\code{spectraVariables(x)} will remain unchanged.} + +\item{drop}{For \code{split()}: not considered.} +} +\description{ +Various functions are availabe to combine, aggregate or split data from one +of more \code{Spectra} objects. These are: +\itemize{ +\item \code{c()} and \code{concatenateSpectra()}: combines several \code{Spectra} objects into +a single object. The resulting \code{Spectra} contains all data from all +individual \code{Spectra}, i.e. the union of all their spectra variables. +Concatenation will fail if the processing queue of any of the \code{Spectra} +objects is not empty or if different backends are used for the \code{Spectra} +objects. In such cases it is suggested to first change the backends of +all \code{Spectra} to the same type of backend (using the \code{\link[=setBackend]{setBackend()}} +function and to eventually (if needed) apply the processing queue using +the \code{\link[=applyProcessing]{applyProcessing()}} function. +\item \code{combineSpectra()}: combines sets of spectra (defined with parameter \code{f}) +into a single spectrum per set aggregating their MS data (i.e. their +\emph{peaks data} matrices with the \emph{m/z} and intensity values of their +mass peaks). The spectra variable values of the first spectrum per set +are reported for the combined spectrum. The peak matrices of the spectra +per set are combined using the function specified with parameter \code{FUN} +which uses by default the \code{\link[=combinePeaksData]{combinePeaksData()}} function. See the +documentation of \code{\link[=combinePeaksData]{combinePeaksData()}} for details on the aggregation of +the peak data and the package vignette for examples. +The sets of spectra can be specified with parameter \code{f} which is expected +to be a \code{factor} or \code{vector} of length equal to the length of the +\code{Spectra} specifying to which set a spectrum belongs to. The function +returns a \code{Spectra} of length equal to the unique levels of \code{f}. The +optional parameter \code{p} allows to define how the \code{Spectra} should be +split for potential parallel processing. The default is +\code{p = x$dataStorage} and hence a per storage file parallel processing is +applied for \code{Spectra} with on disk data representations (such as the +\code{\link[=MsBackendMzR]{MsBackendMzR()}}). This also prevents that spectra from different data +files/samples are combined (eventually use e.g. \code{p = x$dataOrigin} or any +other spectra variables defining the originating samples for a spectrum). +Before combining the peaks data, all eventual present processing steps are +applied (by calling \code{\link[=applyProcessing]{applyProcessing()}} on the \code{Spectra}). This function +will replace the original \emph{m/z} and intensity values of a \code{Spectra} hence +it can not be called on a \code{Spectra} with a \emph{read-only} backend. In such +cases, the backend should be changed to a \emph{writeable} backend before +using the \code{\link[=setBackend]{setBackend()}} function (to e.g. a \code{\link[=MsBackendMemory]{MsBackendMemory()}} backend). 
+\item \code{joinSpectraData()}: Individual spectra variables can be directly +added with the \verb{$<-} or \verb{[[<-} syntax. The \code{joinSpectraData()} +function allows to merge a \code{DataFrame} to the existing spectra +data of a \code{Spectra}. This function diverges from the \code{\link[=merge]{merge()}} method in +two main ways: +\itemize{ +\item The \code{by.x} and \code{by.y} column names must be of length 1. +\item If variable names are shared in \code{x} and \code{y}, the spectra +variables of \code{x} are not modified. It's only the \code{y} +variables that are appended with the suffix defined in +\code{suffix.y}. This is to avoid modifying any core spectra +variables that would lead to an invalid object. +\item Duplicated Spectra keys (i.e. \code{x[[by.x]]}) are not +allowed. Duplicated keys in the \code{DataFrame} (i.e \code{y[[by.y]]}) +throw a warning and only the last occurrence is kept. These +should be explored and ideally be removed using for +\code{QFeatures::reduceDataFrame()}, \code{PMS::reducePSMs()} or similar +functions. +} +\item \code{split()}: splits the \code{Spectra} object based on parameter \code{f} into a \code{list} +of \code{Spectra} objects. +} +} +\examples{ + +## Create a Spectra providing a `DataFrame` containing a MS data. + +spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) + +s <- Spectra(spd) +s + +## Create a second Spectra from mzML files and use the `MsBackendMzR` +## on-disk backend. +sciex_file <- dir(system.file("sciex", package = "msdata"), + full.names = TRUE) +sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +sciex + +## Subset to the first 100 spectra to reduce running time of the examples +sciex <- sciex[1:100] + + +## -------- COMBINE SPECTRA -------- + +## Combining the `Spectra` object `s` with the MS data from `sciex`. +## Calling directly `c(s, sciex)` would result in an error because +## both backends use a different backend. We thus have to first change +## the backends to the same backend. We change the backend of the `sciex` +## `Spectra` to a `MsBackendMemory`, the backend used by `s`. + +sciex <- setBackend(sciex, MsBackendMemory()) + +## Combine the two `Spectra` +all <- c(s, sciex) +all + +## The new `Spectra` objects contains the union of spectra variables from +## both: +spectraVariables(all) + +## The spectra variables that were not present in `s`: +setdiff(spectraVariables(all), spectraVariables(s)) + +## The values for these were filled with missing values for spectra from +## `s`: +all$peaksCount |> head() + + +## -------- AGGREGATE SPECTRA -------- + +## Sets of spectra can be combined into a single, representative spectrum +## per set using `combineSpectra()`. This aggregates the peaks data (i.e. +## the spectra's m/z and intensity values) while using the values for all +## spectra variables from the first spectrum per set. Below we define the +## sets as all spectra measured in the *same second*, i.e. rounding their +## retention time to the next closer integer value. 
+f <- round(rtime(sciex)) +head(f) + +cmp <- combineSpectra(sciex, f = f) + +## The length of `cmp` is now equal to the length of unique levels in `f`: +length(cmp) + +## The spectra variable value from the first spectrum per set is used in +## the representative/combined spectrum: +cmp$rtime + +## The peaks data was aggregated: the number of mass peaks of the first six +## spectra from the original `Spectra`: +lengths(sciex) |> head() + +## and for the first aggreagated spectra: +lengths(cmp) |> head() + +## The default peaks data aggregation method joins all mass peaks. See +## documentation of the `combinePeaksData()` function for more options. + + +## -------- SPLITTING DATA -------- + +## A `Spectra` can be split into a `list` of `Spectra` objects using the +## `split()` function defining the sets into which the `Spectra` should +## be splitted into with parameter `f`. +sciex_split <- split(sciex, f) + +length(sciex_split) +sciex_split |> head() + + +## -------- ADDING SPECTRA DATA -------- + +## Adding new spectra variables +sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) +spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging + var1 = rnorm(10), + var2 = sample(letters, 10)) +spv + +sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") + +spectraVariables(sciex2) +spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] +} +\seealso{ +\itemize{ +\item \code{\link[=combinePeaks]{combinePeaks()}} for functions to aggregate mass peaks data. +\item \link{Spectra} for a general description of the \code{Spectra} object. +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/compareSpectra.Rd b/man/compareSpectra.Rd new file mode 100644 index 00000000..375671c4 --- /dev/null +++ b/man/compareSpectra.Rd @@ -0,0 +1,131 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{compareSpectra} +\alias{compareSpectra} +\alias{compareSpectra,Spectra,Spectra-method} +\alias{compareSpectra,Spectra,missing-method} +\title{Spectra similarity calculations} +\usage{ +\S4method{compareSpectra}{Spectra,Spectra}( + x, + y, + MAPFUN = joinPeaks, + tolerance = 0, + ppm = 20, + FUN = ndotproduct, + ..., + SIMPLIFY = TRUE +) + +\S4method{compareSpectra}{Spectra,missing}( + x, + y = NULL, + MAPFUN = joinPeaks, + tolerance = 0, + ppm = 20, + FUN = ndotproduct, + ..., + SIMPLIFY = TRUE +) +} +\arguments{ +\item{x}{A \code{Spectra} object.} + +\item{y}{A \code{Spectra} object.} + +\item{MAPFUN}{For \code{compareSpectra()}: function to map/match peaks between +the two compared spectra. See \code{\link[=joinPeaks]{joinPeaks()}} for more information and +possible functions. Defaults to \code{\link[=joinPeaks]{joinPeaks()}}.} + +\item{tolerance}{\code{numeric(1)} allowing to define a constant maximal +accepted difference between m/z values for peaks to be matched. This +parameter is directly passed to \code{MAPFUN}.} + +\item{ppm}{\code{numeric(1)} defining a relative, m/z-dependent, maximal +accepted difference between m/z values for peaks to be matched. This +parameter is directly passed to \code{MAPFUN}.} + +\item{FUN}{function to compare intensities of peaks between two spectra. +Defaults to \code{\link[=ndotproduct]{ndotproduct()}}.} + +\item{...}{Additional arguments passed to the internal functions.} + +\item{SIMPLIFY}{\code{logical(1)} defining whether the result matrix should be +\emph{simplified} to a \code{numeric} if possible (i.e. 
if either \code{x} or \code{y} is +of length 1).} +} +\description{ +\code{compareSpectra()} compares each spectrum in \code{x} with each spectrum in \code{y} +using the function provided with \code{FUN} (defaults to \code{\link[=ndotproduct]{ndotproduct()}}). If +\code{y} is missing, each spectrum in \code{x} is compared with each other spectrum +in \code{x}. +The matching/mapping of peaks between the compared spectra is done with the +\code{MAPFUN} function. The default \code{\link[=joinPeaks]{joinPeaks()}} matches peaks of both spectra +and allows to keep all peaks from the first spectrum (\code{type = "left"}), +from the second (\code{type = "right"}), from both (\code{type = "outer"}) and to +keep only matching peaks (\code{type = "inner"}); see \code{\link[=joinPeaks]{joinPeaks()}} for more +information and examples). The \code{MAPFUN} function should have parameters +\code{x}, \code{y}, \code{xPrecursorMz} and \code{yPrecursorMz} as these values are passed to +the function. + +In addition to \code{joinPeaks()} also \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} is supported for +GNPS-like similarity score calculations. Note that \code{joinPeaksGnps()} should +only be used in combination with \code{FUN = MsCoreUtils::gnps} +(see \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} for more information and details). Use +\code{MAPFUN = joinPeaksNone} to disable internal peak matching/mapping if a +similarity scoring function is used that performs the matching internally. + +\code{FUN} is supposed to be a function to compare intensities of (matched) +peaks of the two spectra that are compared. The function needs to take two +matrices with columns \code{"mz"} and \code{"intensity"} as input and is supposed +to return a single numeric as result. In addition to the two peak matrices +the spectra's precursor m/z values are passed to the function as parameters +\code{xPrecursorMz} (precursor m/z of the \code{x} peak matrix) and \code{yPrecursorMz} +(precursor m/z of the \code{y} peak matrix). Additional parameters to functions +\code{FUN} and \code{MAPFUN} can be passed with \code{...}. Parameters \code{ppm} and +\code{tolerance} are passed to both \code{MAPFUN} and \code{FUN}. +The function returns a \code{matrix} with the results of \code{FUN} for each +comparison, number of rows equal to \code{length(x)} and number of columns +equal \code{length(y)} (i.e. element in row 2 and column 3 is the result from +the comparison of \code{x[2]} with \code{y[3]}). If \code{SIMPLIFY = TRUE} the \code{matrix} +is \emph{simplified} to a \code{numeric} if length of \code{x} or \code{y} is one. See also +the vignette for additional examples, such as using spectral entropy +similarity in the scoring. +} +\examples{ + +## Load a `Spectra` object with LC-MS/MS data. +fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", + package = "msdata") +sps_dda <- Spectra(fl) +sps_dda + +## Restrict to MS2 (fragment) spectra: +sps_ms2 <- filterMsLevel(sps_dda, msLevel = 2L) + +## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using +## the normalized dotproduct method. 
+res <- compareSpectra(sps_ms2[2:3], sps_ms2[10:20]) +## first row contains comparisons of spectrum 2 with spectra 10 to 20 and +## the second row comparisons of spectrum 3 with spectra 10 to 20 +res + +## We next calculate the pairwise similarity for the first 10 spectra +compareSpectra(sps_ms2[1:10]) + +## Use compareSpectra to determine the number of common (matching) peaks +## with a ppm of 10: +## type = "inner" uses a *inner join* to match peaks, i.e. keeps only +## peaks that can be mapped betwen both spectra. The provided FUN returns +## simply the number of matching peaks. +compareSpectra(sps_ms2[2:3], sps_ms2[10:20], ppm = 10, type = "inner", + FUN = function(x, y, ...) nrow(x)) + +## We repeat this calculation between all pairwise combinations +## of the first 20 spectra +compareSpectra(sps_ms2[1:20], ppm = 10, type = "inner", + FUN = function(x, y, ...) nrow(x)) +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/filterMsLevel.Rd b/man/filterMsLevel.Rd new file mode 100644 index 00000000..0ea3698b --- /dev/null +++ b/man/filterMsLevel.Rd @@ -0,0 +1,689 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{deisotopeSpectra} +\alias{deisotopeSpectra} +\alias{reduceSpectra} +\alias{filterPrecursorMaxIntensity} +\alias{filterPrecursorIsotopes} +\alias{filterPrecursorPeaks} +\alias{filterMsLevel} +\alias{[,Spectra-method} +\alias{filterAcquisitionNum} +\alias{filterDataOrigin} +\alias{filterDataStorage} +\alias{filterEmptySpectra} +\alias{filterIsolationWindow} +\alias{filterPolarity} +\alias{filterPrecursorCharge} +\alias{filterPrecursorMzRange} +\alias{filterPrecursorMzValues} +\alias{filterPrecursorScan} +\alias{filterRanges} +\alias{filterRt} +\alias{filterValues} +\alias{dropNaSpectraVariables} +\alias{selectSpectraVariables} +\alias{filterIntensity} +\alias{filterMzRange} +\alias{filterMzValues} +\alias{dropNaSpectraVariables,Spectra-method} +\alias{selectSpectraVariables,Spectra-method} +\alias{filterAcquisitionNum,Spectra-method} +\alias{filterEmptySpectra,Spectra-method} +\alias{filterDataOrigin,Spectra-method} +\alias{filterDataStorage,Spectra-method} +\alias{filterFourierTransformArtefacts,Spectra-method} +\alias{filterIntensity,Spectra-method} +\alias{filterIsolationWindow,Spectra-method} +\alias{filterMsLevel,Spectra-method} +\alias{filterMzRange,Spectra-method} +\alias{filterMzValues,Spectra-method} +\alias{filterPolarity,Spectra-method} +\alias{filterPrecursorMz,Spectra-method} +\alias{filterPrecursorMzRange,Spectra-method} +\alias{filterPrecursorMzValues,Spectra-method} +\alias{filterPrecursorCharge,Spectra-method} +\alias{filterPrecursorScan,Spectra-method} +\alias{filterRt,Spectra-method} +\alias{filterRanges,Spectra-method} +\alias{filterValues,Spectra-method} +\title{Filter and subset Spectra objects} +\usage{ +deisotopeSpectra( + x, + substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"), + tolerance = 0, + ppm = 20, + charge = 1 +) + +reduceSpectra(x, tolerance = 0, ppm = 20) + +filterPrecursorMaxIntensity(x, tolerance = 0, ppm = 20) + +filterPrecursorIsotopes( + x, + tolerance = 0, + ppm = 20, + substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL") +) + +filterPrecursorPeaks( + object, + tolerance = 0, + ppm = 20, + mz = c("==", ">="), + msLevel. 
= uniqueMsLevels(object) +) + +\S4method{dropNaSpectraVariables}{Spectra}(object) + +\S4method{selectSpectraVariables}{Spectra}( + object, + spectraVariables = union(spectraVariables(object), peaksVariables(object)) +) + +\S4method{[}{Spectra}(x, i, j, ..., drop = FALSE) + +\S4method{filterAcquisitionNum}{Spectra}( + object, + n = integer(), + dataStorage = character(), + dataOrigin = character() +) + +\S4method{filterEmptySpectra}{Spectra}(object) + +\S4method{filterDataOrigin}{Spectra}(object, dataOrigin = character()) + +\S4method{filterDataStorage}{Spectra}(object, dataStorage = character()) + +\S4method{filterFourierTransformArtefacts}{Spectra}( + object, + halfWindowSize = 0.05, + threshold = 0.2, + keepIsotopes = TRUE, + maxCharge = 5, + isotopeTolerance = 0.005 +) + +\S4method{filterIntensity}{Spectra}( + object, + intensity = c(0, Inf), + msLevel. = uniqueMsLevels(object), + ... +) + +\S4method{filterIsolationWindow}{Spectra}(object, mz = numeric()) + +\S4method{filterMsLevel}{Spectra}(object, msLevel. = integer()) + +\S4method{filterMzRange}{Spectra}( + object, + mz = numeric(), + msLevel. = uniqueMsLevels(object), + keep = TRUE +) + +\S4method{filterMzValues}{Spectra}( + object, + mz = numeric(), + tolerance = 0, + ppm = 20, + msLevel. = uniqueMsLevels(object), + keep = TRUE +) + +\S4method{filterPolarity}{Spectra}(object, polarity = integer()) + +\S4method{filterPrecursorMz}{Spectra}(object, mz = numeric()) + +\S4method{filterPrecursorMzRange}{Spectra}(object, mz = numeric()) + +\S4method{filterPrecursorMzValues}{Spectra}(object, mz = numeric(), ppm = 20, tolerance = 0) + +\S4method{filterPrecursorCharge}{Spectra}(object, z = integer()) + +\S4method{filterPrecursorScan}{Spectra}(object, acquisitionNum = integer(), f = dataOrigin(object)) + +\S4method{filterRt}{Spectra}(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) + +\S4method{filterRanges}{Spectra}( + object, + spectraVariables = character(), + ranges = numeric(), + match = c("all", "any") +) + +\S4method{filterValues}{Spectra}( + object, + spectraVariables = character(), + values = numeric(), + ppm = 0, + tolerance = 0, + match = c("all", "any") +) +} +\arguments{ +\item{x}{\code{Spectra} object.} + +\item{substDefinition}{For \code{deisotopeSpectra()} and +\code{filterPrecursorIsotopes()}: \code{matrix} or \code{data.frame} with definitions +of isotopic substitutions. Uses by default isotopic substitutions +defined from all compounds in the Human Metabolome Database (HMDB). See +\code{\link[=isotopologues]{isotopologues()}} or \code{\link[=isotopicSubstitutionMatrix]{isotopicSubstitutionMatrix()}} in the +\emph{MetaboCoreUtils} for details.} + +\item{tolerance}{For \code{filterMzValues()} and \code{reduceSpectra()}: +\code{numeric(1)} allowing to define a constant maximal accepted difference +between m/z values for peaks to be matched (or grouped). For +\code{containsMz()} it can also be of length equal \code{mz} to specify a different +tolerance for each m/z value. +For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the +(constant) maximal accepted difference of precursor m/z values of +spectra for grouping them into \emph{precursor groups}. For +\code{filterPrecursorIsotopes()}: passed directly to the \code{\link[=isotopologues]{isotopologues()}} +function. For \code{filterValues()}: \code{numeric} of any length allowing to +define a maximal accepted difference between user input \code{values} and the +\code{spectraVariables} values. 
If it is not equal to the length of the +value provided with parameter \code{spectraVariables}, \code{tolerance[1]} will be +recycled. Default is \code{tolerance = 0}.} + +\item{ppm}{For \code{filterMzValues()} and \code{reduceSpectra()}: \code{numeric(1)} +defining a relative, m/z-dependent, maximal accepted difference between +m/z values for peaks to be matched (or grouped). +For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the relative +maximal accepted difference of precursor m/z values of spectra for +grouping them into \emph{precursor groups}. For \code{filterPrecursorIsotopes()}: +passed directly to the \code{\link[=isotopologues]{isotopologues()}} function. +For \code{filterValues()}: \code{numeric} of any length allowing to define +a maximal accepted difference between user input \code{values} and the +\code{spectraVariables} values. If it is not equal to the length of the +value provided with parameter \code{spectraVariables}, \code{ppm[1]} will be +recycled.} + +\item{charge}{For \code{deisotopeSpectra()}: expected charge of the ionized +compounds. See \code{\link[=isotopologues]{isotopologues()}} for details.} + +\item{object}{\code{Spectra} object.} + +\item{mz}{For \code{filterIsolationWindow()}: \code{numeric(1)} with the m/z value to +filter the object. For \code{filterPrecursorMz()} and \code{filterMzRange()}: +\code{numeric(2)} defining the lower and upper m/z boundary. +For \code{filterMzValues()} and \code{filterPrecursorMzValues()}: \code{numeric} with +the m/z values to match peaks or precursor m/z against. +For \code{filterPrecursorPeaks()}: \code{character(1)} defining whether mass peaks +with an m/z matching the spectrum's precursor m/z (\code{mz = "=="}, +the default) or mass peaks with a m/z that is equal or larger +(\code{mz = ">="}) should be removed.} + +\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which +the function should be applied (defaults to all MS levels of \code{object}. +For \code{filterMsLevel()}: the MS level to which \code{object} should be +subsetted.} + +\item{spectraVariables}{For \code{selectSpectraVariables()}: \code{character} with the +names of the spectra variables to which the backend should be +subsetted. For \code{filterRanges()} and \code{filterValues()}: \code{character} +vector specifying the column(s) from \code{spectraData(object)} on which +to filter the data and that correspond to the the names of the +spectra variables that should be used for the filtering.} + +\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the +object.} + +\item{j}{For \code{[}: not supported.} + +\item{...}{Additional arguments.} + +\item{drop}{For \code{[}: not considered.} + +\item{n}{for \code{filterAcquisitionNum()}: \code{integer} with the acquisition +numbers to filter for.} + +\item{dataStorage}{For \code{filterDataStorage()}: \code{character} to define which +spectra to keep. +For \code{filterAcquisitionNum()}: optionally specify if filtering should +occur only for spectra of selected \code{dataStorage}.} + +\item{dataOrigin}{For \code{filterDataOrigin()}: \code{character} to define which +spectra to keep. 
+For \code{filterAcquisitionNum()}: optionally specify if filtering should +occurr only for spectra of selected \code{dataOrigin}.} + +\item{halfWindowSize}{For \code{filterFourierTransformArtefacts()}: \code{numeric(1)} +defining the m/z window left and right of a peak where to remove +fourier transform artefacts.} + +\item{threshold}{For \code{filterFourierTransformArtefacts()}: the relative +intensity (to a peak) below which peaks are considered fourier +artefacts. Defaults to \code{threshold = 0.2} hence removing peaks that +have an intensity below 0.2 times the intensity of the tested peak +(within the selected \code{halfWindowSize}).} + +\item{keepIsotopes}{For \code{filterFourierTransformArtefacts()}: whether isotope +peaks should not be removed as fourier artefacts.} + +\item{maxCharge}{For \code{filterFourierTransformArtefacts()}: the maximum charge +to be considered for isotopes.} + +\item{isotopeTolerance}{For \code{filterFourierTransformArtefacts()}: the m/z +\code{tolerance} to be used to define whether peaks might be isotopes of +the current tested peak.} + +\item{intensity}{For \code{filterIntensity()}: \code{numeric} of length 1 or 2 +defining either the lower or the lower and upper intensity limit for the +filtering, or a \code{function} that takes the intensities as input and +returns a \code{logical} (same length then peaks in the spectrum) whether the +peak should be retained or not. Defaults to \code{intensity = c(0, Inf)} thus +only peaks with \code{NA} intensity are removed.} + +\item{keep}{For \code{filterMzValues()} and \code{filterMzRange()}: \code{logical(1)} +whether the matching peaks should be retained (\code{keep = TRUE}, the +default) or dropped (\code{keep = FALSE}).} + +\item{polarity}{for \code{filterPolarity()}: \code{integer} specifying the polarity to +to subset \code{object}.} + +\item{z}{For \code{filterPrecursorCharge()}: \code{integer()} with the precursor +charges to be used as filter.} + +\item{acquisitionNum}{for \code{filterPrecursorScan()}: \code{integer} with the +acquisition number of the spectra to which the object should be +subsetted.} + +\item{f}{For \code{filterPrecursorScan()}: defining which spectra +belong to the same original data file (sample): Defaults to +\code{f = dataOrigin(x)}.} + +\item{rt}{for \code{filterRt()}: \code{numeric(2)} defining the retention time range to +be used to subset/filter \code{object}.} + +\item{ranges}{for \code{filterRanges()}: A \code{numeric} vector of paired values +(upper and lower boundary) that define the ranges to filter the \code{object}. +These paired values need to be in the same order as the +\code{spectraVariables} parameter (see below).} + +\item{match}{For \code{filterRanges()} and \code{filterValues()}: \code{character(1) } +defining whether the condition has to match for all provided +\code{ranges}/\code{values} (\code{match = "all"}; the default), or for any of them +(\code{match = "any"}) for spectra to be retained.} + +\item{values}{for \code{filterValues()}: A \code{numeric} vector that define the +values to filter the Spectra data. These values need to be in the same +order as the \code{spectraVariables} parameter.} +} +\description{ +A variety of functions to filter or subset \code{Spectra} objects are available. +These can be generally separated into two main classes: I) \emph{classical} +subset operations that immediately reduce the number of spectra in the +object and II) filters that reduce the \strong{content} of the object without +changing its length (i.e. 
the number of spectra). The latter can be further
+subdivided into functions that affect the content of the \code{spectraData} (i.e.
+the general spectrum metadata) and those that reduce the content of the
+object's \code{peaksData} (i.e. the m/z and intensity values of a spectrum's
+mass peaks).
+
+A description of the functions from these 3 different categories is given below
+in sections \emph{Subset \code{Spectra}}, \emph{Filter content of \code{spectraData()}} and
+\emph{Filter content of \code{peaksData()}}, respectively.
+}
+\section{Subset \code{Spectra}}{
+
+
+These functions affect the number of spectra in a \code{Spectra} object, creating
+a subset of the original object without affecting its content.
+\itemize{
+\item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method
+\strong{always} returns a \code{Spectra} object.
+\item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching
+the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or
+\code{dataStorage} is also provided, \code{object} is subsetted to the spectra with
+an acquisition number equal to \code{n} \strong{in spectra with matching dataOrigin
+or dataStorage values}, retaining all other spectra.
+Returns the filtered \code{Spectra}.
+\item \code{filterDataOrigin()}: filters the object retaining spectra matching the
+provided \code{dataOrigin}. Parameter \code{dataOrigin} has to be of type
+\code{character} and needs to match exactly the data origin value of the
+spectra to subset.
+Returns the filtered \code{Spectra} object (with spectra ordered according to
+the provided \code{dataOrigin} parameter).
+\item \code{filterDataStorage()}: filters the object retaining spectra stored in the
+specified \code{dataStorage}. Parameter \code{dataStorage} has to be of type
+\code{character} and needs to match exactly the data storage value of the
+spectra to subset.
+Returns the filtered \code{Spectra} object (with spectra ordered according to
+the provided \code{dataStorage} parameter).
+\item \code{filterEmptySpectra()}: removes empty spectra (i.e. spectra without peaks).
+Returns the filtered \code{Spectra} object (with spectra in their
+original order).
+\item \code{filterIsolationWindow()}: retains spectra that contain \code{mz} in their
+isolation window m/z range (i.e. with an \code{isolationWindowLowerMz} <= \code{mz}
+and \code{isolationWindowUpperMz} >= \code{mz}). Returns the filtered \code{Spectra}
+object (with spectra in their original order).
+\item \code{filterMsLevel()}: filters the object by MS level keeping only spectra
+matching the MS level specified with argument \code{msLevel}. Returns the filtered
+\code{Spectra} (with spectra in their original order).
+\item \code{filterPolarity()}: filters the object keeping only spectra matching the
+provided polarity. Returns the filtered \code{Spectra} (with spectra in their
+original order).
+\item \code{filterPrecursorCharge()}: retains spectra with the defined precursor
+charge(s).
+\item \code{filterPrecursorIsotopes()}: groups MS2 spectra based on their precursor
+m/z and precursor intensity into predicted isotope groups and keeps for each
+only the spectrum representing the monoisotopic precursor. MS1 spectra
+are returned as is. See documentation for \code{deisotopeSpectra()} below for
+details on isotope prediction and parameter description.
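+A minimal sketch (assuming \code{sps} is a \code{Spectra} containing MS2
+spectra; the \code{ppm} value used here is an arbitrary choice):
+\preformatted{
+## Keep for each predicted isotope group of MS2 spectra only the spectrum
+## representing the monoisotopic precursor; MS1 spectra are returned as is.
+sps_mono <- filterPrecursorIsotopes(sps, ppm = 10)
+length(sps)
+length(sps_mono)
+}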
+\item \code{filterPrecursorMaxIntensity()}: filters the \code{Spectra} keeping for groups
+of (MS2) spectra with similar precursor m/z values (given parameters
+\code{ppm} and \code{tolerance}) the one with the highest precursor intensity. The
+function filters only MS2 spectra and returns all MS1 spectra. If
+precursor intensities are \code{NA} for all spectra within a spectra group, the
+first spectrum of that group is returned.
+Note: some manufacturers don't provide precursor intensities. These can
+however also be estimated with \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}}.
+\item \code{filterPrecursorMzRange()} (previously \code{filterPrecursorMz()} which is now
+deprecated): retains spectra with a precursor m/z within the
+provided m/z range. See examples for details on selecting spectra with
+a precursor m/z for a target m/z accepting a small difference in \emph{ppm}.
+\item \code{filterPrecursorMzValues()}: retains spectra with precursor m/z matching
+any of the provided m/z values (given \code{ppm} and \code{tolerance}). Spectra with
+missing precursor m/z value (e.g. MS1 spectra) are dropped.
+\item \code{filterPrecursorScan()}: retains parent (e.g. MS1) and children scans (e.g.
+MS2) of acquisition number \code{acquisitionNum}. Returns the filtered
+\code{Spectra} (with spectra in their original order). Parameter \code{f} allows to
+define which spectra belong to the same sample or original data file
+(defaults to \code{f = dataOrigin(object)}).
+\item \code{filterRanges()}: allows filtering of the \code{Spectra} object based on user
+defined \emph{numeric} ranges (parameter \code{ranges}) for one or more available
+spectra variables in object (spectra variable names can be specified with
+parameter \code{spectraVariables}). Spectra for which the value of a spectra
+variable is within its defined range are retained. If multiple
+ranges/spectra variables are defined, the \code{match} parameter can be used
+to specify whether all conditions (\code{match = "all"}; the default) or if
+any of the conditions must match (\code{match = "any"}; all spectra for which
+values are within any of the provided ranges are retained).
+\item \code{filterRt()}: retains spectra of MS level \code{msLevel} with retention
+times (in seconds) within (\code{>=}) \code{rt[1]} and (\code{<=})
+\code{rt[2]}. Returns the filtered \code{Spectra} (with spectra in their
+original order).
+\item \code{filterValues()}: allows filtering of the \code{Spectra} object based on
+similarities of \emph{numeric} values of one or more \code{spectraVariables(object)}
+(parameter \code{spectraVariables}) to provided values (parameter \code{values})
+given acceptable differences (parameters \code{tolerance} and \code{ppm}). If multiple
+values/spectra variables are defined, the \code{match} parameter can be used
+to specify whether all conditions (\code{match = "all"}; the default) or if
+any of the conditions must match (\code{match = "any"}; all spectra for which
+values are within any of the provided ranges are retained).
+}
+}
+
+\section{Filter content of \code{spectraData()}}{
+
+
+The functions described in this section filter the content of a
+\code{Spectra}'s spectra data, i.e. they affect the values of, or remove
+complete, spectra variables. None of these functions reduces the object's
+number of spectra.
+\itemize{
+\item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the
+object's \code{spectraData}) that contain only missing values (\code{NA}). Note that
+while columns with only \code{NA}s are removed, a \code{spectraData()} call after
+\code{dropNaSpectraVariables()} might still show columns containing \code{NA} values
+for \emph{core} spectra variables. The total number of spectra is not changed
+by this function.
+\item \code{selectSpectraVariables()}: reduces the information within the object to
+the selected spectra variables: all data for variables not specified will
+be dropped. For mandatory columns (i.e., those listed by
+\code{\link[=coreSpectraVariables]{coreSpectraVariables()}}, such as \emph{msLevel}, \emph{rtime} ...) only
+the values will be dropped but not the variable itself. Additional (or
+user defined) spectra variables will be completely removed.
+Returns the filtered \code{Spectra}.
+}
+}
+
+\section{Filter content of \code{peaksData()}}{
+
+
+The functions described in this section filter the content of the
+\code{Spectra}'s peaks data, i.e. either the number or the values (\emph{m/z} or
+intensity values) of the mass peaks. Note that the actual operation is only
+executed once the peaks data is accessed (through \code{peaksData()},
+\code{mz()} or \code{intensity()}) or \code{applyProcessing()} is called.
+These operations don't affect the number of spectra in the \code{Spectra} object.
+\itemize{
+\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the
+monoisotopic peak for groups of isotopologues. Isotopologues are
+estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the
+\emph{MetaboCoreUtils} package. Note that
+the default parameters for isotope prediction/detection have been
+determined using data from the Human Metabolome Database (HMDB) and
+isotopes for elements other than CHNOPS might not be detected. See
+parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for
+more information. The approach and code used to define the parameters for
+isotope prediction are described
+\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}.
+\item \code{filterFourierTransformArtefacts()}: removes (Orbitrap) fast Fourier
+transform artefact peaks from spectra (see examples below). The function
+iterates through all intensity-ordered peaks in a spectrum and removes all
+peaks with an m/z within +/- \code{halfWindowSize} of the current peak if their
+intensity is lower than \code{threshold} times the current peak's intensity.
+Additional parameters \code{keepIsotopes}, \code{maxCharge} and \code{isotopeTolerance}
+allow to avoid removing potential \verb{[13]C} isotope peaks (\code{maxCharge}
+being the maximum charge that should be considered and \code{isotopeTolerance}
+the absolute acceptable tolerance for matching their m/z).
+See \code{\link[=filterFourierTransformArtefacts]{filterFourierTransformArtefacts()}} for details and background and
+\code{deisotopeSpectra()} for an alternative.
+\item \code{filterIntensity()}: filters mass peaks in each spectrum keeping only
+those with intensities that are within the provided range or match the
+criteria of the provided function. For the former, parameter \code{intensity}
+has to be a \code{numeric} defining the intensity range, for the latter a
+\code{function} that takes the intensity values of the spectrum and returns
+a \code{logical} indicating whether the peak should be retained (see examples
+below for details) - additional parameters to the function can be passed
+with \code{...}.
+To remove only peaks with intensities below a certain threshold, say
+100, use \code{intensity = c(100, Inf)}. Note: also a single value can be
+passed with the \code{intensity} parameter in which case an upper limit of
+\code{Inf} is used.
+Note that this function also removes peaks with missing intensities
+(i.e. an intensity of \code{NA}). Parameter \code{msLevel.} allows to restrict the
+filtering to spectra of the specified MS level(s).
+\item \code{filterMzRange()}: filters mass peaks in the object keeping or removing
+those in each spectrum that are within the provided m/z range. Whether
+peaks are retained or removed can be configured with parameter \code{keep}
+(default \code{keep = TRUE}).
+\item \code{filterMzValues()}: filters mass peaks in the object keeping all
+peaks in each spectrum that match the provided m/z value(s) (for
+\code{keep = TRUE}, the default) or removing all of them (for \code{keep = FALSE}).
+The m/z matching also considers the absolute \code{tolerance} and m/z-relative
+\code{ppm} values. \code{tolerance} and \code{ppm} have to be of length 1.
+\item \code{filterPeaksRanges()}: filters mass peaks of a \code{Spectra} object using any
+set of range-based filters on numeric spectra or peaks variables. See
+\code{\link[=filterPeaksRanges]{filterPeaksRanges()}} for more information.
+\item \code{filterPrecursorPeaks()}: removes peaks from each spectrum in \code{object} with
+an m/z equal or larger than the m/z of the precursor, depending on the
+value of parameter \code{mz}: for \code{mz = "=="} (the default) peaks with matching
+m/z (considering an absolute and relative acceptable difference depending
+on \code{tolerance} and \code{ppm}, respectively) are removed. For \code{mz = ">="} all
+peaks with an m/z larger or equal to the precursor m/z (minus \code{tolerance}
+and the \code{ppm} of the precursor m/z) are removed. Parameter \code{msLevel.}
+allows to restrict the filter to certain MS levels (by default the filter
+is applied to all MS levels). Note that no peaks are removed if the
+precursor m/z is \code{NA} (e.g. typically for MS1 spectra).
+\item \code{reduceSpectra()}: keeps, for groups of peaks with similar m/z values
+(given \code{ppm} and \code{tolerance}), in each spectrum only the mass peak with the
+highest intensity, removing all other peaks and hence \emph{reducing} each
+spectrum to the highest intensity peaks per \emph{peak group}.
+Peak groups are defined using the \code{\link[=group]{group()}} function from the
+\emph{MsCoreUtils} package. See also the \code{\link[=combinePeaks]{combinePeaks()}} function for an
+alternative function to combine peaks within each spectrum.
+}
+}
+
+\examples{
+
+## Load a `Spectra` object with LC-MS/MS data.
+fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", + package = "msdata") +sps_dda <- Spectra(fl) +sps_dda + + +## -------- SUBSET SPECTRA -------- + +## Subset to the first 3 spectra +tmp <- sps_dda[1:3] +tmp +length(tmp) + +## Subset to all MS2 spectra; this could be done with [, or, more +## efficiently, with the `filterMsLevel` function: +sps_dda[msLevel(sps_dda) == 2L] +filterMsLevel(sps_dda, 2L) + +## Filter the object keeping only MS2 spectra with an precursor m/z value +## between a specified range: +filterPrecursorMzRange(sps_dda, c(80, 90)) + +## Filter the object to MS2 spectra with an precursor m/z matching a +## pre-defined value (given ppm and tolerance) +filterPrecursorMzValues(sps_dda, 85, ppm = 5, tolerance = 0.1) + +## The `filterRanges()` function allows to filter a `Spectra` based on +## numerical ranges of any of its (numerical) spectra variables. +## First, determine the variable(s) on which to base the filtering: +sv <- c("rtime", "precursorMz", "peaksCount") +## Note that ANY variables can be chosen here, and as many as wanted. + +## Define the ranges (pairs of values with lower and upper boundary) to be +## used for the individual spectra variables. The first two values will be +## used for the first spectra variable (e.g., `"rtime"` here), the next two +## for the second (e.g. `"precursorMz"` here) and so on: +ranges <- c(30, 350, 200, 500, 350, 600) + +## Input the parameters within the filterRanges function: +filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, + ranges = ranges) +filt_spectra + +## `filterRanges()` can also be used to filter a `Spectra` object with +## multiple ranges for the same `spectraVariable` (e.g, here `"rtime"`) +sv <- c("rtime", "rtime") +ranges <- c(30, 100, 200, 300) +filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, + ranges = ranges, match = "any") +filt_spectra + +## While `filterRanges()` filtered on numeric ranges, `filterValues()` +## allows to filter an object matching spectra variable values to user +## provided values (allowing to configure allowed differences using the +## `ppm` and `tolerance` parameters). +## First determine the variable(s) on which to base the filtering: +sv <- c("rtime", "precursorMz") +## Note that ANY variables can be chosen here, and as many as wanted. + +## Define the values that will be used to filter the spectra based on their +## similarities to their respective `spectraVariables`. +## The first values in the parameters values, tolerance and ppm will be +## used for the first spectra variable (e.g. `"rtime"` here), the next for +## the second (e.g. `"precursorMz"` here) and so on: +values <- c(350, 80) +tolerance <- c(100, 0.1) +ppm <- c(0, 50) + +## Input the parameters within the `filterValues()` function: +filt_spectra <- filterValues(sps_dda, spectraVariables = sv, + values = values, tolerance = tolerance, ppm = ppm) +filt_spectra + + +## -------- FILTER SPECTRA DATA -------- + +## Remove spectra variables without content (i.e. 
with only missing values) +sps_noNA <- dropNaSpectraVariables(sps_dda) + +## This reduced the size of the object slightly +print(object.size(sps_dda), unit = "MB") +print(object.size(sps_noNA), unit = "MB") + +## With the `selectSpectraVariables()` function it is in addition possible +## to subset the data of a `Spectra` to the selected columns/variables, +## keeping only their data: +tmp <- selectSpectraVariables(sps_dda, c("msLevel", "mz", "intensity", + "scanIndex")) +print(object.size(tmp), units = "MB") + +## Except the selected variables, all data is now removed. Accessing +## core spectra variables still works, but returns only NA +rtime(tmp) |> head() + + +## -------- FILTER PEAKS DATA -------- + +## `filterMzValues()` filters the mass peaks data of a `Spectra` retaining +## only those mass peaks with an m/z value matching the provided value(s). +sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), tolerance = 0.3) + +## The filtered `Spectra` has the same length +length(sps_dda) +length(sps_sub) + +## But the number of mass peaks changed +lengths(sps_dda) |> head() +lengths(sps_sub) |> head() + +## This function can also be used to remove specific peaks from a spectrum +## by setting `keep = FALSE`. +sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), + tolerance = 0.3, keep = FALSE) +lengths(sps_sub) |> head() + +## With the `filterMzRange()` function it is possible to keep (or remove) +## mass peaks with m/z values within a specified numeric range. +sps_sub <- filterMzRange(sps_dda, mz = c(100, 150)) +lengths(sps_sub) |> head() + +## See also the `filterPeaksRanges()` function for a more flexible framework +## to filter mass peaks + + +## Removing fourier transform artefacts seen in Orbitra data. + +## Loading an Orbitrap spectrum with artefacts. +data(fft_spectrum) +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + +fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) +fft_spectrum +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + +## Using a few examples peaks in your data you can optimize the parameters +fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, + halfWindowSize = 0.2, + threshold = 0.005, + keepIsotopes = TRUE, + maxCharge = 5, + isotopeTolerance = 0.005 + ) + +fft_spectrum_filtered +length(mz(fft_spectrum_filtered)[[1]]) +plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + + +## *Reducing* a `Spectra` keeping for groups of mass peaks (characterized +## by similarity of their m/z values) only one representative peak. This +## function helps cleaning fragment spectra. +## Filter the data set to MS2 spectra +ms2 <- filterMsLevel(sps_dda, 2L) + +## For groups of fragment peaks with a difference in m/z < 0.1, keep only +## the largest one. +ms2_red <- reduceSpectra(ms2, ppm = 0, tolerance = 0.1) +lengths(ms2) |> tail() +lengths(ms2_red) |> tail() +} +\seealso{ +\itemize{ +\item \code{\link[=combineSpectra]{combineSpectra()}} for functions to combine or aggregate \code{Spectra}. 
+\item \code{\link[=combinePeaks]{combinePeaks()}} for functions to combine or aggregate a \code{Spectra}'s +\code{peaksData()} +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf +} diff --git a/man/spectraData.Rd b/man/spectraData.Rd new file mode 100644 index 00000000..49d2bee3 --- /dev/null +++ b/man/spectraData.Rd @@ -0,0 +1,598 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{spectraData} +\alias{spectraData} +\alias{acquisitionNum} +\alias{centroided} +\alias{collisionEnergy} +\alias{dataOrigin} +\alias{dataStorage} +\alias{intensity} +\alias{ionCount} +\alias{isCentroided} +\alias{isEmpty} +\alias{isolationWindowLowerMz} +\alias{isolationWindowUpperMz} +\alias{isolationWindowTargetMz} +\alias{lengths} +\alias{msLevel} +\alias{mz} +\alias{peaksData} +\alias{peaksVariables} +\alias{polarity} +\alias{precursorCharge} +\alias{precursorIntensity} +\alias{precursorMz} +\alias{rtime} +\alias{scanIndex} +\alias{smoothed} +\alias{spectraNames} +\alias{spectraVariables} +\alias{tic} +\alias{uniqueMsLevels} +\alias{asDataFrame} +\alias{acquisitionNum,Spectra-method} +\alias{centroided,Spectra-method} +\alias{centroided<-,Spectra-method} +\alias{collisionEnergy,Spectra-method} +\alias{collisionEnergy<-,Spectra-method} +\alias{coreSpectraVariables} +\alias{dataOrigin,Spectra-method} +\alias{dataOrigin<-,Spectra-method} +\alias{dataStorage,Spectra-method} +\alias{intensity,Spectra-method} +\alias{ionCount,Spectra-method} +\alias{isCentroided,Spectra-method} +\alias{isEmpty,Spectra-method} +\alias{isolationWindowLowerMz,Spectra-method} +\alias{isolationWindowLowerMz<-,Spectra-method} +\alias{isolationWindowTargetMz,Spectra-method} +\alias{isolationWindowTargetMz<-,Spectra-method} +\alias{isolationWindowUpperMz,Spectra-method} +\alias{isolationWindowUpperMz<-,Spectra-method} +\alias{length,Spectra-method} +\alias{lengths,Spectra-method} +\alias{msLevel,Spectra-method} +\alias{mz,Spectra-method} +\alias{peaksData,Spectra-method} +\alias{peaksVariables,Spectra-method} +\alias{polarity,Spectra-method} +\alias{polarity<-,Spectra-method} +\alias{precScanNum,Spectra-method} +\alias{precursorCharge,Spectra-method} +\alias{precursorIntensity,Spectra-method} +\alias{precursorMz,Spectra-method} +\alias{rtime,Spectra-method} +\alias{rtime<-,Spectra-method} +\alias{scanIndex,Spectra-method} +\alias{smoothed,Spectra-method} +\alias{smoothed<-,Spectra-method} +\alias{spectraData,Spectra-method} +\alias{spectraData<-,Spectra-method} +\alias{spectraNames,Spectra-method} +\alias{spectraNames<-,Spectra-method} +\alias{spectraVariables,Spectra-method} +\alias{tic,Spectra-method} +\alias{uniqueMsLevels,Spectra-method} +\alias{$,Spectra-method} +\alias{$<-,Spectra-method} +\alias{[[,Spectra-method} +\alias{[[<-,Spectra-method} +\title{Accessing mass spectrometry data} +\usage{ +asDataFrame( + object, + i = seq_along(object), + spectraVars = spectraVariables(object) +) + +\S4method{acquisitionNum}{Spectra}(object) + +\S4method{centroided}{Spectra}(object) + +\S4method{centroided}{Spectra}(object) <- value + +\S4method{collisionEnergy}{Spectra}(object) + +\S4method{collisionEnergy}{Spectra}(object) <- value + +coreSpectraVariables() + +\S4method{dataOrigin}{Spectra}(object) + +\S4method{dataOrigin}{Spectra}(object) <- value + +\S4method{dataStorage}{Spectra}(object) + +\S4method{intensity}{Spectra}(object, f = processingChunkFactor(object), ...) 
+ +\S4method{ionCount}{Spectra}(object) + +\S4method{isCentroided}{Spectra}(object, ...) + +\S4method{isEmpty}{Spectra}(x) + +\S4method{isolationWindowLowerMz}{Spectra}(object) + +\S4method{isolationWindowLowerMz}{Spectra}(object) <- value + +\S4method{isolationWindowTargetMz}{Spectra}(object) + +\S4method{isolationWindowTargetMz}{Spectra}(object) <- value + +\S4method{isolationWindowUpperMz}{Spectra}(object) + +\S4method{isolationWindowUpperMz}{Spectra}(object) <- value + +\S4method{length}{Spectra}(x) + +\S4method{lengths}{Spectra}(x, use.names = FALSE) + +\S4method{msLevel}{Spectra}(object) + +\S4method{mz}{Spectra}(object, f = processingChunkFactor(object), ...) + +\S4method{peaksData}{Spectra}( + object, + columns = c("mz", "intensity"), + f = processingChunkFactor(object), + ..., + BPPARAM = bpparam() +) + +\S4method{peaksVariables}{Spectra}(object) + +\S4method{polarity}{Spectra}(object) + +\S4method{polarity}{Spectra}(object) <- value + +\S4method{precScanNum}{Spectra}(object) + +\S4method{precursorCharge}{Spectra}(object) + +\S4method{precursorIntensity}{Spectra}(object) + +\S4method{precursorMz}{Spectra}(object) + +\S4method{rtime}{Spectra}(object) + +\S4method{rtime}{Spectra}(object) <- value + +\S4method{scanIndex}{Spectra}(object) + +\S4method{smoothed}{Spectra}(object) + +\S4method{smoothed}{Spectra}(object) <- value + +\S4method{spectraData}{Spectra}(object, columns = spectraVariables(object)) + +\S4method{spectraData}{Spectra}(object) <- value + +\S4method{spectraNames}{Spectra}(object) + +\S4method{spectraNames}{Spectra}(object) <- value + +\S4method{spectraVariables}{Spectra}(object) + +\S4method{tic}{Spectra}(object, initial = TRUE) + +\S4method{uniqueMsLevels}{Spectra}(object, ...) + +\S4method{$}{Spectra}(x, name) + +\S4method{$}{Spectra}(x, name) <- value + +\S4method{[[}{Spectra}(x, i, j, ...) + +\S4method{[[}{Spectra}(x, i, j, ...) <- value +} +\arguments{ +\item{object}{A \code{Spectra} object.} + +\item{i}{For \code{asDataFrame()}: A \code{numeric} indicating which scans to coerce +to a \code{DataFrame} (default is \code{seq_along(object)}).} + +\item{spectraVars}{\code{character()} indicating what spectra variables to add to +the \code{DataFrame}. Default is \code{spectraVariables(object)}, i.e. all +available variables.} + +\item{value}{A vector with values to replace the respective spectra +variable. Needs to be of the correct data type for the spectra variable.} + +\item{f}{For \code{intensity()}, \code{mz()} and \code{peaksData()}: factor defining how +data should be chunk-wise loaded an processed. Defaults to +\code{\link[=processingChunkFactor]{processingChunkFactor()}}.} + +\item{...}{Additional arguments.} + +\item{x}{A \code{Spectra} object.} + +\item{use.names}{For \code{lengths()}: ignored.} + +\item{columns}{For \code{spectraData()} accessor: optional \code{character} with +column names (spectra variables) that should be included in the +returned \code{DataFrame}. By default, all columns are returned. +For \code{peaksData()} accessor: optional \code{character} with requested columns +in the individual \code{matrix} of the returned \code{list}. Defaults to +\code{c("mz", "value")} but any values returned by \code{peaksVariables(object)} +with \code{object} being the \code{Spectra} object are supported.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. 
See also \code{\link[=processingChunkSize]{processingChunkSize()}} for more information +on parallel processing.} + +\item{initial}{For \code{tic()}: \code{logical(1)} whether the initially +reported total ion current should be reported, or whether the +total ion current should be (re)calculated on the actual data +(\code{initial = FALSE}, same as \code{ionCount()}).} + +\item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return +or set.} + +\item{j}{For \code{[}: not supported.} +} +\description{ +As detailed in the documentation of the \link{Spectra} class, a \code{Spectra} object +is a container for mass spectrometry (MS) data that includes both the mass +peaks data (or \emph{peaks data}, generally \emph{m/z} and intensity values) as well +as spectra metadata (so called \emph{spectra variables}). Spectra variables +generally define one value per spectrum, while for peaks variables one value +per mass peak is defined and hence multiple values per spectrum (depending +on the number of mass peaks of a spectrum). + +Data can be extracted from a \code{Spectra} object using dedicated accessor +functions or also using the \code{$} operator. Depending on the backend class +used by the \code{Spectra} to represent the data, data can also be added or +replaced (again, using dedicated functions or using \verb{$<-}). +} +\section{Spectra variables}{ + + +A common set of \emph{core spectra variables} are defined for \code{Spectra}. These +have a pre-defined data type and each \code{Spectra} will return a value for +these if requested. If no value for a spectra variable is defined, a missing +value (of the correct data type) is returned. The list of core spectra +variables and their respective data type is: +\itemize{ +\item \emph{acquisitionNum} \code{integer(1)}: the index of acquisition of a spectrum +during an MS run. +\item \emph{centroided} \code{logical(1)}: whether the spectrum is in profile or centroid +mode. +\item \emph{collisionEnergy} \code{numeric(1)}: collision energy used to create an MSn +spectrum. +\item \emph{dataOrigin} \code{character(1)}: the \emph{origin} of the spectrum's data, e.g. the +mzML file from which it was read. +\item \emph{dataStorage} \code{character(1)}: the (current) storage location of the +spectrum data. This value depends on the backend used to handle and +provide the data. For an \emph{in-memory} backend like the \code{MsBackendDataFrame} +this will be \code{""}, for an on-disk backend such as the +\code{MsBackendHdf5Peaks} it will be the name of the HDF5 file where the +spectrum's peak data is stored. +\item \emph{isolationWindowLowerMz} \code{numeric(1)}: lower m/z for the isolation +window in which the (MSn) spectrum was measured. +\item \emph{isolationWindowTargetMz} \code{numeric(1)}: the target m/z for the isolation +window in which the (MSn) spectrum was measured. +\item \emph{isolationWindowUpperMz} \code{numeric(1)}: upper m/z for the isolation window +in which the (MSn) spectrum was measured. +\item \emph{msLevel} \code{integer(1)}: the MS level of the spectrum. +\item \emph{polarity} \code{integer(1)}: the polarity of the spectrum (\code{0} and \code{1} +representing negative and positive polarity, respectively). +\item \emph{precScanNum} \code{integer(1)}: the scan (acquisition) number of the precursor +for an MSn spectrum. +\item \emph{precursorCharge} \code{integer(1)}: the charge of the precursor of an MSn +spectrum. +\item \emph{precursorIntensity} \code{numeric(1)}: the intensity of the precursor of an +MSn spectrum. 
+\item \emph{precursorMz} \code{numeric(1)}: the m/z of the precursor of an MSn spectrum.
+\item \emph{rtime} \code{numeric(1)}: the retention time of a spectrum.
+\item \emph{scanIndex} \code{integer(1)}: the index of a spectrum within a (raw) file.
+\item \emph{smoothed} \code{logical(1)}: whether the spectrum was smoothed.
+}
+
+For each of these spectra variables a dedicated accessor function is defined
+(such as \code{msLevel()} or \code{rtime()}) that allows to extract the values of
+that spectra variable for all spectra in a \code{Spectra} object. Also,
+replacement functions are defined, but not all backends might support
+replacing values for spectra variables. As described above, additional
+spectra variables can be defined or added. The \code{spectraVariables()} function
+can be used to list all spectra variables available in a \code{Spectra} object.
+
+Values for multiple spectra variables, or for all spectra variables, can be
+extracted with the \code{spectraData()} function.
+}
+
+\section{Peaks variables}{
+
+
+\code{Spectra} objects also provide mass peak data with the \emph{m/z} and intensity
+values being the \emph{core} peaks variables:
+\itemize{
+\item \emph{intensity} \code{numeric}: intensity values for the spectrum's peaks.
+\item \emph{mz} \code{numeric}: the m/z values for the spectrum's peaks.
+}
+
+Values for these can be extracted with the \code{mz()} and \code{intensity()}
+functions, or the \code{peaksData()} function. The former functions return a
+\code{NumericList} with the respective values, while the latter returns a \code{List}
+with \code{numeric} two-column matrices. The list of peaks matrices can also
+be extracted using \code{as(x, "list")} or \code{as(x, "SimpleList")} with \code{x} being
+a \code{Spectra} object.
+
+Some \code{Spectra}/backends also provide values for additional peaks variables.
+The set of available peaks variables can be extracted with the
+\code{peaksVariables()} function.
+}
+
+\section{Functions to access MS data}{
+
+
+The set of available functions to extract data from, or set data in, a
+\code{Spectra} object is (in alphabetical order) listed below. Note that there
+are also other functions to extract information from a \code{Spectra} object
+documented in \code{\link[=addProcessing]{addProcessing()}}.
+\itemize{
+\item \code{$}, \verb{$<-}: gets (or sets) a spectra variable for all spectra in \code{object}.
+See examples for details. Note that replacing values of a peaks variable
+is not supported with a non-empty processing queue, i.e. if any filtering
+or data manipulations on the peaks data were performed. In these cases
+\code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all cached data
+operations.
+\item \code{[[}, \verb{[[<-}: access or set/add a single spectrum variable (column) in the
+backend.
+\item \code{acquisitionNum()}: returns the acquisition number of each
+spectrum. Returns an \code{integer} of length equal to the number of
+spectra (with \code{NA_integer_} if not available).
+\item \code{asDataFrame()}: converts the \code{Spectra} to a \code{DataFrame} (in long format)
+containing all data. Returns a \code{DataFrame}.
+\item \code{centroided()}, \verb{centroided<-}: gets or sets the centroiding
+information of the spectra. \code{centroided()} returns a \code{logical}
+vector of length equal to the number of spectra with \code{TRUE} if a
+spectrum is centroided, \code{FALSE} if it is in profile mode and \code{NA}
+if it is undefined. See also \code{isCentroided()} for estimating from
+the spectrum data whether the spectrum is centroided. \code{value}
+for \verb{centroided<-} is either a single \code{logical} or a \code{logical} of
+length equal to the number of spectra in \code{object}.
+\item \code{collisionEnergy()}, \verb{collisionEnergy<-}: gets or sets the
+collision energy for all spectra in \code{object}. \code{collisionEnergy()}
+returns a \code{numeric} with length equal to the number of spectra
+(\code{NA_real_} if not present/defined), \verb{collisionEnergy<-} takes a
+\code{numeric} of length equal to the number of spectra in \code{object}.
+\item \code{coreSpectraVariables()}: returns the \emph{core} spectra variables along with
+their expected data type.
+\item \code{dataOrigin()}, \verb{dataOrigin<-}: gets or sets the \emph{data origin} for each
+spectrum. \code{dataOrigin()} returns a \code{character} vector (same length as
+\code{object}) with the origin of the spectra. \verb{dataOrigin<-} expects a
+\code{character} vector (same length as \code{object}) with the replacement
+values for the data origin of each spectrum.
+\item \code{dataStorage()}: returns a \code{character} vector (same length as \code{object})
+with the data storage location of each spectrum.
+\item \code{intensity()}: gets the intensity values from the spectra. Returns
+a \code{\link[=NumericList]{NumericList()}} of \code{numeric} vectors (intensity values for each
+spectrum). The length of the list is equal to the number of
+\code{spectra} in \code{object}.
+\item \code{ionCount()}: returns a \code{numeric} with the sum of intensities for
+each spectrum. If the spectrum is empty (see \code{isEmpty()}),
+\code{NA_real_} is returned.
+\item \code{isCentroided()}: a heuristic approach assessing if the spectra in
+\code{object} are in profile or centroided mode. The function takes
+the \code{qtl}th quantile top peaks, then calculates the difference
+between adjacent m/z values and returns \code{TRUE} if the first
+quartile is greater than \code{k}. (See \code{Spectra:::.isCentroided()} for
+the code.)
+\item \code{isEmpty()}: checks whether a spectrum in \code{object} is empty
+(i.e. does not contain any peaks). Returns a \code{logical} vector of
+length equal to the number of spectra.
+\item \code{isolationWindowLowerMz()}, \verb{isolationWindowLowerMz<-}: gets or sets the
+lower m/z boundary of the isolation window.
+\item \code{isolationWindowTargetMz()}, \verb{isolationWindowTargetMz<-}: gets or sets the
+target m/z of the isolation window.
+\item \code{isolationWindowUpperMz()}, \verb{isolationWindowUpperMz<-}: gets or sets the
+upper m/z boundary of the isolation window.
+\item \code{length()}: gets the number of spectra in the object.
+\item \code{lengths()}: gets the number of peaks (m/z-intensity values) per
+spectrum. Returns an \code{integer} vector (length equal to the
+number of spectra). For empty spectra, \code{0} is returned.
+\item \code{msLevel()}: gets the spectra's MS level. Returns an integer vector (names
+being spectrum names, length equal to the number of spectra) with the MS
+level for each spectrum.
+\item \code{mz()}: gets the mass-to-charge ratios (m/z) from the
+spectra. Returns a \code{\link[=NumericList]{NumericList()}} of length equal to the number of
+spectra, each element a \code{numeric} vector with the m/z values of
+one spectrum.
+\item \code{peaksData()}: gets the \emph{peaks} data for all spectra in \code{object}. Peaks
+data consist of the m/z and intensity values as well as possible additional
+annotations (variables) of all peaks of each spectrum. The function
+returns a \code{\link[=SimpleList]{SimpleList()}} of two dimensional arrays (either \code{matrix} or
+\code{data.frame}), with each array providing the values for the requested
+\emph{peak variables} (by default \code{"mz"} and \code{"intensity"}). Optional parameter
+\code{columns} is passed to the backend's \code{peaksData()} function to allow
+the selection of specific (or additional) peaks variables (columns) that
+should be extracted (if available). Importantly,
+it is \strong{not} guaranteed that each backend supports this parameter (while
+each backend must support extraction of \code{"mz"} and \code{"intensity"} columns).
+Parameter \code{columns} defaults to \code{c("mz", "intensity")} but any value
+returned by \code{peaksVariables(object)} is supported.
+Note also that it is possible to extract the peak data with
+\code{as(x, "list")} and \code{as(x, "SimpleList")} as a \code{list} and \code{SimpleList},
+respectively. Note however that, in contrast to \code{peaksData()}, \code{as()}
+does not support the parameter \code{columns}.
+\item \code{peaksVariables()}: lists the available variables for mass peaks provided
+by the backend. Default peak variables are \code{"mz"} and \code{"intensity"} (which
+all backends need to support and provide), but some backends might provide
+additional variables.
+These variables correspond to the column names of the peak data array
+returned by \code{peaksData()}.
+\item \code{polarity()}, \verb{polarity<-}: gets or sets the polarity for each
+spectrum. \code{polarity()} returns an \code{integer} vector (length equal
+to the number of spectra), with \code{0} and \code{1} representing negative
+and positive polarities, respectively. \verb{polarity<-} expects an
+\code{integer} vector of length 1 or equal to the number of spectra.
+\item \code{precursorCharge()}, \code{precursorIntensity()}, \code{precursorMz()},
+\code{precScanNum()}, \code{precAcquisitionNum()}: gets the charge (\code{integer}),
+intensity (\code{numeric}), m/z (\code{numeric}), scan index (\code{integer})
+and acquisition number (\code{integer}) of the precursor for MS level 2
+and above spectra from the object. Returns a vector of length equal to
+the number of spectra in \code{object}. \code{NA} are reported for MS1
+spectra or if no precursor information is available.
+\item \code{rtime()}, \verb{rtime<-}: gets or sets the retention times (in seconds)
+for each spectrum. \code{rtime()} returns a \code{numeric} vector (length
+equal to the number of spectra) with the retention time for each
+spectrum. \verb{rtime<-} expects a numeric vector with length equal
+to the number of spectra.
+\item \code{scanIndex()}: returns an \code{integer} vector with the \emph{scan index}
+for each spectrum. This represents the relative index of the
+spectrum within each file. Note that this can be different to the
+\code{acquisitionNum} of the spectrum which represents the index of the
+spectrum during acquisition/measurement (as reported in the mzML file).
+\item \code{smoothed()}, \verb{smoothed<-}: gets or sets whether a spectrum is
+\emph{smoothed}. \code{smoothed()} returns a \code{logical} vector of length equal
+to the number of spectra. \verb{smoothed<-} takes a \code{logical} vector
+of length 1 or equal to the number of spectra in \code{object}.
+\item \code{spectraData()}: gets general spectrum metadata (annotation, also called
+header). \code{spectraData()} returns a \code{DataFrame}. Note that this
+method does by default \strong{not} return m/z or intensity values.
+\item \verb{spectraData<-}: \strong{replaces} the full spectra data of the \code{Spectra} +object with the one provided with \code{value}. The \verb{spectraData<-} function +expects a \code{DataFrame} to be passed as value with the same number of rows +as there a spectra in \code{object}. Note that replacing values of +peaks variables is not supported with a non-empty processing queue, i.e. +if any filtering or data manipulations on the peaks data was performed. +In these cases \code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all +cached data operations and empty the processing queue. +\item \code{spectraNames()}, \verb{spectraNames<-}: gets or sets the spectra names. +\item \code{spectraVariables()}: returns a \code{character} vector with the +available spectra variables (columns, fields or attributes of each +spectrum) available in \code{object}. Note that \code{spectraVariables()} does not +list the \emph{peak variables} (\code{"mz"}, \code{"intensity"} and eventual additional +annotations for each MS peak). Peak variables are returned by +\code{peaksVariables()}. +\item \code{tic()}: gets the total ion current/count (sum of signal of a +spectrum) for all spectra in \code{object}. By default, the value +reported in the original raw data file is returned. For an empty +spectrum, \code{0} is returned. +\item \code{uniqueMsLevels()}: get the unique MS levels available in \code{object}. This +function is supposed to be more efficient than \code{unique(msLevel(object))}. +} +} + +\examples{ + +## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +## backend. +sciex_file <- dir(system.file("sciex", package = "msdata"), + full.names = TRUE) +sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +sciex + +## Get the number of spectra in the data set +length(sciex) + +## Get the number of mass peaks per spectrum - limit to the first 6 +lengths(sciex) |> head() + +## Get the MS level for each spectrum - limit to the first 6 spectra +msLevel(sciex) |> head() + +## Alternatively, we could also use $ to access a specific spectra variable. +## This could also be used to add additional spectra variables to the +## object (see further below). +sciex$msLevel |> head() + +## Get the intensity and m/z values. +intensity(sciex) +mz(sciex) + +## Convert a subset of the Spectra object to a long DataFrame. +asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) + +## Create a Spectra providing a `DataFrame` containing the spectrum data. + +spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) + +s <- Spectra(spd) +s + +## List all available spectra variables (i.e. spectrum data and metadata). +spectraVariables(s) + +## For all *core* spectrum variables accessor functions are available. These +## return NA if the variable was not set. +centroided(s) +dataStorage(s) +rtime(s) +precursorMz(s) + +## The core spectra variables are: +coreSpectraVariables() + +## Add an additional metadata column. +s$spectrum_id <- c("sp_1", "sp_2") + +## List spectra variables, "spectrum_id" is now also listed +spectraVariables(s) + +## Get the values for the new spectra variable +s$spectrum_id + +## Extract specific spectra variables. +spectraData(s, columns = c("spectrum_id", "msLevel")) + + +## -------- PEAKS VARIABLES AND DATA -------- + +## Get the peak data (m/z and intensity values). 
+pks <- peaksData(s)
+pks
+pks[[1]]
+pks[[2]]
+
+## Note that we could get the same result by coercing the `Spectra` to
+## a `list` or `SimpleList`:
+as(s, "list")
+as(s, "SimpleList")
+
+## Or use `mz()` and `intensity()` to extract the m/z and intensity values
+## separately
+mz(s)
+intensity(s)
+
+## Some `MsBackend` classes provide support for arbitrary peaks variables
+## (in addition to the mandatory `"mz"` and `"intensity"` values). Below
+## we create a simple data frame with an additional peak variable `"pk_ann"`
+## and create a `Spectra` with a `MsBackendMemory` for that data.
+## Importantly, the number of values (per spectrum) needs to be the same
+## for all peak variables.
+
+tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5))
+tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1))
+tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45))
+tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P"))
+
+## Create the Spectra. With parameter `peaksVariables` we can define
+## the columns in `tmp` that contain peaks variables.
+sps <- Spectra(tmp, source = MsBackendMemory(),
+    peaksVariables = c("mz", "intensity", "pk_ann"))
+peaksVariables(sps)
+
+## Extract just the m/z and intensity values
+peaksData(sps)[[1L]]
+
+## Extract the full peaks data
+peaksData(sps, columns = peaksVariables(sps))[[1L]]
+
+## Access just the pk_ann variable
+sps$pk_ann
+
+
+}
+\seealso{
+\itemize{
+\item \code{\link[=addProcessing]{addProcessing()}} for functions to analyze \code{Spectra}.
+\item \link{Spectra} for a general description of the \code{Spectra} object.
+}
+}
+\author{
+Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail
+}
From e80b18b750e5f5429cd924c94887aff36f6f5616 Mon Sep 17 00:00:00 2001
From: Johannes Rainer
Date: Wed, 25 Sep 2024 08:31:43 +0200
Subject: [PATCH 26/41] deps: import msLevel<-

---
 NAMESPACE              | 1 +
 R/MsBackendDataFrame.R | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/NAMESPACE b/NAMESPACE
index 0a9fbcf9..d354b3c5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -229,6 +229,7 @@ importMethodsFrom(ProtGenerics,"intensity<-")
 importMethodsFrom(ProtGenerics,"isolationWindowLowerMz<-")
 importMethodsFrom(ProtGenerics,"isolationWindowTargetMz<-")
 importMethodsFrom(ProtGenerics,"isolationWindowUpperMz<-")
+importMethodsFrom(ProtGenerics,"msLevel<-")
 importMethodsFrom(ProtGenerics,"mz<-")
 importMethodsFrom(ProtGenerics,"peaksData<-")
 importMethodsFrom(ProtGenerics,"polarity<-")
diff --git a/R/MsBackendDataFrame.R b/R/MsBackendDataFrame.R
index c13052b7..041be31e 100644
--- a/R/MsBackendDataFrame.R
+++ b/R/MsBackendDataFrame.R
@@ -279,6 +279,8 @@ setMethod("msLevel", "MsBackendDataFrame", function(object, ...)
{ }) #' @rdname hidden_aliases +#' +#' @importMethodsFrom ProtGenerics msLevel<- setReplaceMethod("msLevel", "MsBackendDataFrame", function(object, value) { if (!is.integer(value) && is.numeric(value)) value <- as.integer(value) From eb24ad07e6660a9d36a727d5b32a596e31abdfb9 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 25 Sep 2024 09:03:24 +0200 Subject: [PATCH 27/41] fix: fix exports --- NAMESPACE | 1 + R/MsBackend.R | 167 +++++++++++++++++++++++++++++++++++++++++ R/MsBackendDataFrame.R | 2 - man/MsBackend.Rd | 2 + 4 files changed, 170 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d354b3c5..7e79b6c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ exportMethods("intensity<-") exportMethods("isolationWindowLowerMz<-") exportMethods("isolationWindowTargetMz<-") exportMethods("isolationWindowUpperMz<-") +exportMethods("msLevel<-") exportMethods("mz<-") exportMethods("peaksData<-") exportMethods("polarity<-") diff --git a/R/MsBackend.R b/R/MsBackend.R index b89e5303..09741cd0 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -15,6 +15,7 @@ #' @aliases dataStorageBasePath,MsBackendMzR-method #' @aliases dataStorageBasePath<- #' @aliases dataStorageBasePath<-,MsBackendMzR-method +#' @aliases msLeveL<-,MsBackend-method #' #' @description #' @@ -474,6 +475,8 @@ #' vector (of length equal to the number of spectra) with the MS #' level for each spectrum (or `NA_integer_` if not available). #' +#' - `msLevel<-`: replaces the spectra's MS level. +#' #' - `mz()`: gets the mass-to-charge ratios (m/z) from the #' spectra. Returns a [NumericList()] or length equal to the number of #' spectra, each element a `numeric` vector with the m/z values of @@ -902,6 +905,8 @@ setValidity("MsBackend", function(object) { #' @exportMethod backendBpparam #' #' @rdname MsBackend +#' +#' @export setMethod("backendBpparam", signature = "MsBackend", function(object, BPPARAM = bpparam()) { BPPARAM @@ -912,6 +917,8 @@ setMethod("backendBpparam", signature = "MsBackend", #' @importMethodsFrom ProtGenerics backendInitialize #' #' @rdname MsBackend +#' +#' @export setMethod("backendInitialize", signature = "MsBackend", function(object, ...) { validObject(object) object @@ -927,6 +934,8 @@ setMethod("backendMerge", "list", function(object, ...) { #' @exportMethod backendMerge #' #' @rdname MsBackend +#' +#' @export setMethod("backendMerge", "MsBackend", function(object, ...) { stop("Not implemented for ", class(object), ".") }) @@ -936,11 +945,15 @@ setMethod("backendMerge", "MsBackend", function(object, ...) { #' @exportMethod backendParallelFactor #' #' @rdname MsBackend +#' +#' @export setMethod("backendParallelFactor", "MsBackend", function(object, ...) { factor() }) #' @rdname MsBackend +#' +#' @export setMethod("export", "MsBackend", function(object, ...) { stop(class(object), " does not support export of data; please provide a ", "backend that supports data export with parameter 'backend'.") @@ -951,6 +964,8 @@ setMethod("export", "MsBackend", function(object, ...) 
{ #' @importMethodsFrom ProtGenerics acquisitionNum #' #' @rdname MsBackend +#' +#' @export setMethod("acquisitionNum", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -960,6 +975,8 @@ setMethod("acquisitionNum", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics peaksData #' #' @rdname MsBackend +#' +#' @export setMethod("peaksData", "MsBackend", function(object, columns = c("mz", "intensity")) { stop("Not implemented for ", class(object), ".") @@ -970,6 +987,8 @@ setMethod("peaksData", "MsBackend", function(object, #' @importMethodsFrom ProtGenerics peaksVariables #' #' @rdname MsBackend +#' +#' @export setMethod("peaksVariables", "MsBackend", function(object) { c("mz", "intensity") }) @@ -981,6 +1000,8 @@ setMethod("peaksVariables", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics centroided #' #' @rdname MsBackend +#' +#' @export setMethod("centroided", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -990,6 +1011,8 @@ setMethod("centroided", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics centroided<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("centroided", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -999,6 +1022,8 @@ setReplaceMethod("centroided", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics collisionEnergy #' #' @rdname MsBackend +#' +#' @export setMethod("collisionEnergy", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1008,6 +1033,8 @@ setMethod("collisionEnergy", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics collisionEnergy<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("collisionEnergy", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1017,6 +1044,8 @@ setReplaceMethod("collisionEnergy", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics dataOrigin #' #' @rdname MsBackend +#' +#' @export setMethod("dataOrigin", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1026,6 +1055,8 @@ setMethod("dataOrigin", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics dataOrigin<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("dataOrigin", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1035,6 +1066,8 @@ setReplaceMethod("dataOrigin", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics dataStorage #' #' @rdname MsBackend +#' +#' @export setMethod("dataStorage", "MsBackend", function(object) { stop("Method 'dataStorage' is not implemented for ", class(object), ".") }) @@ -1044,6 +1077,8 @@ setMethod("dataStorage", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics dataStorage<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("dataStorage", "MsBackend", function(object, value) { stop("Method 'dataStorage' is not implemented for ", class(object), ".") }) @@ -1051,6 +1086,8 @@ setReplaceMethod("dataStorage", "MsBackend", function(object, value) { #' @exportMethod dropNaSpectraVariables #' #' @rdname MsBackend +#' +#' @export setMethod("dropNaSpectraVariables", "MsBackend", function(object) { svs <- spectraVariables(object) svs <- svs[!(svs %in% c("mz", "intensity"))] @@ -1069,6 +1106,8 @@ setMethod("dropNaSpectraVariables", "MsBackend", function(object) { #' @importMethodsFrom 
ProtGenerics filterAcquisitionNum #' #' @rdname MsBackend +#' +#' @export setMethod("filterAcquisitionNum", "MsBackend", function(object, n, file, ...) { stop("Not implemented for ", class(object), ".") }) @@ -1078,6 +1117,8 @@ setMethod("filterAcquisitionNum", "MsBackend", function(object, n, file, ...) { #' @importMethodsFrom ProtGenerics filterDataOrigin #' #' @rdname MsBackend +#' +#' @export setMethod("filterDataOrigin", "MsBackend", function(object, dataOrigin = character()) { if (length(dataOrigin)) { @@ -1093,6 +1134,8 @@ setMethod("filterDataOrigin", "MsBackend", #' @importMethodsFrom ProtGenerics filterDataStorage #' #' @rdname MsBackend +#' +#' @export setMethod("filterDataStorage", "MsBackend", function(object, dataStorage = character()) { if (length(dataStorage)) { @@ -1108,6 +1151,8 @@ setMethod("filterDataStorage", "MsBackend", #' @importMethodsFrom ProtGenerics filterEmptySpectra #' #' @rdname MsBackend +#' +#' @export setMethod("filterEmptySpectra", "MsBackend", function(object, ...) { if (!length(object)) return(object) object[as.logical(lengths(object))] @@ -1118,6 +1163,8 @@ setMethod("filterEmptySpectra", "MsBackend", function(object, ...) { #' @importMethodsFrom ProtGenerics filterIsolationWindow #' #' @rdname MsBackend +#' +#' @export setMethod("filterIsolationWindow", "MsBackend", function(object, mz = numeric(), ...) { if (length(mz)) { @@ -1134,6 +1181,8 @@ setMethod("filterIsolationWindow", "MsBackend", #' @importMethodsFrom ProtGenerics filterMsLevel #' #' @rdname MsBackend +#' +#' @export setMethod("filterMsLevel", "MsBackend", function(object, msLevel = integer()) { if (length(msLevel)) { @@ -1146,6 +1195,8 @@ setMethod("filterMsLevel", "MsBackend", #' @importMethodsFrom ProtGenerics filterPolarity #' #' @rdname MsBackend +#' +#' @export setMethod("filterPolarity", "MsBackend", function(object, polarity = integer()) { if (length(polarity)) @@ -1158,6 +1209,8 @@ setMethod("filterPolarity", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorMzRange #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorMzRange", "MsBackend", function(object, mz = numeric()) { if (length(mz)) { @@ -1170,6 +1223,8 @@ setMethod("filterPrecursorMzRange", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorMz #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorMz", "MsBackend", function(object, mz = numeric()) { filterPrecursorMzRange(object, mz) @@ -1180,6 +1235,8 @@ setMethod("filterPrecursorMz", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorMzValues #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorMzValues", "MsBackend", function(object, mz = numeric(), ppm = 20, tolerance = 0) { if (length(mz)) { @@ -1193,6 +1250,8 @@ setMethod("filterPrecursorMzValues", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorCharge #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorCharge", "MsBackend", function(object, z = integer()) { if (length(z)) { @@ -1206,6 +1265,8 @@ setMethod("filterPrecursorCharge", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorScan #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorScan", "MsBackend", function(object, acquisitionNum = integer(), f = dataOrigin(object)) { if (length(acquisitionNum) && length(f)) { @@ -1226,6 +1287,8 @@ setMethod("filterPrecursorScan", "MsBackend", #' @importFrom MsCoreUtils between #' #' @rdname MsBackend +#' +#' @export setMethod("filterRanges", "MsBackend", function(object, spectraVariables = 
character(), ranges = numeric(), match = c("all", "any")){ @@ -1267,6 +1330,8 @@ setMethod("filterRanges", "MsBackend", #' @importMethodsFrom ProtGenerics filterRt #' #' @rdname MsBackend +#' +#' @export setMethod("filterRt", "MsBackend", function(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) { if (length(rt)) { @@ -1284,6 +1349,8 @@ setMethod("filterRt", "MsBackend", #' @importFrom MsCoreUtils ppm #' #' @rdname MsBackend +#' +#' @export setMethod("filterValues", "MsBackend", function(object, spectraVariables = character(), values = numeric(), ppm = 0, tolerance = 0, match = c("all", "any")){ @@ -1329,6 +1396,8 @@ setMethod("filterValues", "MsBackend", #' @importMethodsFrom ProtGenerics intensity #' #' @rdname MsBackend +#' +#' @export setMethod("intensity", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1338,6 +1407,8 @@ setMethod("intensity", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics intensity<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("intensity", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1349,6 +1420,8 @@ setReplaceMethod("intensity", "MsBackend", function(object, value) { #' @importFrom MsCoreUtils vapply1d #' #' @rdname MsBackend +#' +#' @export setMethod("ionCount", "MsBackend", function(object) { vapply1d(intensity(object), sum, na.rm = TRUE) }) @@ -1359,6 +1432,8 @@ setMethod("ionCount", "MsBackend", function(object) { #' @importFrom MsCoreUtils vapply1l #' #' @rdname MsBackend +#' +#' @export setMethod("isCentroided", "MsBackend", function(object, ...) { vapply1l(peaksData(object), .peaks_is_centroided) }) @@ -1368,6 +1443,8 @@ setMethod("isCentroided", "MsBackend", function(object, ...) { #' @rdname MsBackend #' #' @importMethodsFrom S4Vectors isEmpty +#' +#' @export setMethod("isEmpty", "MsBackend", function(x) { stop("Not implemented for ", class(x), ".") }) @@ -1377,6 +1454,8 @@ setMethod("isEmpty", "MsBackend", function(x) { #' @importMethodsFrom ProtGenerics isolationWindowLowerMz #' #' @rdname MsBackend +#' +#' @export setMethod("isolationWindowLowerMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1386,6 +1465,8 @@ setMethod("isolationWindowLowerMz", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics isolationWindowLowerMz<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("isolationWindowLowerMz", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") @@ -1396,6 +1477,8 @@ setReplaceMethod("isolationWindowLowerMz", "MsBackend", function(object, #' @importMethodsFrom ProtGenerics isolationWindowTargetMz #' #' @rdname MsBackend +#' +#' @export setMethod("isolationWindowTargetMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1405,6 +1488,8 @@ setMethod("isolationWindowTargetMz", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics isolationWindowTargetMz<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("isolationWindowTargetMz", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") @@ -1415,6 +1500,8 @@ setReplaceMethod("isolationWindowTargetMz", "MsBackend", function(object, #' @importMethodsFrom ProtGenerics isolationWindowUpperMz #' #' @rdname MsBackend +#' +#' @export setMethod("isolationWindowUpperMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1424,6 +1511,8 @@ 
setMethod("isolationWindowUpperMz", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics isolationWindowUpperMz<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("isolationWindowUpperMz", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") @@ -1434,6 +1523,8 @@ setReplaceMethod("isolationWindowUpperMz", "MsBackend", function(object, #' @importMethodsFrom ProtGenerics isReadOnly #' #' @rdname MsBackend +#' +#' @export setMethod("isReadOnly", "MsBackend", function(object) { object@readonly }) @@ -1441,6 +1532,8 @@ setMethod("isReadOnly", "MsBackend", function(object) { #' @exportMethod length #' #' @rdname MsBackend +#' +#' @export setMethod("length", "MsBackend", function(x) { stop("Not implemented for ", class(x), ".") }) @@ -1450,15 +1543,26 @@ setMethod("length", "MsBackend", function(x) { #' @importMethodsFrom ProtGenerics msLevel #' #' @rdname MsBackend +#' +#' @export setMethod("msLevel", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) +#' @importMethodsFrom ProtGenerics msLevel<- +#' +#' @export +setReplaceMethod("msLevel", "MsBackend", function(object, value) { + stop("Not implemented for ", class(object), ".") +}) + #' @exportMethod mz #' #' @importMethodsFrom ProtGenerics mz #' #' @rdname MsBackend +#' +#' @export setMethod("mz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1468,6 +1572,8 @@ setMethod("mz", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics mz<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("mz", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1482,6 +1588,8 @@ setMethod("lengths", "MsBackend", function(x, use.names = FALSE) { #' @importMethodsFrom ProtGenerics polarity #' #' @rdname MsBackend +#' +#' @export setMethod("polarity", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1491,6 +1599,8 @@ setMethod("polarity", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics polarity<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("polarity", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1500,6 +1610,8 @@ setReplaceMethod("polarity", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics precScanNum #' #' @rdname MsBackend +#' +#' @export setMethod("precScanNum", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1509,6 +1621,8 @@ setMethod("precScanNum", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics precursorCharge #' #' @rdname MsBackend +#' +#' @export setMethod("precursorCharge", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1518,6 +1632,8 @@ setMethod("precursorCharge", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics precursorIntensity #' #' @rdname MsBackend +#' +#' @export setMethod("precursorIntensity", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1527,6 +1643,8 @@ setMethod("precursorIntensity", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics precursorMz #' #' @rdname MsBackend +#' +#' @export setMethod("precursorMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1534,7 +1652,10 @@ setMethod("precursorMz", "MsBackend", function(object) { #' @exportMethod peaksData<- #' #' @importMethodsFrom ProtGenerics 
peaksData<- +#' #' @rdname MsBackend +#' +#' @export setReplaceMethod("peaksData", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1542,6 +1663,8 @@ setReplaceMethod("peaksData", "MsBackend", function(object, value) { #' @exportMethod reset #' #' @rdname MsBackend +#' +#' @export setMethod("reset", "MsBackend", function(object) { object }) @@ -1551,6 +1674,8 @@ setMethod("reset", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics rtime #' #' @rdname MsBackend +#' +#' @export setMethod("rtime", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1560,6 +1685,8 @@ setMethod("rtime", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics rtime<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("rtime", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1569,6 +1696,8 @@ setReplaceMethod("rtime", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics scanIndex #' #' @rdname MsBackend +#' +#' @export setMethod("scanIndex", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1576,6 +1705,8 @@ setMethod("scanIndex", "MsBackend", function(object) { #' @exportMethod selectSpectraVariables #' #' @rdname MsBackend +#' +#' @export setMethod( "selectSpectraVariables", "MsBackend", function(object, spectraVariables = spectraVariables(object)) { @@ -1587,6 +1718,8 @@ setMethod( #' @importMethodsFrom ProtGenerics smoothed #' #' @rdname MsBackend +#' +#' @export setMethod("smoothed", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1598,6 +1731,8 @@ setMethod("smoothed", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics smoothed<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("smoothed", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1605,6 +1740,8 @@ setReplaceMethod("smoothed", "MsBackend", function(object, value) { #' @exportMethod spectraData #' #' @rdname MsBackend +#' +#' @export setMethod( "spectraData", "MsBackend", function(object, columns = spectraVariables(object)) { @@ -1614,6 +1751,8 @@ setMethod( #' @exportMethod spectraData<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("spectraData", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1623,6 +1762,8 @@ setReplaceMethod("spectraData", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics spectraNames #' #' @rdname MsBackend +#' +#' @export setMethod("spectraNames", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1632,6 +1773,8 @@ setMethod("spectraNames", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics spectraNames<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("spectraNames", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1641,6 +1784,8 @@ setReplaceMethod("spectraNames", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics spectraVariables #' #' @rdname MsBackend +#' +#' @export setMethod("spectraVariables", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1650,6 +1795,8 @@ setMethod("spectraVariables", "MsBackend", function(object) { #' @importMethodsFrom S4Vectors split #' #' @rdname MsBackend +#' +#' @export setMethod("split", "MsBackend", function(x, f, drop = 
FALSE, ...) { split.default(x, f, drop = drop, ...) }) @@ -1659,6 +1806,8 @@ setMethod("split", "MsBackend", function(x, f, drop = FALSE, ...) { #' @exportMethod supportsSetBackend #' #' @rdname MsBackend +#' +#' @export setMethod("supportsSetBackend", "MsBackend", function(object, ...) { !isReadOnly(object) }) @@ -1668,6 +1817,8 @@ setMethod("supportsSetBackend", "MsBackend", function(object, ...) { #' @importMethodsFrom ProtGenerics tic #' #' @rdname MsBackend +#' +#' @export setMethod("tic", "MsBackend", function(object, initial = TRUE) { stop("Not implemented for ", class(object), ".") }) @@ -1675,6 +1826,8 @@ setMethod("tic", "MsBackend", function(object, initial = TRUE) { #' @exportMethod [ #' #' @rdname MsBackend +#' +#' @export setMethod("[", "MsBackend", function(x, i, j, ..., drop = FALSE) { stop("Not implemented for ", class(x), ".") }) @@ -1682,6 +1835,8 @@ setMethod("[", "MsBackend", function(x, i, j, ..., drop = FALSE) { #' @exportMethod $ #' #' @rdname MsBackend +#' +#' @export setMethod("$", "MsBackend", function(x, name) { stop("Not implemented for ", class(x), ".") }) @@ -1689,6 +1844,8 @@ setMethod("$", "MsBackend", function(x, name) { #' @exportMethod $<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("$", "MsBackend", function(x, name, value) { stop("Not implemented for ", class(x), ".") }) @@ -1696,6 +1853,8 @@ setReplaceMethod("$", "MsBackend", function(x, name, value) { #' @exportMethod [[ #' #' @rdname MsBackend +#' +#' @export setMethod("[[", "MsBackend", function(x, i, j, ...) { if (!is.character(i)) stop("'i' is supposed to be a character defining the spectra ", @@ -1708,6 +1867,8 @@ setMethod("[[", "MsBackend", function(x, i, j, ...) { #' @exportMethod [[<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("[[", "MsBackend", function(x, i, j, ..., value) { if (!is.character(i)) stop("'i' is supposed to be a character defining the spectra ", @@ -1722,6 +1883,8 @@ setReplaceMethod("[[", "MsBackend", function(x, i, j, ..., value) { #' @importMethodsFrom ProtGenerics uniqueMsLevels #' #' @rdname MsBackend +#' +#' @export setMethod("uniqueMsLevels", "MsBackend", function(object, ...) { unique(msLevel(object)) }) @@ -1729,6 +1892,8 @@ setMethod("uniqueMsLevels", "MsBackend", function(object, ...) { #' @exportMethod dataStorageBasePath #' #' @rdname MsBackend +#' +#' @export setMethod("dataStorageBasePath", "MsBackend", function(object) { NA_character_ }) @@ -1736,6 +1901,8 @@ setMethod("dataStorageBasePath", "MsBackend", function(object) { #' @exportMethod dataStorageBasePath<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod( "dataStorageBasePath", "MsBackend", function(object, value) { warning(class(object)[1L], " does not support changing", diff --git a/R/MsBackendDataFrame.R b/R/MsBackendDataFrame.R index 041be31e..c13052b7 100644 --- a/R/MsBackendDataFrame.R +++ b/R/MsBackendDataFrame.R @@ -279,8 +279,6 @@ setMethod("msLevel", "MsBackendDataFrame", function(object, ...) 
{ }) #' @rdname hidden_aliases -#' -#' @importMethodsFrom ProtGenerics msLevel<- setReplaceMethod("msLevel", "MsBackendDataFrame", function(object, value) { if (!is.integer(value) && is.numeric(value)) value <- as.integer(value) diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index 0bf98b0a..d6752031 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -21,6 +21,7 @@ \alias{dataStorageBasePath,MsBackendMzR-method} \alias{dataStorageBasePath<-} \alias{dataStorageBasePath<-,MsBackendMzR-method} +\alias{msLeveL<-,MsBackend-method} \alias{backendBpparam,MsBackend-method} \alias{backendInitialize,MsBackend-method} \alias{backendMerge,list-method} @@ -720,6 +721,7 @@ number of spectra). For empty spectra, \code{0} is returned. \item \code{msLevel()}: gets the spectra's MS level. Returns an \code{integer} vector (of length equal to the number of spectra) with the MS level for each spectrum (or \code{NA_integer_} if not available). +\item \verb{msLevel<-}: replaces the spectra's MS level. \item \code{mz()}: gets the mass-to-charge ratios (m/z) from the spectra. Returns a \code{\link[=NumericList]{NumericList()}} or length equal to the number of spectra, each element a \code{numeric} vector with the m/z values of From 476941d44f975e1ea57082db8ffc1620a7e8ab8a Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 25 Sep 2024 09:05:20 +0200 Subject: [PATCH 28/41] ci: run GHA only on push --- .github/workflows/check-bioc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index b0c1e8df..85b29ffa 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -22,7 +22,8 @@ on: push: - pull_request: + paths-ignore: + - 'README.md' name: R-CMD-check-bioc From 21383da49af5fe121f2c56653fbaf09f4fed274b Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 25 Sep 2024 09:07:00 +0200 Subject: [PATCH 29/41] docs: add missing documentation --- R/MsBackend.R | 2 ++ man/MsBackend.Rd | 3 +++ 2 files changed, 5 insertions(+) diff --git a/R/MsBackend.R b/R/MsBackend.R index 09741cd0..e0d67630 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -1551,6 +1551,8 @@ setMethod("msLevel", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics msLevel<- #' +#' @rdname MsBackend +#' #' @export setReplaceMethod("msLevel", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index d6752031..2874d082 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -69,6 +69,7 @@ \alias{isReadOnly,MsBackend-method} \alias{length,MsBackend-method} \alias{msLevel,MsBackend-method} +\alias{msLevel<-,MsBackend-method} \alias{mz,MsBackend-method} \alias{mz<-,MsBackend-method} \alias{lengths,MsBackend-method} @@ -214,6 +215,8 @@ \S4method{msLevel}{MsBackend}(object) +\S4method{msLevel}{MsBackend}(object) <- value + \S4method{mz}{MsBackend}(object) \S4method{mz}{MsBackend}(object) <- value From 20ea340fd6c13b9412554741f21406642fbea7cb Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Thu, 26 Sep 2024 16:57:35 +0200 Subject: [PATCH 30/41] feat: add new extractByIndex method - Add an `extractByIndex()` method to subset/extract content from a `MsBackend`. Using this method in contrast to `[` avoids errors with some parallel processing setups in which a `[` method for a backend might not be found (see https://github.com/rformassspectrometry/MsBackendMetaboLights/issues/5). 
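To make the rationale above concrete, here is a minimal sketch of how a backend package could support the new generic, assuming a `Spectra` version that already defines `extractByIndex()` (>= 1.15.10) and the `MsCoreUtils::i2index()` helper; the `MsBackendExample` class and its `spectraData` slot are hypothetical and serve only as illustration.

```r
## Hypothetical minimal backend used only to illustrate extractByIndex().
library(Spectra)

setClass("MsBackendExample",
    contains = "MsBackend",
    slots = c(spectraData = "data.frame"))

setMethod("length", "MsBackendExample", function(x) nrow(x@spectraData))

## extractByIndex() subsets by an integer index; duplicated indices and
## arbitrary ordering have to be supported.
setMethod("extractByIndex", c("MsBackendExample", "ANY"), function(object, i) {
    object@spectraData <- object@spectraData[i, , drop = FALSE]
    object
})

## `[` can translate logical (or character) input into an integer index and
## delegate to extractByIndex().
setMethod("[", "MsBackendExample", function(x, i, j, ..., drop = FALSE) {
    extractByIndex(x, MsCoreUtils::i2index(i, length = length(x)))
})
```

With such a method registered, `Spectra` can subset or reorder the backend through `extractByIndex()` (as `applyProcessing()` and the `Spectra` `[` method do in this patch) without depending on S4 dispatch of `[` being available in parallel worker processes.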
--- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 5 +++++ R/AllGenerics.R | 2 ++ R/MsBackend.R | 27 +++++++++++++++++++++++ R/MsBackendCached.R | 9 ++++++++ R/MsBackendDataFrame.R | 10 +++++++++ R/MsBackendHdf5Peaks.R | 14 ++++++++++++ R/MsBackendMemory.R | 12 ++++++++++ R/Spectra-functions.R | 5 +++-- R/Spectra.R | 11 ++++++---- man/MsBackend.Rd | 21 ++++++++++++++++-- man/MsBackendCached.Rd | 7 ++++-- man/hidden_aliases.Rd | 9 ++++++++ tests/testthat/test_MsBackend.R | 3 +++ tests/testthat/test_MsBackendCached.R | 12 ++++++++++ tests/testthat/test_MsBackendDataFrame.R | 22 +++++++++++++++++++ tests/testthat/test_MsBackendHdf5Peaks.R | 4 ++++ tests/testthat/test_MsBackendMemory.R | 28 +++++++++++++++++++++++- tests/testthat/test_MsBackendMzR.R | 2 ++ vignettes/MsBackend.Rmd | 22 +++++++++++++++++++ 21 files changed, 216 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0270d5db..91db6af4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.9 +Version: 1.15.10 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NAMESPACE b/NAMESPACE index 7e79b6c9..e3e4970c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -86,6 +86,7 @@ exportMethods(dropNaSpectraVariables) exportMethods(entropy) exportMethods(estimatePrecursorIntensity) exportMethods(export) +exportMethods(extractByIndex) exportMethods(filterAcquisitionNum) exportMethods(filterDataOrigin) exportMethods(filterDataStorage) diff --git a/NEWS.md b/NEWS.md index 828350b4..3cc44fb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # Spectra 1.15 +## Changes in 1.15.10 + +- Add new `extractSpectra()` generic and implementation for `MsBackend`. Fixes + [issue #5](https://github.com/rformassspectrometry/MsBackendMetaboLights/issues/5). + ## Changes in 1.15.9 - Restructure and reorganize documentation for `Spectra`. diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 5ec6d054..d02aa13c 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -19,6 +19,8 @@ setGeneric("entropy", function(object, ...) standardGeneric("entropy")) setGeneric("export", function(object, ...) standardGeneric("export")) +setGeneric("extractByIndex", function(object, i) + standardGeneric("extractByIndex")) setGeneric("filterFourierTransformArtefacts", function(object, ...) standardGeneric("filterFourierTransformArtefacts")) setGeneric("neutralLoss", function(object, param, ...) diff --git a/R/MsBackend.R b/R/MsBackend.R index e0d67630..d7216c53 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -328,6 +328,18 @@ #' *mzML* or *mzXML* format. See the documentation for the `MsBackendMzR` #' class below for more information. #' +#' - `extractByIndex()`: function to subset a backend to selected elements +#' defined by the provided index. Similar to `[`, this method should allow +#' extracting (or to subset) the data in any order. In contrast to `[`, +#' however, `i` is expected to be an `integer` (while `[` should also +#' support `logical` and eventually `character`). While being apparently +#' redundant to `[`, this methods avoids package namespace errors/problems +#' that can result in implementations of `[` being not found by R (which +#' can happen sometimes in parallel processing using the [SnowParam()]). This +#' method is used internally by `Spectra` to extract/subset its backend. 
+#' Implementation is optional, as the default implementation for `MsBackend` +#' will use `[`. +#' #' - `filterAcquisitionNum()`: filters the object keeping only spectra matching #' the provided acquisition numbers (argument `n`). If `dataOrigin` or #' `dataStorage` is also provided, `object` is subsetted to the spectra with @@ -1101,6 +1113,21 @@ setMethod("dropNaSpectraVariables", "MsBackend", function(object) { selectSpectraVariables(object, c(svs[keep], "mz", "intensity")) }) +#' @rdname MsBackend +#' +#' @export +setMethod("extractByIndex", c("MsBackend", "ANY"), function(object, i) { + object[i = i] +}) + +#' @rdname MsBackend +#' +#' @export +setMethod("extractByIndex", c("MsBackend", "missing"), function(object, i) { + message("extractByIndex,MsBackend,missing") + object +}) + #' @exportMethod filterAcquisitionNum #' #' @importMethodsFrom ProtGenerics filterAcquisitionNum diff --git a/R/MsBackendCached.R b/R/MsBackendCached.R index 5628037d..caf7f743 100644 --- a/R/MsBackendCached.R +++ b/R/MsBackendCached.R @@ -294,6 +294,15 @@ setMethod("dataStorage", "MsBackendCached", function(object) { rep("", length(object)) }) +#' @rdname MsBackendCached +setMethod("extractByIndex", c("MsBackendCached", "ANY"), + function(object, i) { + slot(object, "localData", check = FALSE) <- + object@localData[i, , drop = FALSE] + object@nspectra <- nrow(object@localData) + object +}) + #' @rdname MsBackendCached setMethod("length", "MsBackendCached", function(x) { x@nspectra diff --git a/R/MsBackendDataFrame.R b/R/MsBackendDataFrame.R index c13052b7..c89e192d 100644 --- a/R/MsBackendDataFrame.R +++ b/R/MsBackendDataFrame.R @@ -181,6 +181,14 @@ setReplaceMethod("dataStorage", "MsBackendDataFrame", function(object, value) { object }) +#' @rdname hidden_aliases +setMethod("extractByIndex", c("MsBackendDataFrame", "ANY"), + function(object, i) { + slot(object, "spectraData", check = FALSE) <- + extractROWS(object@spectraData, i) + object + }) + #' @rdname hidden_aliases setMethod("intensity", "MsBackendDataFrame", function(object) { if (any(colnames(object@spectraData) == "intensity")) @@ -544,6 +552,8 @@ setReplaceMethod("$", "MsBackendDataFrame", function(x, name, value) { #' @importFrom MsCoreUtils i2index #' #' @rdname hidden_aliases +#' +#' @export setMethod("[", "MsBackendDataFrame", function(x, i, j, ..., drop = FALSE) { .subset_backend_data_frame(x, i) }) diff --git a/R/MsBackendHdf5Peaks.R b/R/MsBackendHdf5Peaks.R index e5482803..ebcb8ea2 100644 --- a/R/MsBackendHdf5Peaks.R +++ b/R/MsBackendHdf5Peaks.R @@ -291,6 +291,20 @@ setMethod("[", "MsBackendHdf5Peaks", function(x, i, j, ..., drop = FALSE) { x }) +#' @rdname hidden_aliases +#' +#' @aliases [,MsBackendHdf5Peaks-method +setMethod("extractByIndex", c("MsBackendHdf5Peaks", "ANY"), + function(object, i) { + fls <- unique(object@spectraData$dataStorage) + slot(object, "spectraData", check = FALSE) <- + extractROWS(object@spectraData, i) + slot(object, "modCount", check = FALSE) <- + object@modCount[match( + unique(object@spectraData$dataStorage), fls)] + object +}) + #' @rdname hidden_aliases setMethod("backendMerge", "MsBackendHdf5Peaks", function(object, ...) 
{ object <- unname(c(object, ...)) diff --git a/R/MsBackendMemory.R b/R/MsBackendMemory.R index d38722ab..594fc799 100644 --- a/R/MsBackendMemory.R +++ b/R/MsBackendMemory.R @@ -192,6 +192,18 @@ setReplaceMethod("dataStorage", "MsBackendMemory", function(object, value) { object }) +#' @rdname hidden_aliases +setMethod("extractByIndex", c("MsBackendMemory", "ANY"), function(object, i) { + slot(object, "spectraData", check = FALSE) <- + object@spectraData[i, , drop = FALSE] + if (length(object@peaksData)) + slot(object, "peaksData", check = FALSE) <- object@peaksData[i] + if (length(object@peaksDataFrame)) + slot(object, "peaksDataFrame", check = FALSE) <- + object@peaksDataFrame[i] + object +}) + #' @rdname hidden_aliases setMethod("intensity", "MsBackendMemory", function(object) { if (length(object)) { diff --git a/R/Spectra-functions.R b/R/Spectra-functions.R index 033a2b2d..99014163 100644 --- a/R/Spectra-functions.R +++ b/R/Spectra-functions.R @@ -236,8 +236,9 @@ applyProcessing <- function(object, f = processingChunkFactor(object), }, queue = queue, pv = pv, svars = svars, BPPARAM = BPPARAM) bknds <- backendMerge(bknds) if (is.unsorted(f)) - bknds <- bknds[order(unlist(split(seq_along(bknds), f), - use.names = FALSE))] + bknds <- extractByIndex( + bknds, order(unlist(split(seq_along(bknds), f), + use.names = FALSE))) object@backend <- bknds } else { if (length(svars)) diff --git a/R/Spectra.R b/R/Spectra.R index 045cf88a..7564b0a4 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -515,8 +515,9 @@ setMethod( ## That below ensures the backend is returned in its original ## order - unsplit does unfortunately not work. if (is.unsorted(f)) - bknds <- bknds[order(unlist(split(seq_along(bknds), f), - use.names = FALSE))] + bknds <- extractByIndex( + bknds, order(unlist(split(seq_along(bknds), f), + use.names = FALSE))) } else { bknds <- backendInitialize( backend, data = spectraData(object@backend), ...) @@ -2415,7 +2416,8 @@ setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { stop("Subsetting 'Spectra' by columns is not (yet) supported") if (missing(i)) return(x) - slot(x, "backend", check = FALSE) <- x@backend[i = i] + slot(x, "backend", check = FALSE) <- extractByIndex( + x@backend, i2index(i, length(x))) x }) @@ -2439,7 +2441,8 @@ setMethod("filterAcquisitionNum", "Spectra", function(object, n = integer(), #' @rdname filterMsLevel setMethod("filterEmptySpectra", "Spectra", function(object) { - object@backend <- object@backend[as.logical(lengths(object))] + object@backend <- extractByIndex(object@backend, + which(as.logical(lengths(object)))) object@processing <- .logging(object@processing, "Filter: removed empty spectra.") object diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index 2874d082..269b7cdd 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -40,6 +40,8 @@ \alias{dataStorage,MsBackend-method} \alias{dataStorage<-,MsBackend-method} \alias{dropNaSpectraVariables,MsBackend-method} +\alias{extractByIndex,MsBackend,ANY-method} +\alias{extractByIndex,MsBackend,missing-method} \alias{filterAcquisitionNum,MsBackend-method} \alias{filterDataOrigin,MsBackend-method} \alias{filterDataStorage,MsBackend-method} @@ -145,6 +147,10 @@ \S4method{dropNaSpectraVariables}{MsBackend}(object) +\S4method{extractByIndex}{MsBackend,ANY}(object, i) + +\S4method{extractByIndex}{MsBackend,missing}(object, i) + \S4method{filterAcquisitionNum}{MsBackend}(object, n, file, ...) 
\S4method{filterDataOrigin}{MsBackend}(object, dataOrigin = character()) @@ -316,6 +322,8 @@ backend provides.} \item{value}{replacement value for \verb{<-} methods. See individual method description or expected data type.} +\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the object.} + \item{n}{for \code{filterAcquisitionNum()}: \code{integer} with the acquisition numbers to filter for.} @@ -411,8 +419,6 @@ reported total ion current should be reported, or whether the total ion current should be (re)calculated on the actual data (\code{initial = FALSE}).} -\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the object.} - \item{j}{For \code{[}: not supported.} \item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return @@ -604,6 +610,17 @@ queue) are applied prior to export - this would not be possible with only a for the \code{MsBackendMzR} backend that supports export of the data in \emph{mzML} or \emph{mzXML} format. See the documentation for the \code{MsBackendMzR} class below for more information. +\item \code{extractByIndex()}: function to subset a backend to selected elements +defined by the provided index. Similar to \code{[}, this method should allow +extracting (or to subset) the data in any order. In contrast to \code{[}, +however, \code{i} is expected to be an \code{integer} (while \code{[} should also +support \code{logical} and eventually \code{character}). While being apparently +redundant to \code{[}, this methods avoids package namespace errors/problems +that can result in implementations of \code{[} being not found by R (which +can happen sometimes in parallel processing using the \code{\link[=SnowParam]{SnowParam()}}). This +method is used internally by \code{Spectra} to extract/subset its backend. +Implementation is optional, as the default implementation for \code{MsBackend} +will use \code{[}. \item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or \code{dataStorage} is also provided, \code{object} is subsetted to the spectra with diff --git a/man/MsBackendCached.Rd b/man/MsBackendCached.Rd index e65e41e9..ae8c6687 100644 --- a/man/MsBackendCached.Rd +++ b/man/MsBackendCached.Rd @@ -5,6 +5,7 @@ \alias{MsBackendCached-class} \alias{backendInitialize,MsBackendCached-method} \alias{dataStorage,MsBackendCached-method} +\alias{extractByIndex,MsBackendCached,ANY-method} \alias{length,MsBackendCached-method} \alias{spectraVariables,MsBackendCached-method} \alias{spectraData,MsBackendCached-method} @@ -57,6 +58,8 @@ MsBackendCached() \S4method{dataStorage}{MsBackendCached}(object) +\S4method{extractByIndex}{MsBackendCached,ANY}(object, i) + \S4method{length}{MsBackendCached}(x) \S4method{spectraVariables}{MsBackendCached}(object) @@ -150,6 +153,8 @@ variables to keep.} \item{...}{ignored} +\item{i}{For \code{[}: \code{integer} with the indices to subset the object.} + \item{x}{A \code{MsBackendCached} object.} \item{columns}{For \code{spectraData()}: \code{character} with the names of the spectra @@ -158,8 +163,6 @@ variables to retrieve.} \item{value}{replacement value for \verb{<-} methods. 
See individual method description or expected data type.} -\item{i}{For \code{[}: \code{integer} with the indices to subset the object.} - \item{j}{For \code{[}: ignored.} \item{drop}{For \code{[}: not considered.} diff --git a/man/hidden_aliases.Rd b/man/hidden_aliases.Rd index 1249a50f..3e70d26c 100644 --- a/man/hidden_aliases.Rd +++ b/man/hidden_aliases.Rd @@ -20,6 +20,7 @@ \alias{dataOrigin<-,MsBackendDataFrame-method} \alias{dataStorage,MsBackendDataFrame-method} \alias{dataStorage<-,MsBackendDataFrame-method} +\alias{extractByIndex,MsBackendDataFrame,ANY-method} \alias{intensity,MsBackendDataFrame-method} \alias{intensity<-,MsBackendDataFrame-method} \alias{isEmpty,MsBackendDataFrame-method} @@ -75,6 +76,7 @@ \alias{spectraData<-,MsBackendHdf5Peaks-method} \alias{$<-,MsBackendHdf5Peaks-method} \alias{[,MsBackendHdf5Peaks-method} +\alias{extractByIndex,MsBackendHdf5Peaks,ANY-method} \alias{backendMerge,MsBackendHdf5Peaks-method} \alias{show,MsBackendMemory-method} \alias{backendMerge,MsBackendMemory-method} @@ -87,6 +89,7 @@ \alias{dataOrigin<-,MsBackendMemory-method} \alias{dataStorage,MsBackendMemory-method} \alias{dataStorage<-,MsBackendMemory-method} +\alias{extractByIndex,MsBackendMemory,ANY-method} \alias{intensity,MsBackendMemory-method} \alias{intensity<-,MsBackendMemory-method} \alias{ionCount,MsBackendMemory-method} @@ -185,6 +188,8 @@ \S4method{dataStorage}{MsBackendDataFrame}(object) <- value +\S4method{extractByIndex}{MsBackendDataFrame,ANY}(object, i) + \S4method{intensity}{MsBackendDataFrame}(object) \S4method{intensity}{MsBackendDataFrame}(object) <- value @@ -309,6 +314,8 @@ \S4method{[}{MsBackendHdf5Peaks}(x, i, j, ..., drop = FALSE) +\S4method{extractByIndex}{MsBackendHdf5Peaks,ANY}(object, i) + \S4method{backendMerge}{MsBackendHdf5Peaks}(object, ...) 
\S4method{show}{MsBackendMemory}(object) @@ -333,6 +340,8 @@ \S4method{dataStorage}{MsBackendMemory}(object) <- value +\S4method{extractByIndex}{MsBackendMemory,ANY}(object, i) + \S4method{intensity}{MsBackendMemory}(object) \S4method{intensity}{MsBackendMemory}(object) <- value diff --git a/tests/testthat/test_MsBackend.R b/tests/testthat/test_MsBackend.R index d80bd757..cf36605e 100644 --- a/tests/testthat/test_MsBackend.R +++ b/tests/testthat/test_MsBackend.R @@ -56,6 +56,9 @@ test_that("MsBackend methods throw errors", { expect_error(dm[1], "implemented for") expect_error(dm$a, "implemented for") expect_error(dm$a <- "a", "implemented for") + expect_error(extractByIndex(dm, 1), "implemented for") + + expect_equal(extractByIndex(dm), dm) }) test_that("reset,MsBackend works", { diff --git a/tests/testthat/test_MsBackendCached.R b/tests/testthat/test_MsBackendCached.R index 6ff1b7ee..86bd8639 100644 --- a/tests/testthat/test_MsBackendCached.R +++ b/tests/testthat/test_MsBackendCached.R @@ -87,12 +87,24 @@ test_that("[,MsBackendCached works", { res <- be[c(1, 4, 3), ] expect_true(length(res) == 3) expect_true(nrow(res@localData) == 3) + res_2 <- extractByIndex(be, c(1, 4, 3)) + expect_equal(res, res_2) df <- data.frame(msLevel = 1L, b = 1:6) be <- backendInitialize(be, data = df) res <- be[c(6, 1, 3)] expect_true(length(res) == 3) expect_equal(res@localData$b, c(6, 1, 3)) + res_2 <- extractByIndex(be, c(6, 1, 3)) + expect_equal(res, res_2) + + res <- be[c(6, 1, 3, 1)] + expect_true(length(res) == 4) + expect_equal(res@localData$b, c(6, 1, 3, 1)) + res_2 <- extractByIndex(be, c(6, 1, 3, 1)) + expect_equal(res, res_2) + + expect_equal(extractByIndex(be), be) }) test_that("$,MsBackendCached works", { diff --git a/tests/testthat/test_MsBackendDataFrame.R b/tests/testthat/test_MsBackendDataFrame.R index 4b41d6d0..e5de3662 100644 --- a/tests/testthat/test_MsBackendDataFrame.R +++ b/tests/testthat/test_MsBackendDataFrame.R @@ -576,24 +576,42 @@ test_that("show,MsBackendDataFrame works", { test_that("[,MsBackendDataFrame works", { be <- MsBackendDataFrame() expect_error(be[1]) + + expect_equal(extractByIndex(be), be) + df <- DataFrame(scanIndex = 1:2, a = "a", b = "b") be <- backendInitialize(be, df) res <- be[1] expect_true(validObject(res)) expect_equal(be@spectraData[1, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 1) + expect_equal(res, res_2) res <- be[2] expect_true(validObject(res)) expect_equal(be@spectraData[2, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 2) + expect_equal(res, res_2) res <- be[2:1] expect_true(validObject(res)) expect_equal(be@spectraData[2:1, ], res@spectraData) + res_2 <- extractByIndex(be, 2:1) + expect_equal(res, res_2) + + res <- be[c(2, 1, 2)] + expect_equal(res$scanIndex, c(2, 1, 2)) + res_2 <- extractByIndex(be, c(2, 1, 2)) + expect_equal(res, res_2) res <- be[c(FALSE, FALSE)] expect_true(validObject(res)) expect_true(length(res) == 0) + res_2 <- extractByIndex(be, integer()) + expect_equal(res, res_2) res <- be[c(FALSE, TRUE)] expect_true(validObject(res)) expect_equal(be@spectraData[2, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 2) + expect_equal(res, res_2) expect_error(be[TRUE], "match the length of") expect_error(be["a"], "does not have names") @@ -606,11 +624,15 @@ test_that("[,MsBackendDataFrame works", { expect_true(validObject(res)) expect_equal(dataStorage(res), "2") expect_equal(res@spectraData$file, "b") + res_2 <- extractByIndex(be, 3) + expect_equal(res, res_2) res <- be[c(3, 1)] expect_true(validObject(res)) 
expect_equal(dataStorage(res), c("2", "1")) expect_equal(res@spectraData$file, c("b", "a")) + res_2 <- extractByIndex(be, c(3, 1)) + expect_equal(res, res_2) }) test_that("selectSpectraVariables,MsBackendDataFrame works", { diff --git a/tests/testthat/test_MsBackendHdf5Peaks.R b/tests/testthat/test_MsBackendHdf5Peaks.R index b7afdf37..3604d895 100644 --- a/tests/testthat/test_MsBackendHdf5Peaks.R +++ b/tests/testthat/test_MsBackendHdf5Peaks.R @@ -334,12 +334,16 @@ test_that("[,MsBackendHdf5Peaks works", { expect_identical(peaksData(res), sciex_pks[idx]) expect_identical(rtime(res), rtime(sciex_mzr)[idx]) expect_identical(msLevel(res), msLevel(sciex_mzr)[idx]) + res_2 <- extractByIndex(be, idx) + expect_equal(res, res_2) idx <- dataStorage(be) == fls[2] res <- be[idx, ] expect_true(validObject(res)) expect_true(all(dataStorage(res) == fls[2])) expect_identical(peaksData(res), sciex_pks[idx]) + res_2 <- extractByIndex(be, idx) + expect_equal(res, res_2) }) test_that("backendMerge,MsBackendHdf5Peaks works", { diff --git a/tests/testthat/test_MsBackendMemory.R b/tests/testthat/test_MsBackendMemory.R index 59776143..bb3c9973 100644 --- a/tests/testthat/test_MsBackendMemory.R +++ b/tests/testthat/test_MsBackendMemory.R @@ -501,41 +501,67 @@ test_that("$<-,MsBackendMemory works", { test_that("[,MsBackendMemory works", { be <- new("MsBackendMemory") + res <- extractByIndex(be) + expect_equal(res, be) + df <- data.frame(scanIndex = 1:2, a = "a", b = "b") be <- backendInitialize(be, df) res <- be[1] expect_true(validObject(res)) expect_equal(be@spectraData[1, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 1) + expect_equal(res, res_2) + res <- be[2] expect_true(validObject(res)) expect_equal(be@spectraData[2, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 2) + expect_equal(res, res_2) + res <- be[2:1] expect_true(validObject(res)) expect_equal(be@spectraData[2:1, ], res@spectraData) + res_2 <- extractByIndex(be, 2:1) + expect_equal(res, res_2) res <- be[c(FALSE, FALSE)] expect_true(validObject(res)) expect_true(length(res) == 0) + res_2 <- extractByIndex(be, integer()) + expect_equal(res, res_2) + res <- be[c(FALSE, TRUE)] expect_true(validObject(res)) expect_equal(be@spectraData[2, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 2) + expect_equal(res, res_2) expect_error(be[TRUE], "match the length of") expect_error(be["a"], "names") df <- data.frame(scanIndex = c(1L, 2L, 1L, 2L), - file = c("a", "a", "b", "b")) + file = c("a", "a", "b", "b"), + idx = 1:4) be <- backendInitialize(be, df) dataStorage(be) <- c("1", "1", "2", "2") res <- be[3] expect_true(validObject(res)) expect_equal(dataStorage(res), "2") expect_equal(res@spectraData$file, "b") + res_2 <- extractByIndex(be, 3) + expect_equal(res, res_2) res <- be[c(3, 1)] expect_true(validObject(res)) expect_equal(dataStorage(res), c("2", "1")) expect_equal(res@spectraData$file, c("b", "a")) + res_2 <- extractByIndex(be, c(3, 1)) + expect_equal(res, res_2) + + res <- be[c(3, 1, 3)] + expect_equal(res$idx, c(3, 1, 3)) + res_2 <- extractByIndex(be, c(3, 1, 3)) + expect_equal(res, res_2) }) test_that("split,MsBackendMemory works", { diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index d051b8e6..36de14c4 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -474,6 +474,8 @@ test_that("[,MsBackendMzR works", { expect_equal(length(tmp), 13) expect_equal(tmp@spectraData$scanIndex, 13:25) expect_true(all(is.na(smoothed(tmp)))) + tmp_2 <- 
extractByIndex(sciex_mzr, 13:25) + expect_equal(tmp, tmp_2) ints <- intensity(tmp) spd <- spectraData(tmp) diff --git a/vignettes/MsBackend.Rmd b/vignettes/MsBackend.Rmd index a6423e63..ff0269c9 100644 --- a/vignettes/MsBackend.Rmd +++ b/vignettes/MsBackend.Rmd @@ -602,6 +602,28 @@ a <- be[c(2, 2, 2)] spectraData(a) ``` +In addition to the `[` method it is also suggested to implement a +`extractByIndex()` method. Similar to `[`, this method should extract elements +from, or subset, a backend, but it expects an integer vector with the indices as +second parameter. Hence, this method does not require conversion of +e.g. `logical` to `integer`. Also, this method helps avoiding namespace issues +sometimes encountered with parallel processing using the `SnowParam` when the +implementation of `[` for certain backends are not found. + +Data analysis methods on `Spectra` objects will use this method for most +operations. Implementation of this method is optional, since the default +implementation for `MsBackend` will fall back to `[`. Below we implement this +method for our example backend. + +```{r} +setMethod("extractByIndex", c("MsBackendTest", "ANY"), function(object, i) { + x@spectraVars <- x@spectraVars[i, ] + x@mz <- x@mz[i] + x@intensity <- x@intensity[i] + x +}) +``` + ### `backendMerge()` From 37ef79bcc104d0777db5f694628586f1bffc2bd2 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Fri, 27 Sep 2024 08:54:36 +0200 Subject: [PATCH 31/41] refactor: add `extractByIndex` method --- R/MsBackend.R | 13 ++-- R/MsBackendCached.R | 2 +- R/MsBackendDataFrame.R | 2 +- .../test_MsBackend/test_spectra_subsetting.R | 18 ++++++ man/MsBackend.Rd | 8 ++- tests/testthat/test_MsBackend.R | 2 - vignettes/MsBackend.Rmd | 64 +++++++------------ 7 files changed, 55 insertions(+), 54 deletions(-) diff --git a/R/MsBackend.R b/R/MsBackend.R index d7216c53..70cf211e 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -15,6 +15,7 @@ #' @aliases dataStorageBasePath,MsBackendMzR-method #' @aliases dataStorageBasePath<- #' @aliases dataStorageBasePath<-,MsBackendMzR-method +#' @aliases extractByIndex #' @aliases msLeveL<-,MsBackend-method #' #' @description @@ -223,7 +224,9 @@ #' allowed. Parameter `i` should support `integer` indices and `logical` #' and should throw an error if `i` is out of bounds. The #' `MsCoreUtils::i2index` could be used to check the input `i`. -#' For `i = integer()` an empty backend should be returned. +#' For `i = integer()` an empty backend should be returned. Implementation +#' of this method is optional, as the default calls the `extractByIndex()` +#' method (which has to be implemented as the main subsetting method). #' #' - `$`, `$<-`: access or set/add a single spectrum variable (column) in the #' backend. Using a `value` of `NULL` should allow deleting the specified @@ -337,8 +340,7 @@ #' that can result in implementations of `[` being not found by R (which #' can happen sometimes in parallel processing using the [SnowParam()]). This #' method is used internally by `Spectra` to extract/subset its backend. -#' Implementation is optional, as the default implementation for `MsBackend` -#' will use `[`. +#' Implementation of this method is mandatory. #' #' - `filterAcquisitionNum()`: filters the object keeping only spectra matching #' the provided acquisition numbers (argument `n`). 
If `dataOrigin` or @@ -1117,14 +1119,13 @@ setMethod("dropNaSpectraVariables", "MsBackend", function(object) { #' #' @export setMethod("extractByIndex", c("MsBackend", "ANY"), function(object, i) { - object[i = i] + stop("'extractByIndex' not implemented for ", class(object), ".") }) #' @rdname MsBackend #' #' @export setMethod("extractByIndex", c("MsBackend", "missing"), function(object, i) { - message("extractByIndex,MsBackend,missing") object }) @@ -1858,7 +1859,7 @@ setMethod("tic", "MsBackend", function(object, initial = TRUE) { #' #' @export setMethod("[", "MsBackend", function(x, i, j, ..., drop = FALSE) { - stop("Not implemented for ", class(x), ".") + extractByIndex(x, i2index(i, length = length(x))) }) #' @exportMethod $ diff --git a/R/MsBackendCached.R b/R/MsBackendCached.R index caf7f743..dfb4aa40 100644 --- a/R/MsBackendCached.R +++ b/R/MsBackendCached.R @@ -437,7 +437,7 @@ setMethod("show", "MsBackendCached", function(object) { cat(class(object), "with", n, "spectra\n") if (n) { idx <- unique(c(1L:min(6L, n), max(1L, n-5L):n)) - spd <- spectraData(object[idx, ], + spd <- spectraData(extractByIndex(object, idx), c("msLevel", "precursorMz", "polarity")) if (!length(rownames(spd))) rownames(spd) <- idx diff --git a/R/MsBackendDataFrame.R b/R/MsBackendDataFrame.R index c89e192d..c04f2f6f 100644 --- a/R/MsBackendDataFrame.R +++ b/R/MsBackendDataFrame.R @@ -574,5 +574,5 @@ setMethod("filterAcquisitionNum", "MsBackendDataFrame", "acquisition number(s) for sub-setting") sel_file <- .sel_file(object, dataStorage, dataOrigin) sel_acq <- acquisitionNum(object) %in% n & sel_file - object[sel_acq | !sel_file] + extractByIndex(object, which(sel_acq | !sel_file)) }) diff --git a/inst/test_backends/test_MsBackend/test_spectra_subsetting.R b/inst/test_backends/test_MsBackend/test_spectra_subsetting.R index fe10f10c..1782747c 100644 --- a/inst/test_backends/test_MsBackend/test_spectra_subsetting.R +++ b/inst/test_backends/test_MsBackend/test_spectra_subsetting.R @@ -49,6 +49,24 @@ test_that("[", { res <- be[integer()] expect_s4_class(res, class(be)[1L]) expect_true(length(res) == 0L) + + ## logical + l <- rep(FALSE, length(be)) + l[sample(seq_along(l), floor(length(l) / 2))] <- TRUE + res <- be[l] + expect_true(validObject(res)) + expect_true(length(res) == sum(l)) + expect_equal(res, be[which(l)]) +}) + +#' extractByIndex. Uses [ if not implemented +test_that("extractByIndex", { + i <- sample(seq_along(be), floor(length(be) / 2)) + res <- extractByIndex(be, i) + expect_true(validObject(res)) + expect_equal(length(res), length(i)) + expect_equal(msLevel(res), msLevel(be)[i]) + expect_equal(rtime(res), rtime(be)[i]) }) #' dropNASpectraVariables: only for not read-only diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index 269b7cdd..e4424015 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -21,6 +21,7 @@ \alias{dataStorageBasePath,MsBackendMzR-method} \alias{dataStorageBasePath<-} \alias{dataStorageBasePath<-,MsBackendMzR-method} +\alias{extractByIndex} \alias{msLeveL<-,MsBackend-method} \alias{backendBpparam,MsBackend-method} \alias{backendInitialize,MsBackend-method} @@ -527,7 +528,9 @@ detailed description and examples): allowed. Parameter \code{i} should support \code{integer} indices and \code{logical} and should throw an error if \code{i} is out of bounds. The \code{MsCoreUtils::i2index} could be used to check the input \code{i}. -For \code{i = integer()} an empty backend should be returned. +For \code{i = integer()} an empty backend should be returned. 
Implementation +of this method is optional, as the default calls the \code{extractByIndex()} +method (which has to be implemented as the main subsetting method). \item \code{$}, \verb{$<-}: access or set/add a single spectrum variable (column) in the backend. Using a \code{value} of \code{NULL} should allow deleting the specified spectra variable. An error should be thrown if the spectra variable is not @@ -619,8 +622,7 @@ redundant to \code{[}, this methods avoids package namespace errors/problems that can result in implementations of \code{[} being not found by R (which can happen sometimes in parallel processing using the \code{\link[=SnowParam]{SnowParam()}}). This method is used internally by \code{Spectra} to extract/subset its backend. -Implementation is optional, as the default implementation for \code{MsBackend} -will use \code{[}. +Implementation of this method is mandatory. \item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or \code{dataStorage} is also provided, \code{object} is subsetted to the spectra with diff --git a/tests/testthat/test_MsBackend.R b/tests/testthat/test_MsBackend.R index cf36605e..f929451a 100644 --- a/tests/testthat/test_MsBackend.R +++ b/tests/testthat/test_MsBackend.R @@ -57,8 +57,6 @@ test_that("MsBackend methods throw errors", { expect_error(dm$a, "implemented for") expect_error(dm$a <- "a", "implemented for") expect_error(extractByIndex(dm, 1), "implemented for") - - expect_equal(extractByIndex(dm), dm) }) test_that("reset,MsBackend works", { diff --git a/vignettes/MsBackend.Rmd b/vignettes/MsBackend.Rmd index ff0269c9..9b5191ed 100644 --- a/vignettes/MsBackend.Rmd +++ b/vignettes/MsBackend.Rmd @@ -563,35 +563,39 @@ additionally available variables and the `columns` parameter of the (in addition to the required `"mz"` and `"intensity"` variables). -### `[` - -The `[` method allows to subset `MsBackend` objects. This operation is expected -to reduce a `MsBackend` object to the selected spectra. The method should -support to subset by indices or logical vectors and should also support -duplicating elements (i.e. when duplicated indices are used) as well as to -subset in arbitrary order. An error should be thrown if indices are out of -bounds, but the method should also support returning an empty backend with -`[integer()]`. Note that the `MsCoreUtils::i2index` function can be used to +### `extractByIndex()` and `[` + +The `extractByIndex()` and `[` methods allows to subset `MsBackend` objects. +This operation is expected to reduce a `MsBackend` object to the selected +spectra. These methods must also support duplication (e.g. `[c(1, 1, 1)]` and +extraction in any arbitrary order (e.g. `[c(3, 1, 5, 3)]`). While both methods +subset the object, `extractByIndex()` only supports to subset with an `integer` +index, while `[`, to be compliant with the base R implementation, should support +to subset by indices or logical vectors. An error should be thrown if indices +are out of bounds, but the method should also support returning an empty backend +with `[integer()]`. Note that the `MsCoreUtils::i2index` function can be used to check for correct input (and convert the input to an `integer` index). -Below we implement a possible `[` for our test backend class. We ignore the -parameters `j` from the definition of the `[` generic, since we treat our data -to be one-dimensional (with each spectrum being one element). 
+The `extractByIndex()` method is used by the data operation and analysis methods +on `Spectra` objects, while the `[` is intended to be used by the end user (if +needed). Below we implement `extractByIndex()` for our backend: ```{r} -setMethod("[", "MsBackendTest", function(x, i, j, ..., drop = FALSE) { - i <- MsCoreUtils::i2index(i, length = length(x)) - x@spectraVars <- x@spectraVars[i, ] - x@mz <- x@mz[i] - x@intensity <- x@intensity[i] - x +setMethod("extractByIndex", c("MsBackendTest", "ANY"), function(object, i) { + object@spectraVars <- object@spectraVars[i, ] + object@mz <- object@mz[i] + object@intensity <- object@intensity[i] + object }) ``` +The `[` does not need to be defined because a default implementation for +the base `MsBackend` exists. + We can now subset our backend to the last two spectra. ```{r} -a <- be[2:3] +a <- extractByIndex(be, 2:3) spectraData(a) ``` @@ -602,28 +606,6 @@ a <- be[c(2, 2, 2)] spectraData(a) ``` -In addition to the `[` method it is also suggested to implement a -`extractByIndex()` method. Similar to `[`, this method should extract elements -from, or subset, a backend, but it expects an integer vector with the indices as -second parameter. Hence, this method does not require conversion of -e.g. `logical` to `integer`. Also, this method helps avoiding namespace issues -sometimes encountered with parallel processing using the `SnowParam` when the -implementation of `[` for certain backends are not found. - -Data analysis methods on `Spectra` objects will use this method for most -operations. Implementation of this method is optional, since the default -implementation for `MsBackend` will fall back to `[`. Below we implement this -method for our example backend. - -```{r} -setMethod("extractByIndex", c("MsBackendTest", "ANY"), function(object, i) { - x@spectraVars <- x@spectraVars[i, ] - x@mz <- x@mz[i] - x@intensity <- x@intensity[i] - x -}) -``` - ### `backendMerge()` From 02d8394c08b4f4ca132cd65b70afe6163fb48b55 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Fri, 27 Sep 2024 10:31:03 +0200 Subject: [PATCH 32/41] fix: add backward compatibility --- R/MsBackend.R | 4 +++- tests/testthat/test_MsBackend.R | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/R/MsBackend.R b/R/MsBackend.R index 70cf211e..f1721a3e 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -1119,7 +1119,9 @@ setMethod("dropNaSpectraVariables", "MsBackend", function(object) { #' #' @export setMethod("extractByIndex", c("MsBackend", "ANY"), function(object, i) { - stop("'extractByIndex' not implemented for ", class(object), ".") + if (existsMethod("[", class(object)[1L])) + object[i = i] + else stop("'extractByIndex' not implemented for ", class(object)[1L], ".") }) #' @rdname MsBackend diff --git a/tests/testthat/test_MsBackend.R b/tests/testthat/test_MsBackend.R index f929451a..cea6af27 100644 --- a/tests/testthat/test_MsBackend.R +++ b/tests/testthat/test_MsBackend.R @@ -59,6 +59,30 @@ test_that("MsBackend methods throw errors", { expect_error(extractByIndex(dm, 1), "implemented for") }) +test_that("extractByIndex not implemented fallback", { + ## Backends that don't implement a dedicated `extractByIndex` method should + ## fall back to the [ method. 
+ setClass("DummyBackend", + contains = "MsBackend", + slots = c(d = "integer")) + dm <- new("DummyBackend") + expect_error(extractByIndex(dm, 1L), "'extractByIndex' not implemented") + + dm@d <- 1:4 + + ## Have an implementation for [ but not extractByIndex: + setMethod("[", "DummyBackend", function(x, i, j, ..., drop = FALSE) { + x@d <- x@d[i] + x + }) + + res <- dm[c(3, 1)] + expect_equal(res@d, c(3L, 1L)) + + res <- extractByIndex(dm, c(3, 1)) + expect_equal(res@d, c(3L, 1L)) +}) + test_that("reset,MsBackend works", { res <- reset(sciex_mzr) expect_equal(res, sciex_mzr) From 1efb9093dfcbdc1b21af0acfe62df924547e347c Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Thu, 3 Oct 2024 15:21:09 +0200 Subject: [PATCH 33/41] Add MsBackendMetaboLights to the list of backends --- DESCRIPTION | 2 +- NEWS.md | 4 +++ README.md | 73 ++++++++++++++++++++++++++++--------------- vignettes/Spectra.Rmd | 51 +++++++++++++++++++----------- 4 files changed, 85 insertions(+), 45 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 91db6af4..1892f972 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.10 +Version: 1.15.11 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index 3cc44fb5..c3cf888c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # Spectra 1.15 +## Changes in 1.15.11 + +- Add reference to `MsBackendMetaboLights`. + ## Changes in 1.15.10 - Add new `extractSpectra()` generic and implementation for `MsBackend`. Fixes diff --git a/README.md b/README.md index be839639..78d7efb9 100644 --- a/README.md +++ b/README.md @@ -19,58 +19,81 @@ footprint. A (possibly incomplete) list of available backends (along with a link to the R package providing it) is shown below: -- `MsBackendMemory` (package: *Spectra*): *default* backend which keeps all data - in memory. Optimized for fast processing. +- `MsBackendCompDb` (package + [*CompoundDb*](https://github.com/rformassspectrometry/CompoundDb): provides + access to spectra data (spectra and peaks variables) from a *CompDb* + database. Has a small memory footprint because all data (except precursor m/z + values) are retrieved on-the-fly from the database. + - `MsBackendDataFrame` (package: *Spectra*): alternative to the `MsBackendMemory` also keeping all data in memory, but supporting `S4` objects as spectra variables because the data is stored internally in a `DataFrame`. -- `MsBackendMzR` (package: *Spectra*): by using the `mzR` package it supports - import of MS data from mzML, mzXML and CDF files. This backend keeps only - general spectra variables in memory and retrieves the peaks data (m/z and - intensity values) on-the-fly from the original data files. The backend has - thus a smaller memory footprint compared to in-memory backends. + - `MsBackendHdf5Peaks` (package: *Spectra*): on-disk backend similar to `MsBackendMzR`, but the peaks data is stored in HDF5 files (general spectra variables are kept in memory). -- `MsBackendMgf` (package - [*MsBackendMgf*](https://github.com/rformassspectrometry/MsBackendMgf): allows - to import/export data in mascot generic format (MGF). Extends the - `MsBackendDataFrame` and keeps thus all data, after import, in memory. 
-- `MsBackendMsp` (package - [*MsbackendMsp*](https://github.com/rformassspectrometry/MsBackendMsp): allows - to import/export data in NIST MSP format. Extends the `MsBackendDataFrame` and - keeps thus all data, after import, in memory. + +- `MsBackendHmdbXml` (package + [*MsbackendHmdb*](https://github.com/rformassspectrometry/MsBackendHmdb)): + allows import of MS data from xml files of the Human Metabolome Database + (HMDB). Extends the `MsBackendDataFrame` and keeps thus all data, after + import, in memory. + - `MsBackendMassbank` (package [*MsBackendMassbank*](https://github.com/rformassspectrometry/MsBackendMassbank)): allows to import/export data in MassBank text file format. Extends the `MsBackendDataFrame` and keeps thus all data, after import, in memory. + - `MsBackendMassbankSql` (package [*MsBackendMassbank*](https://github.com/rformassspectrometry/MsBackendMassbank)): allows to directly connect to a MassBank SQL database to retrieve all MS data and variables. Has a minimal memory footprint because all data is retrieved on-the-fly from the SQL database. + +- `MsBackendMemory` (package: *Spectra*): *default* backend which keeps all data + in memory. Optimized for fast processing. + +- `MsBackendMetaboLights` (package + [*MsBackendMetaboLights*](https://github.com/rformassspectrometry/MsBackendMetaboLights)): + retrieves and caches MS data files from MetaboLights. + +- `MsBackendMgf` (package + [*MsBackendMgf*](https://github.com/rformassspectrometry/MsBackendMgf)): allows + to import/export data in mascot generic format (MGF). Extends the + `MsBackendDataFrame` and keeps thus all data, after import, in memory. + +- `MsBackendMsp` (package + [*MsbackendMsp*](https://github.com/rformassspectrometry/MsBackendMsp)): allows + to import/export data in NIST MSP format. Extends the `MsBackendDataFrame` and + keeps thus all data, after import, in memory. + +- `MsBackendMzR` (package: *Spectra*): by using the `mzR` package it supports + import of MS data from mzML, mzXML and CDF files. This backend keeps only + general spectra variables in memory and retrieves the peaks data (m/z and + intensity values) on-the-fly from the original data files. The backend has + thus a smaller memory footprint compared to in-memory backends. + +- `MsBackendOfflineSql` (package + [*MsBackendSql*](https://github.com/rformassspectrometry/MsBackendSql)): + stores all MS data in a SQL database and has thus a minimal memory footprint. + Does, in contrast to `MsBackendSql`, not keep an active SQL database + connection and can thus support parallel processing. + - `MsBackendRawFileReader` (package [*MsBackendRawFileReader*](https://github.com/fgcz/MsBackendRawFileReader)): implements a backend for reading MS data from Thermo Fisher Scientific's raw data files using the manufacturer's NewRawFileReader .Net libraries. The package generalizes the functionality introduced by the `rawrr` package. -- `MsBackendHmdbXml` (package - [*MsbackendHmdb*](https://github.com/rformassspectrometry/MsBackendHmdb)): - allows import of MS data from xml files of the Human Metabolome Database - (HMDB). Extends the `MsBackendDataFrame` and keeps thus all data, after - import, in memory. + - `MsBackendSql` (package [*MsBackendSql*](https://github.com/rformassspectrometry/MsBackendSql)): stores all MS data in a SQL database and has thus a minimal memory footprint. 
-- `MsBackendCompDb` (package - [*CompoundDb*](https://github.com/rformassspectrometry/CompoundDb): provides - access to spectra data (spectra and peaks variables) from a *CompDb* - database. Has a small memory footprint because all data (except precursor m/z - values) are retrieved on-the-fly from the database. + - `MsBackendTimsTof` (package [*MsBackendTimsTof*](https://github.com/rformassspectrometry/MsBackendTimsTof): allows import of data from Bruker TimsTOF raw data files (using the `opentimsr` R package). + - `MsBackendWeizMass` (package [*MsBackendWeizMass*](https://github.com/rformassspectrometry/MsBackendWeizMass): allows to access MS data from WeizMass MS/MS spectral databases. diff --git a/vignettes/Spectra.Rmd b/vignettes/Spectra.Rmd index 8d383700..35e0dfbb 100644 --- a/vignettes/Spectra.Rmd +++ b/vignettes/Spectra.Rmd @@ -1244,38 +1244,51 @@ head(basename(dataStorage(sps_tmt))) A (possibly incomplete) list of R packages providing additional backends that add support for additional data types or storage options is provided below: -- `r BiocStyle::Biocpkg("MsBackendMgf")`: support for import/export of mass - spectrometry files in mascot generic format (MGF). -- `r BiocStyle::Biocpkg("MsBackendMsp")`: allows to import/export data in NIST - MSP format. Extends the `MsBackendDataFrame` and keeps thus all data, after - import, in memory. -- `MsBackendMassbank` (package `r BiocStyle::Biocpkg("MsBackendMassbank")`): - allows to import/export data in MassBank text file format. Extends the - `MsBackendDataFrame` and keeps thus all data, after import, in memory. -- `MsBackendMassbankSql` (package `r BiocStyle::Biocpkg("MsBackendMassbank")`): - allows to directly connect to a MassBank SQL database to retrieve all MS data - and variables. Has a minimal memory footprint because all data is retrieved - on-the-fly from the SQL database. -- `r BiocStyle::Biocpkg("MsBackendSql")`: stores all MS data in a SQL database - and has thus a minimal memory footprint. - `MsBackendCompDb` (package `r BiocStyle::Biocpkg("CompoundDb")`): provides access to spectra data (spectra and peaks variables) from a *CompDb* database. Has a small memory footprint because all data (except precursor m/z values) are retrieved on-the-fly from the database. -- `r Biocpkg("MsBackendRawFileReader")`: implements a backend for reading MS - data from Thermo Fisher Scientific's raw data files using the manufacturer's - NewRawFileReader .Net libraries. The package generalizes the functionality - introduced by the `r Biocpkg("rawrr")` package, see also - [@kockmann_rawrr_2021]. + - `MsBackendHmdbXml` (package [`MsbackendHmdb`](https://github.com/rformassspectrometry/MsBackendHmdb)): allows import of MS data from xml files of the Human Metabolome Database (HMDB). Extends the `MsBackendDataFrame` and keeps thus all data, after import, in memory. + +- `MsBackendMassbank` (package `r BiocStyle::Biocpkg("MsBackendMassbank")`): + allows to import/export data in MassBank text file format. Extends the + `MsBackendDataFrame` and keeps thus all data, after import, in memory. + +- `MsBackendMassbankSql` (package `r BiocStyle::Biocpkg("MsBackendMassbank")`): + allows to directly connect to a MassBank SQL database to retrieve all MS data + and variables. Has a minimal memory footprint because all data is retrieved + on-the-fly from the SQL database. + +- `MsBackendMetaboLights` (package `r + BiocStyle::Biocpkg("MsBackendMetaboLights")`): retrieves and caches MS data + files from the MetaboLights repository. 
+ +- `MsBackendMgf`: (package `r BiocStyle::Biocpkg("MsBackendMgf")`): support for + import/export of mass spectrometry files in mascot generic format (MGF). + +- `MsBackendMsp`: (package `r BiocStyle::Biocpkg("MsBackendMsp")`): allows to + import/export data in NIST MSP format. Extends the `MsBackendDataFrame` and + keeps thus all data, after import, in memory. + +- `MsBackendRawFileReader` (package `r Biocpkg("MsBackendRawFileReader")`): + implements a backend for reading MS data from Thermo Fisher Scientific's raw + data files using the manufacturer's NewRawFileReader .Net libraries. The + package generalizes the functionality introduced by the `r Biocpkg("rawrr")` + package, see also [@kockmann_rawrr_2021]. + +- `MsBackendSql` (package `r BiocStyle::Biocpkg("MsBackendSql")`): stores all MS + data in a SQL database and has thus a minimal memory footprint. + - `MsBackendTimsTof` (package [`MsBackendTimsTof`](https://github.com/rformassspectrometry/MsBackendTimsTof): allows import of data from Bruker TimsTOF raw data files (using the `opentimsr` R package). + - `MsBackendWeizMass` (package [`MsBackendWeizMass`](https://github.com/rformassspectrometry/MsBackendWeizMass): allows to access MS data from WeizMass MS/MS spectral databases. From 37d502933344ed5d9bc938f8cff2c5e189ac27d3 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Tue, 15 Oct 2024 12:04:17 +0200 Subject: [PATCH 34/41] feat: add new method backendRequiredSpectraVariables - Add new method `backendRequiredSpectraVariables()` that allows a backend to specify which spectra variables are necessary to ensure proper functionality. Subsetting functions (like `selectSpectraVariables()`) should use these. --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ NEWS.md | 5 +++++ R/AllGenerics.R | 2 ++ R/MsBackend.R | 22 +++++++++++++++++-- R/MsBackendDataFrame.R | 17 ++++++++++---- R/MsBackendHdf5Peaks.R | 10 +++++++-- R/MsBackendMemory.R | 9 +++++++- R/MsBackendMzR.R | 10 +++++++-- .../test_MsBackend/test_spectra_subsetting.R | 5 +++-- man/MsBackend.Rd | 8 +++++++ man/hidden_aliases.Rd | 12 ++++++++++ tests/testthat/test_MsBackend.R | 1 + tests/testthat/test_MsBackendDataFrame.R | 7 +++++- tests/testthat/test_MsBackendHdf5Peaks.R | 5 +++++ tests/testthat/test_MsBackendMemory.R | 5 +++++ tests/testthat/test_MsBackendMzR.R | 8 ++++++- vignettes/MsBackend.Rmd | 17 ++++++++++++++ 18 files changed, 131 insertions(+), 16 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1892f972..431584b9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.11 +Version: 1.15.12 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. 
It provides different diff --git a/NAMESPACE b/NAMESPACE index e3e4970c..df65fbe1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,6 +71,7 @@ exportMethods(backendBpparam) exportMethods(backendInitialize) exportMethods(backendMerge) exportMethods(backendParallelFactor) +exportMethods(backendRequiredSpectraVariables) exportMethods(bin) exportMethods(c) exportMethods(centroided) @@ -207,6 +208,7 @@ importFrom(methods,.hasSlot) importFrom(methods,.valueClassTest) importFrom(methods,as) importFrom(methods,callNextMethod) +importFrom(methods,existsMethod) importFrom(methods,is) importFrom(methods,new) importFrom(methods,setAs) diff --git a/NEWS.md b/NEWS.md index c3cf888c..d21a1714 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # Spectra 1.15 +## Changes in 1.15.12 + +- Add generic `backendRequiredSpectraVariables()` to allow definition of + mandatory spectra variables for a backend. + ## Changes in 1.15.11 - Add reference to `MsBackendMetaboLights`. diff --git a/R/AllGenerics.R b/R/AllGenerics.R index d02aa13c..856cb69e 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -1,6 +1,8 @@ #' @include hidden_aliases.R NULL +setGeneric("backendRequiredSpectraVariables", function(object, ...) + standardGeneric("backendRequiredSpectraVariables")) #' @rdname hidden_aliases setMethod("bin", "numeric", MsCoreUtils::bin) setGeneric("combinePeaks", function(object, ...) diff --git a/R/MsBackend.R b/R/MsBackend.R index f1721a3e..f2a86d0b 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -17,6 +17,8 @@ #' @aliases dataStorageBasePath<-,MsBackendMzR-method #' @aliases extractByIndex #' @aliases msLeveL<-,MsBackend-method +#' @aliases backendRequiredSpectraVariables +#' @aliases backendRequiredSpectraVariables,MsBackend-method #' #' @description #' @@ -280,6 +282,13 @@ #' `MsBackendMzR` on the other hand returns `factor(dataStorage(object))` #' hence suggesting to split the object by data file. #' +#' - `backendRequiredSpectraVariables()`: returns a `character` with spectra +#' variable names that are mandatory for a specific backend. The default +#' returns an empty `character()`. The implementation for `MsBackendMzR` +#' returns `c("dataStorage", "scanIndex")` as these two spectra variables +#' are required to load the MS data on-the-fly. This method needs only to +#' be implemented if a backend requires specific variables to be defined. +#' #' - `dataOrigin()`: gets a `character` of length equal to the number of #' spectra in `object` with the *data origin* of each spectrum. This could #' e.g. be the mzML file from which the data was read. @@ -965,6 +974,12 @@ setMethod("backendParallelFactor", "MsBackend", function(object, ...) { factor() }) +#' @export +setMethod("backendRequiredSpectraVariables", "MsBackend", + function(object, ...) 
{ + character() + }) + #' @rdname MsBackend #' #' @export @@ -1104,7 +1119,8 @@ setReplaceMethod("dataStorage", "MsBackend", function(object, value) { #' @export setMethod("dropNaSpectraVariables", "MsBackend", function(object) { svs <- spectraVariables(object) - svs <- svs[!(svs %in% c("mz", "intensity"))] + req_cols <- c(backendRequiredSpectraVariables(object), c("mz", "intensity")) + svs <- svs[!(svs %in% req_cols)] spd <- spectraData(object, columns = svs) keep <- !vapply1l(spd, function(z) { allna <- all(is.na(z)) @@ -1112,11 +1128,13 @@ setMethod("dropNaSpectraVariables", "MsBackend", function(object) { FALSE else allna }) - selectSpectraVariables(object, c(svs[keep], "mz", "intensity")) + selectSpectraVariables(object, c(svs[keep], req_cols)) }) #' @rdname MsBackend #' +#' @importFrom methods existsMethod +#' #' @export setMethod("extractByIndex", c("MsBackend", "ANY"), function(object, i) { if (existsMethod("[", class(object)[1L])) diff --git a/R/MsBackendDataFrame.R b/R/MsBackendDataFrame.R index c04f2f6f..6959d771 100644 --- a/R/MsBackendDataFrame.R +++ b/R/MsBackendDataFrame.R @@ -22,7 +22,8 @@ setClass("MsBackendDataFrame", version = "0.2")) setValidity("MsBackendDataFrame", function(object) { - msg <- .valid_spectra_data_required_columns(object@spectraData) + msg <- .valid_spectra_data_required_columns( + object@spectraData, backendRequiredSpectraVariables(object)) if (length(msg)) return(msg) msg <- c( @@ -92,6 +93,12 @@ setMethod("backendMerge", "MsBackendDataFrame", function(object, ...) { res }) +#' @rdname hidden_aliases +setMethod("backendRequiredSpectraVariables", "MsBackendDataFrame", + function(object, ...) { + "dataStorage" + }) + ## Data accessors #' @rdname hidden_aliases @@ -413,14 +420,16 @@ setMethod("selectSpectraVariables", "MsBackendDataFrame", paste(spectraVariables[!(spectraVariables %in% spectraVariables(object))], collapse = ", "), " not available") + bv <- backendRequiredSpectraVariables(object) + if (!all(bv %in% spectraVariables)) + stop("Spectra variables ", + paste(bv[!bv %in% spectraVariables], collapse = ","), + " are required by the backend") keep <- spectraVariables[spectraVariables %in% colnames(object@spectraData)] if (length(keep)) object@spectraData <- object@spectraData[, keep, drop = FALSE] - msg <- .valid_spectra_data_required_columns(object@spectraData) - if (length(msg)) - stop(msg) object@peaksVariables <- intersect(object@peaksVariables, spectraVariables) validObject(object) diff --git a/R/MsBackendHdf5Peaks.R b/R/MsBackendHdf5Peaks.R index ebcb8ea2..27f14753 100644 --- a/R/MsBackendHdf5Peaks.R +++ b/R/MsBackendHdf5Peaks.R @@ -26,8 +26,8 @@ setClass("MsBackendHdf5Peaks", prototype = prototype(version = "0.1", readonly = FALSE)) setValidity("MsBackendHdf5Peaks", function(object) { - msg <- .valid_spectra_data_required_columns(object@spectraData, - c("dataStorage", "scanIndex")) + msg <- .valid_spectra_data_required_columns( + object@spectraData, backendRequiredSpectraVariables(object)) fls <- unique(object@spectraData$dataStorage) msg <- c(msg, .valid_ms_backend_mod_count(object@modCount, fls)) msg <- c(msg, .valid_ms_backend_files_exist(fls)) @@ -36,6 +36,12 @@ setValidity("MsBackendHdf5Peaks", function(object) { else msg }) +#' @rdname hidden_aliases +setMethod("backendRequiredSpectraVariables", "MsBackendHdf5Peaks", + function(object, ...) 
{ + c("dataStorage", "scanIndex") + }) + #' @rdname hidden_aliases #' #' @importFrom fs path_sanitize diff --git a/R/MsBackendMemory.R b/R/MsBackendMemory.R index 594fc799..4bde69ac 100644 --- a/R/MsBackendMemory.R +++ b/R/MsBackendMemory.R @@ -122,6 +122,12 @@ setMethod("backendMerge", "MsBackendMemory", function(object, ...) { res }) +#' @rdname hidden_aliases +setMethod("backendRequiredSpectraVariables", "MsBackendMemory", + function(object, ...) { + "dataStorage" + }) + ## Data accessors #' @rdname hidden_aliases @@ -514,7 +520,8 @@ setMethod("selectSpectraVariables", "MsBackendMemory", z[, keep, drop = FALSE]) } } - msg <- .valid_spectra_data_required_columns(object@spectraData) + msg <- .valid_spectra_data_required_columns( + object@spectraData, backendRequiredSpectraVariables(object)) if (length(msg)) stop(msg) validObject(object) diff --git a/R/MsBackendMzR.R b/R/MsBackendMzR.R index 69a04987..a7930e0d 100644 --- a/R/MsBackendMzR.R +++ b/R/MsBackendMzR.R @@ -24,14 +24,20 @@ setClass("MsBackendMzR", prototype = prototype(version = "0.1", readonly = TRUE)) setValidity("MsBackendMzR", function(object) { - msg <- .valid_spectra_data_required_columns(object@spectraData, - c("dataStorage", "scanIndex")) + msg <- .valid_spectra_data_required_columns( + object@spectraData, backendRequiredSpectraVariables(object)) msg <- c(msg, .valid_ms_backend_files_exist( unique(object@spectraData$dataStorage))) if (length(msg)) msg else TRUE }) +#' @rdname hidden_aliases +setMethod("backendRequiredSpectraVariables", "MsBackendMzR", + function(object, ...) { + c("dataStorage", "scanIndex") + }) + #' @rdname hidden_aliases #' #' @importFrom methods callNextMethod diff --git a/inst/test_backends/test_MsBackend/test_spectra_subsetting.R b/inst/test_backends/test_MsBackend/test_spectra_subsetting.R index 1782747c..93adce0d 100644 --- a/inst/test_backends/test_MsBackend/test_spectra_subsetting.R +++ b/inst/test_backends/test_MsBackend/test_spectra_subsetting.R @@ -92,8 +92,9 @@ test_that("selectSpectraVariables", { if (!isReadOnly(be) || inherits(be, "MsBackendCached") || inherits(be, "MsBackendDataFrame")) { tmp <- be - res <- selectSpectraVariables(tmp, c("mz", "intensity", - "dataStorage", "scanIndex")) + res <- selectSpectraVariables( + tmp, union(c("mz", "intensity", "dataStorage", "scanIndex"), + backendRequiredSpectraVariables(be))) expect_true(all(names(coreSpectraVariables()) %in% spectraVariables(res))) expect_true(all(is.na(res$msLevel))) diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index e4424015..16b5e782 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -23,6 +23,8 @@ \alias{dataStorageBasePath<-,MsBackendMzR-method} \alias{extractByIndex} \alias{msLeveL<-,MsBackend-method} +\alias{backendRequiredSpectraVariables} +\alias{backendRequiredSpectraVariables,MsBackend-method} \alias{backendBpparam,MsBackend-method} \alias{backendInitialize,MsBackend-method} \alias{backendMerge,list-method} @@ -576,6 +578,12 @@ The default implementation returns a factor of length 0 (\code{factor()}) providing thus no default splitting. \code{backendParallelFactor()} for \code{MsBackendMzR} on the other hand returns \code{factor(dataStorage(object))} hence suggesting to split the object by data file. +\item \code{backendRequiredSpectraVariables()}: returns a \code{character} with spectra +variable names that are mandatory for a specific backend. The default +returns an empty \code{character()}. 
The implementation for \code{MsBackendMzR} +returns \code{c("dataStorage", "scanIndex")} as these two spectra variables +are required to load the MS data on-the-fly. This method needs only to +be implemented if a backend requires specific variables to be defined. \item \code{dataOrigin()}: gets a \code{character} of length equal to the number of spectra in \code{object} with the \emph{data origin} of each spectrum. This could e.g. be the mzML file from which the data was read. diff --git a/man/hidden_aliases.Rd b/man/hidden_aliases.Rd index 3e70d26c..ce4e63e9 100644 --- a/man/hidden_aliases.Rd +++ b/man/hidden_aliases.Rd @@ -10,6 +10,7 @@ \alias{bin,numeric-method} \alias{show,MsBackendDataFrame-method} \alias{backendMerge,MsBackendDataFrame-method} +\alias{backendRequiredSpectraVariables,MsBackendDataFrame-method} \alias{acquisitionNum,MsBackendDataFrame-method} \alias{peaksData,MsBackendDataFrame-method} \alias{centroided,MsBackendDataFrame-method} @@ -60,6 +61,7 @@ \alias{$<-,MsBackendDataFrame-method} \alias{split,MsBackendDataFrame,ANY-method} \alias{filterAcquisitionNum,MsBackendDataFrame-method} +\alias{backendRequiredSpectraVariables,MsBackendHdf5Peaks-method} \alias{backendInitialize,MsBackendHdf5Peaks-method} \alias{show,MsBackendHdf5Peaks-method} \alias{peaksData,MsBackendHdf5Peaks-method} @@ -80,6 +82,7 @@ \alias{backendMerge,MsBackendHdf5Peaks-method} \alias{show,MsBackendMemory-method} \alias{backendMerge,MsBackendMemory-method} +\alias{backendRequiredSpectraVariables,MsBackendMemory-method} \alias{acquisitionNum,MsBackendMemory-method} \alias{centroided,MsBackendMemory-method} \alias{centroided<-,MsBackendMemory-method} @@ -132,6 +135,7 @@ \alias{[,MsBackendMemory-method} \alias{split,MsBackendMemory,ANY-method} \alias{filterAcquisitionNum,MsBackendMemory-method} +\alias{backendRequiredSpectraVariables,MsBackendMzR-method} \alias{backendInitialize,MsBackendMzR-method} \alias{show,MsBackendMzR-method} \alias{peaksData,MsBackendMzR-method} @@ -168,6 +172,8 @@ \S4method{backendMerge}{MsBackendDataFrame}(object, ...) +\S4method{backendRequiredSpectraVariables}{MsBackendDataFrame}(object, ...) + \S4method{acquisitionNum}{MsBackendDataFrame}(object) \S4method{peaksData}{MsBackendDataFrame}(object, columns = c("mz", "intensity")) @@ -275,6 +281,8 @@ dataOrigin = character() ) +\S4method{backendRequiredSpectraVariables}{MsBackendHdf5Peaks}(object, ...) + \S4method{backendInitialize}{MsBackendHdf5Peaks}( object, files = character(), @@ -322,6 +330,8 @@ \S4method{backendMerge}{MsBackendMemory}(object, ...) +\S4method{backendRequiredSpectraVariables}{MsBackendMemory}(object, ...) + \S4method{acquisitionNum}{MsBackendMemory}(object) \S4method{centroided}{MsBackendMemory}(object) @@ -431,6 +441,8 @@ dataOrigin = character() ) +\S4method{backendRequiredSpectraVariables}{MsBackendMzR}(object, ...) 
+ \S4method{backendInitialize}{MsBackendMzR}(object, files, ..., BPPARAM = bpparam()) \S4method{show}{MsBackendMzR}(object) diff --git a/tests/testthat/test_MsBackend.R b/tests/testthat/test_MsBackend.R index cea6af27..01fa65c2 100644 --- a/tests/testthat/test_MsBackend.R +++ b/tests/testthat/test_MsBackend.R @@ -57,6 +57,7 @@ test_that("MsBackend methods throw errors", { expect_error(dm$a, "implemented for") expect_error(dm$a <- "a", "implemented for") expect_error(extractByIndex(dm, 1), "implemented for") + expect_equal(backendRequiredSpectraVariables(dm), character()) }) test_that("extractByIndex not implemented fallback", { diff --git a/tests/testthat/test_MsBackendDataFrame.R b/tests/testthat/test_MsBackendDataFrame.R index e5de3662..238e945a 100644 --- a/tests/testthat/test_MsBackendDataFrame.R +++ b/tests/testthat/test_MsBackendDataFrame.R @@ -653,7 +653,7 @@ test_that("selectSpectraVariables,MsBackendDataFrame works", { expect_equal(colnames(res@spectraData), c("dataStorage", "rtime")) expect_equal(res@peaksVariables, be@peaksVariables) - expect_error(selectSpectraVariables(be, "rtime"), "dataStorage is/are missing") + expect_error(selectSpectraVariables(be, "rtime"), "are required") expect_error(selectSpectraVariables(be, "something"), "something not available") @@ -1024,3 +1024,8 @@ test_that("[[,[[<-,MsBackendDataFrame works", { test_that("supportsSetBackend,MsBackendDataFrame", { expect_true(supportsSetBackend(MsBackendDataFrame())) }) + +test_that("backendRequiredSpectraVariables,MsBackendDataFrame works", { + expect_equal(backendRequiredSpectraVariables(MsBackendDataFrame()), + "dataStorage") +}) diff --git a/tests/testthat/test_MsBackendHdf5Peaks.R b/tests/testthat/test_MsBackendHdf5Peaks.R index 3604d895..17495169 100644 --- a/tests/testthat/test_MsBackendHdf5Peaks.R +++ b/tests/testthat/test_MsBackendHdf5Peaks.R @@ -413,3 +413,8 @@ test_that("backendParallelFactor,MsBackendHdf5Peaks", { factor(dataStorage(sciex_hd5), levels = unique(dataStorage(sciex_hd5)))) }) + +test_that("backendRequiredSpectraVariables,MsBackendHdf5Peaks works", { + expect_equal(backendRequiredSpectraVariables(MsBackendHdf5Peaks()), + c("dataStorage", "scanIndex")) +}) diff --git a/tests/testthat/test_MsBackendMemory.R b/tests/testthat/test_MsBackendMemory.R index bb3c9973..119e2c56 100644 --- a/tests/testthat/test_MsBackendMemory.R +++ b/tests/testthat/test_MsBackendMemory.R @@ -944,3 +944,8 @@ test_that("tic,MsBackendMemory works", { test_that("supportsSetBackend,MsBackendMemory", { expect_true(supportsSetBackend(MsBackendMemory())) }) + +test_that("backendRequiredSpectraVariables,MsBackendMemory works", { + expect_equal(backendRequiredSpectraVariables(MsBackendMemory()), + "dataStorage") +}) diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index 36de14c4..d8a83227 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -504,7 +504,7 @@ test_that("selectSpectraVariables,MsBackendMzR works", { expect_equal(res@peaksVariables, c("mz", "intensity")) expect_error(selectSpectraVariables(be, c("dataStorage", "msLevel")), - "scanIndex is/are missing") + "required") }) test_that("$,$<-,MsBackendMzR works", { @@ -597,3 +597,9 @@ test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { #' errors expect_error(dataStorageBasePath(tmp) <- "some path", "Provided path") }) + +test_that("backendRequiredSpectraVariables,MsBackendMzR works", { + tmp <- MsBackendMzR() + expect_equal(backendRequiredSpectraVariables(tmp), + 
c("dataStorage", "scanIndex")) +}) diff --git a/vignettes/MsBackend.Rmd b/vignettes/MsBackend.Rmd index 9b5191ed..5192084e 100644 --- a/vignettes/MsBackend.Rmd +++ b/vignettes/MsBackend.Rmd @@ -1590,6 +1590,23 @@ setMethod("backendParallelFactor", "MsBackend", function(object, ...) { ``` +### `backendRequiredSpectraVariables()` + +The `backendRequiredSpectraVariables()` method can be implemented if a backend +needs specific spectra variables to work. The default implementation is: + +```{r} +setMethod("backendRequiredSpectraVariables", "MsBackend", + function(object, ...) { + character() + }) +``` + +The implementation for `MsBackendMzR` returns `c("dataStorage", "scanIndex")` as +the backend needs these two spectra variables to load the MS data on-the-fly +from the original data files. + + ### `dropNaSpectraVariables()` The `dropNaSpectraVariables()` is supposed to allow removing all spectra From 5f3a02c49cceb3bf52b7dfcef2ecb376c424b481 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Fri, 25 Oct 2024 08:17:36 +0200 Subject: [PATCH 35/41] feat: add precursorMz<- replacement method (issue #336) --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ NEWS.md | 4 ++++ R/MsBackend.R | 12 ++++++++++++ R/MsBackendCached.R | 9 --------- R/Spectra.R | 6 ++++++ man/MsBackend.Rd | 3 +++ man/spectraData.Rd | 3 +++ tests/testthat/test_MsBackend.R | 1 + tests/testthat/test_MsBackendCached.R | 7 +++++++ tests/testthat/test_MsBackendMzR.R | 6 ++++++ tests/testthat/test_Spectra.R | 6 ++++++ vignettes/MsBackend.Rmd | 15 +++++++++++++++ 13 files changed, 66 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 431584b9..7727f9b8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.12 +Version: 1.15.13 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NAMESPACE b/NAMESPACE index df65fbe1..8d8185f9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -60,6 +60,7 @@ exportMethods("msLevel<-") exportMethods("mz<-") exportMethods("peaksData<-") exportMethods("polarity<-") +exportMethods("precursorMz<-") exportMethods("rtime<-") exportMethods("smoothed<-") exportMethods("spectraData<-") @@ -237,6 +238,7 @@ importMethodsFrom(ProtGenerics,"msLevel<-") importMethodsFrom(ProtGenerics,"mz<-") importMethodsFrom(ProtGenerics,"peaksData<-") importMethodsFrom(ProtGenerics,"polarity<-") +importMethodsFrom(ProtGenerics,"precursorMz<-") importMethodsFrom(ProtGenerics,"rtime<-") importMethodsFrom(ProtGenerics,"smoothed<-") importMethodsFrom(ProtGenerics,"spectraData<-") diff --git a/NEWS.md b/NEWS.md index d21a1714..b3d0404d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # Spectra 1.15 +## Changes in 1.15.13 + +- Add `precursorMz<-` method [issue #336](https://github.com/rformassspectrometry/Spectra/issues/336). 
+ ## Changes in 1.15.12 - Add generic `backendRequiredSpectraVariables()` to allow definition of diff --git a/R/MsBackend.R b/R/MsBackend.R index f2a86d0b..186f26c6 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -1699,6 +1699,18 @@ setMethod("precursorMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) +#' @exportMethod precursorMz<- +#' +#' @importMethodsFrom ProtGenerics precursorMz<- +#' +#' @rdname MsBackend +#' +#' @export +setReplaceMethod("precursorMz", "MsBackend", function(object, ..., value) { + object$precursorMz <- value + object +}) + #' @exportMethod peaksData<- #' #' @importMethodsFrom ProtGenerics peaksData<- diff --git a/R/MsBackendCached.R b/R/MsBackendCached.R index dfb4aa40..e2f4d4d2 100644 --- a/R/MsBackendCached.R +++ b/R/MsBackendCached.R @@ -464,7 +464,6 @@ setMethod("centroided", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("centroided", "MsBackendCached", function(object, value) { object$centroided <- value - validObject(object) object }) @@ -476,7 +475,6 @@ setMethod("collisionEnergy", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("collisionEnergy", "MsBackendCached", function(object, value) { object$collisionEnergy <- value - validObject(object) object }) @@ -488,7 +486,6 @@ setMethod("dataOrigin", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("dataOrigin", "MsBackendCached", function(object, value) { object$dataOrigin <- value - validObject(object) object }) @@ -525,7 +522,6 @@ setMethod("isolationWindowLowerMz", "MsBackendCached", function(object) { setReplaceMethod("isolationWindowLowerMz", "MsBackendCached", function(object, value) { object$isolationWindowLowerMz <- value - validObject(object) object }) @@ -538,7 +534,6 @@ setMethod("isolationWindowTargetMz", "MsBackendCached", function(object) { setReplaceMethod("isolationWindowTargetMz", "MsBackendCached", function(object, value) { object$isolationWindowTargetMz <- value - validObject(object) object }) @@ -551,7 +546,6 @@ setMethod("isolationWindowUpperMz", "MsBackendCached", function(object) { setReplaceMethod("isolationWindowUpperMz", "MsBackendCached", function(object, value) { object$isolationWindowUpperMz <- value - validObject(object) object }) @@ -574,7 +568,6 @@ setMethod("polarity", "MsBackendCached", function(object) { setReplaceMethod("polarity", "MsBackendCached", function(object, value) { if (is.numeric(value)) value <- as.integer(value) object$polarity <- value - validObject(object) object }) @@ -601,7 +594,6 @@ setMethod("rtime", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("rtime", "MsBackendCached", function(object, value) { object$rtime <- value - validObject(object) object }) @@ -618,6 +610,5 @@ setMethod("smoothed", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("smoothed", "MsBackendCached", function(object, value) { object$smoothed <- value - validObject(object) object }) diff --git a/R/Spectra.R b/R/Spectra.R index 7564b0a4..14ebbf2c 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -1244,6 +1244,12 @@ setMethod("precursorMz", "Spectra", function(object) { precursorMz(object@backend) }) +#' @rdname spectraData +setReplaceMethod("precursorMz", "Spectra", function(object, ..., value) { + precursorMz(object@backend) <- value + object +}) + #' @rdname spectraData setMethod("rtime", "Spectra", function(object) { rtime(object@backend) diff --git a/man/MsBackend.Rd 
b/man/MsBackend.Rd index 16b5e782..279576a5 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -84,6 +84,7 @@ \alias{precursorCharge,MsBackend-method} \alias{precursorIntensity,MsBackend-method} \alias{precursorMz,MsBackend-method} +\alias{precursorMz<-,MsBackend-method} \alias{peaksData<-,MsBackend-method} \alias{reset,MsBackend-method} \alias{rtime,MsBackend-method} @@ -244,6 +245,8 @@ \S4method{precursorMz}{MsBackend}(object) +\S4method{precursorMz}{MsBackend}(object, ...) <- value + \S4method{peaksData}{MsBackend}(object) <- value \S4method{reset}{MsBackend}(object) diff --git a/man/spectraData.Rd b/man/spectraData.Rd index 49d2bee3..2aad735f 100644 --- a/man/spectraData.Rd +++ b/man/spectraData.Rd @@ -62,6 +62,7 @@ \alias{precursorCharge,Spectra-method} \alias{precursorIntensity,Spectra-method} \alias{precursorMz,Spectra-method} +\alias{precursorMz<-,Spectra-method} \alias{rtime,Spectra-method} \alias{rtime<-,Spectra-method} \alias{scanIndex,Spectra-method} @@ -154,6 +155,8 @@ coreSpectraVariables() \S4method{precursorMz}{Spectra}(object) +\S4method{precursorMz}{Spectra}(object, ...) <- value + \S4method{rtime}{Spectra}(object) \S4method{rtime}{Spectra}(object) <- value diff --git a/tests/testthat/test_MsBackend.R b/tests/testthat/test_MsBackend.R index 01fa65c2..5e91a0fe 100644 --- a/tests/testthat/test_MsBackend.R +++ b/tests/testthat/test_MsBackend.R @@ -58,6 +58,7 @@ test_that("MsBackend methods throw errors", { expect_error(dm$a <- "a", "implemented for") expect_error(extractByIndex(dm, 1), "implemented for") expect_equal(backendRequiredSpectraVariables(dm), character()) + expect_error(precursorMz(dm) <- 12.3, "implemented for") }) test_that("extractByIndex not implemented fallback", { diff --git a/tests/testthat/test_MsBackendCached.R b/tests/testthat/test_MsBackendCached.R index 86bd8639..e547b190 100644 --- a/tests/testthat/test_MsBackendCached.R +++ b/tests/testthat/test_MsBackendCached.R @@ -302,3 +302,10 @@ test_that("lengths,MsBackendCached works", { res <- lengths(be) expect_true(all(res == 0)) }) + +test_that("precursorMz<-,MsBackendCached works", { + be <- backendInitialize(MsBackendCached(), nspectra = 4) + expect_true(all(is.na(precursorMz(be)))) + precursorMz(be) <- c(1.1, 1.2, 1.3, 1.34) + expect_equal(precursorMz(be), c(1.1, 1.2, 1.3, 1.34)) +}) diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index d8a83227..44d38cd2 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -603,3 +603,9 @@ test_that("backendRequiredSpectraVariables,MsBackendMzR works", { expect_equal(backendRequiredSpectraVariables(tmp), c("dataStorage", "scanIndex")) }) + +test_that("precursorMz<-,MsbackendMzR works", { + a <- sciex_mzr[1:3] + precursorMz(a) <- c(12.2, 1.2, 1.4) + expect_equal(precursorMz(a), c(12.2, 1.2, 1.4)) +}) diff --git a/tests/testthat/test_Spectra.R b/tests/testthat/test_Spectra.R index 43638a4d..4cc721d9 100644 --- a/tests/testthat/test_Spectra.R +++ b/tests/testthat/test_Spectra.R @@ -1986,3 +1986,9 @@ test_that("estimatePrecursorIntensity works", { res_both <- estimatePrecursorIntensity(both) expect_equal(res_second, res_both[510:length(res_both)]) }) + +test_that("precursorMz<-,Spectra works", { + a <- sps_dda[1:3] + precursorMz(a) <- c(12.3, 1.1, 34.3) + expect_equal(precursorMz(a), c(12.3, 1.1, 34.3)) +}) diff --git a/vignettes/MsBackend.Rmd b/vignettes/MsBackend.Rmd index 5192084e..c74f82e8 100644 --- a/vignettes/MsBackend.Rmd +++ b/vignettes/MsBackend.Rmd @@ -1677,6 +1677,21 @@ This method 
thus retrieves first the MS levels of all spectra and then calls operation by selecting the unique MS levels directly using an SQL call. +### `precursorMz<-` + +Replace the values for the *precursor m/z* spectra +variable. Parameter `value` has to be of type `numeric` (`NA_real_` missing +values are supported, e.g. for MS1 spectra). The default implementation uses the +`$<-` method: + +```{r} +setReplaceMethod("precursorMz", "MsBackend", function(object, ..., value) { + object$precursorMz <- value + object +}) +``` + + ### `ionCount()` The `ionCount()` method should return a `numeric` (length equal to the number of From 967402eff1cbf73f4b33f89915a801a466fe64f3 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Fri, 25 Oct 2024 09:08:59 +0200 Subject: [PATCH 36/41] Add reference to code of conduct --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 78d7efb9..3df3e6d7 100644 --- a/README.md +++ b/README.md @@ -118,4 +118,6 @@ BiocManager::install("Spectra") Contributions are highly welcome and should follow the [contribution guidelines](https://rformassspectrometry.github.io/RforMassSpectrometry/articles/RforMassSpectrometry.html#contributions). Also, please check the coding style guidelines in the [RforMassSpectrometry -vignette](https://rformassspectrometry.github.io/RforMassSpectrometry/articles/RforMassSpectrometry.html). +vignette](https://rformassspectrometry.github.io/RforMassSpectrometry/articles/RforMassSpectrometry.html) +and importantly, follow our [code of +conduct](https://rformassspectrometry.github.io/RforMassSpectrometry/articles/RforMassSpectrometry.html#code-of-conduct). From 8baf07c7acbf44a943f5f2b7dd42d0ff90bfeebb Mon Sep 17 00:00:00 2001 From: J Wokaty Date: Tue, 29 Oct 2024 10:49:53 -0400 Subject: [PATCH 37/41] bump x.y.z version to even y prior to creation of RELEASE_3_20 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7727f9b8..45f21460 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.13 +Version: 1.16.0 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different From 803c9d2fa3858749404d6be71e97a844ed2a7959 Mon Sep 17 00:00:00 2001 From: J Wokaty Date: Tue, 29 Oct 2024 10:49:53 -0400 Subject: [PATCH 38/41] bump x.y.z version to odd y following creation of RELEASE_3_20 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 45f21460..df891150 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.16.0 +Version: 1.17.0 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different From 6eeeb79f2e9a1b72e790adfdaa424f02d1ee9d82 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 20 Nov 2024 09:16:32 +0100 Subject: [PATCH 39/41] refactor: support chunk-wise processing in containsMz - Add support for chunk-wise processing to `containsMz()`. Related to issue #340. 
--- .editorconfig | 4 +-- .github/workflows/check-bioc.yml | 2 +- DESCRIPTION | 2 +- NEWS.md | 6 ++++ R/Spectra-functions.R | 46 ++++++------------------- R/Spectra.R | 41 ++++++++++------------ R/peaks-functions.R | 10 ++++++ tests/testthat/test_Spectra-functions.R | 26 -------------- tests/testthat/test_peaks-functions.R | 13 +++++++ 9 files changed, 62 insertions(+), 88 deletions(-) diff --git a/.editorconfig b/.editorconfig index 71842659..0cebcc70 100644 --- a/.editorconfig +++ b/.editorconfig @@ -6,7 +6,7 @@ root = true charset = utf-8 end_of_line = lf trim_trailing_whitespace = true -insert_final_newline = false +insert_final_newline = true [*.R] indent_style = space @@ -22,4 +22,4 @@ indent_style = tab [*.yml] indent_style = space -indent_size = 2 +indent_size = 2 \ No newline at end of file diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 85b29ffa..48f4ea23 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -53,7 +53,7 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: '4.4', bioc: '3.20', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: ubuntu-latest, r: 'devel', bioc: 'devel', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - { os: macOS-latest, r: '4.4', bioc: '3.20'} - { os: windows-latest, r: '4.4', bioc: '3.20'} env: diff --git a/DESCRIPTION b/DESCRIPTION index df891150..2057828e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.17.0 +Version: 1.17.1 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. It provides different diff --git a/NEWS.md b/NEWS.md index b3d0404d..f498b62d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# Spectra 1.17 + +## Changes in 1.17.1 + +- Refactor `containsMz()` to support chunk-wise processing. + # Spectra 1.15 ## Changes in 1.15.13 diff --git a/R/Spectra-functions.R b/R/Spectra-functions.R index 99014163..93d9f2db 100644 --- a/R/Spectra-functions.R +++ b/R/Spectra-functions.R @@ -63,7 +63,13 @@ NULL #' @description #' #' This function applies the processing queue and an arbitrary function to -#' the peaks matrix of each spectrum of the `Spectra` object `object`. +#' the peaks matrix of each spectrum of the `Spectra` object `object`. It has +#' built-in parallel and/or chunk-wise processing enabled through parameter +#' `f`, which allows defining how the `Spectra` (or rather its backend) needs +#' to be split. The default `f = .parallel_processing_factor(object)` splits +#' the backend by chunk (if a finite chunk size is defined for the `Spectra`) +#' or by its optimal parallel processing factor. See the description of +#' the `.parallel_processing_factor()` function below for more information. #' #' @param object `Spectra` object. #' @@ -78,7 +84,8 @@ NULL #' #' @param f `factor` or `vector` that can be coerced to one defining how the #' data should be split for parallel processing. Set to `NULL` or -#' `factor()` to disable splitting and parallel processing. +#' `factor()` to disable splitting and parallel processing. See the function +#' description above for details. #' #' @param columns `character` defining the columns that should be returned.
#' This will be passed to the backend's `peaksData` function. @@ -571,39 +578,8 @@ combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage, #' @description #' -#' Internal function to check if any (or all) of the provided `mz` values are -#' in the spectras' m/z. -#' -#' @param x `Spectra` object -#' -#' @param mz `numeric` of m/z value(s) to check in each spectrum of `x`. -#' -#' @param tolarance `numeric(1)` with the tolerance. -#' -#' @param ppm `numeric(1)` with the ppm. -#' -#' @param condFun `function` such as `any` or `all`. -#' -#' @param parallel `BiocParallel` parameter object. -#' -#' @return `logical` same length than `x`. -#' -#' @author Johannes Rainer -#' -#' @importFrom MsCoreUtils common -#' -#' @noRd -.has_mz <- function(x, mz = numeric(), tolerance = 0, ppm = 20, condFun = any, - parallel = SerialParam()) { - mzs <- mz(x, BPPARAM = parallel) - vapply(mzs, FUN = function(z) - condFun(common(mz, z, tolerance = tolerance, ppm = ppm)), logical(1)) -} - -#' @description -#' -#' Same as `.has_mz` only that a different `mz` is used for each spectrum in -#' `x`. Length of `mz` is thus expected to be equal to length of `x`. +#' Check for presence of an m/z value in each spectrum. Each spectrum gets +#' its own m/z. #' #' @param mz `numeric` **same length as `x`**. #' diff --git a/R/Spectra.R b/R/Spectra.R index 14ebbf2c..73520422 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -3278,23 +3278,18 @@ setMethod("containsMz", "Spectra", function(object, mz = numeric(), tolerance = 0, ppm = 20, which = c("any", "all"), BPPARAM = bpparam()) { - cond_fun <- match.fun(match.arg(which)) - if (all(is.na(mz))) - return(rep(NA, length(object))) - mz <- unique(sort(mz)) - BPPARAM <- backendBpparam(object@backend, BPPARAM) - ## TODO: fix to use .peaksapply instead. - if (is(BPPARAM, "SerialParam")) - .has_mz(object, mz, tolerance = tolerance, ppm = ppm, - condFun = cond_fun, parallel = BPPARAM) - else { - sp <- SerialParam() - f <- as.factor(dataStorage(object)) - res <- .lapply(object, FUN = .has_mz, mz = mz, tolerance = tolerance, - condFun = cond_fun, parallel = sp, f = f, - BPPARAM = BPPARAM) - unsplit(res, f = f) - } + if (length(object)) { + cond_fun <- match.fun(match.arg(which)) + if (all(is.na(mz))) + return(rep(NA, length(object))) + mz <- unique(sort(mz)) + BPPARAM <- backendBpparam(object@backend, BPPARAM) + unlist(.peaksapply( + object, FUN = .peaks_contain_mz, mz = mz, tolerance = tolerance, + ppm = ppm, condFun = cond_fun, BPPARAM = BPPARAM), + use.names = FALSE + ) + } else logical() }) #' @rdname addProcessing @@ -3327,12 +3322,12 @@ setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, #' @export setMethod("entropy", "Spectra", function(object, normalized = TRUE) { if (length(object)) { - if (normalized) entropy_fun <- nentropy - else entropy_fun <- entropy - unlist(.peaksapply( - object, FUN = function(pks, ...) entropy_fun(pks[, "intensity"])), - use.names = FALSE - ) + if (normalized) entropy_fun <- nentropy + else entropy_fun <- entropy + unlist(.peaksapply( + object, FUN = function(pks, ...) entropy_fun(pks[, "intensity"])), + use.names = FALSE + ) } else numeric() }) #' @rdname addProcessing diff --git a/R/peaks-functions.R b/R/peaks-functions.R index f34adde9..dc19e353 100644 --- a/R/peaks-functions.R +++ b/R/peaks-functions.R @@ -737,3 +737,13 @@ joinPeaksNone <- function(x, y, ...) { if (keep) x[sel, , drop = FALSE] else x[!sel, , drop = FALSE] } + +#' Check for presence of peaks defined by their m/z value. 
Note that this +#' function does **not** return a peak matrix, but only a logical of length 1! +#' +#' @return `logical(1)` +#' @noRd +.peaks_contain_mz <- function(x, mz = numeric(), tolerance = 0, ppm = 20, + condFun = any, ...) { + condFun(common(mz, x[, "mz"], tolerance = tolerance, ppm = ppm)) +} diff --git a/tests/testthat/test_Spectra-functions.R b/tests/testthat/test_Spectra-functions.R index 8df50d71..2dbcf372 100644 --- a/tests/testthat/test_Spectra-functions.R +++ b/tests/testthat/test_Spectra-functions.R @@ -352,32 +352,6 @@ test_that("dropNaSpectraVariables works", { function(z) !any(is.na(z))))) }) -test_that(".has_mz works", { - sps <- Spectra(sciex_mzr)[1:10] - sps <- setBackend(sps, MsBackendDataFrame()) - mzs <- mz(sps) - x <- c(mzs[[2]][5], mzs[[3]][8]) - - res <- .has_mz(sps, mz = x, ppm = 0) - expect_true(length(res) == length(sps)) - expect_true(is.logical(res)) - - spd <- DataFrame(msLevel = c(2L, 2L, 2L), rtime = c(1, 2, 3)) - spd$mz <- list(c(12, 14, 45, 56), c(14.1, 34, 56.1), c(12.1, 14.15, 34.1)) - spd$intensity <- list(c(10, 20, 30, 40), c(11, 21, 31), c(12, 22, 32)) - sps <- Spectra(spd) - - res <- .has_mz(sps, mz = c(14, 34)) - expect_equal(res, c(TRUE, TRUE, FALSE)) - res <- .has_mz(sps, mz = c(14, 34), tolerance = 0.15) - expect_equal(res, c(TRUE, TRUE, TRUE)) - - res <- .has_mz(sps, mz = c(14, 34), condFun = all) - expect_true(all(!res)) - res <- .has_mz(sps, mz = c(14, 34), condFun = all, tolerance = 0.15) - expect_equal(res, c(FALSE, TRUE, TRUE)) -}) - test_that(".has_mz_each works", { spd <- DataFrame(msLevel = c(2L, 2L, 2L), rtime = c(1, 2, 3)) spd$mz <- list(c(12, 14, 45, 56), c(14.1, 34, 56.1), c(12.1, 14.15, 34.1)) diff --git a/tests/testthat/test_peaks-functions.R b/tests/testthat/test_peaks-functions.R index f28452dd..ef0978c8 100644 --- a/tests/testthat/test_peaks-functions.R +++ b/tests/testthat/test_peaks-functions.R @@ -722,3 +722,16 @@ test_that(".peaks_filter_ranges works", { ranges = ranges, keep = FALSE) expect_equal(res, x) }) + +test_that(".peaks_contain_mz works", { + pks <- cbind(mz = c(1.3, 1.5, 32.1, 45.6), c(1, 2, 3, 4)) + + expect_false(.peaks_contain_mz(pks)) + expect_true(.peaks_contain_mz(pks, 1.5)) + expect_false(.peaks_contain_mz(pks, c(1.5, 32.2), condFun = all)) + expect_true(.peaks_contain_mz(pks, c(1.5, 32.2), condFun = any)) + expect_true(.peaks_contain_mz(pks, c(1.5, 32.2), condFun = any, + tolerance = 0.1)) + expect_true(.peaks_contain_mz(pks, c(1.5, 32.2), condFun = all, + tolerance = 0.1)) +}) From 6a1367c38ae3804da4aad4b9bc017ae341dc3f30 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 20 Nov 2024 09:57:43 +0100 Subject: [PATCH 40/41] tests: update vdiffr svg --- .../_snaps/plotMzDelta/plotmzdelta-1000.svg | 823 ++++++------ 1 file changed, 249 insertions(+), 574 deletions(-) diff --git a/tests/testthat/_snaps/plotMzDelta/plotmzdelta-1000.svg b/tests/testthat/_snaps/plotMzDelta/plotmzdelta-1000.svg index e16506da..e041fc61 100644 --- a/tests/testthat/_snaps/plotMzDelta/plotmzdelta-1000.svg +++ b/tests/testthat/_snaps/plotMzDelta/plotmzdelta-1000.svg @@ -1,579 +1,254 @@
[SVG markup omitted: the updated vdiffr snapshot replaces 579 lines with 254 and renders a histogram titled "Histogram of Mass Delta Distributions", with "M/Z delta" on the x-axis, "Frequency" on the y-axis, and peak annotations for peg, A, R, N, D, C, E, Q/K, G, H, I/L, M, F, P, S, T, W, Y and V.]
From d52f57756a92a24f1352986a0461e0e6e33da5e1 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 20 Nov 2024 11:38:03 +0100 Subject: [PATCH 41/41] tests: add CODECOV token --- .github/workflows/check-bioc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 48f4ea23..5c1ebc66 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -239,7 +239,7 @@ jobs: - name: Test coverage if: github.ref == 'refs/heads/main' && env.run_covr == 'true' && runner.os == 'Linux' run: | - covr::codecov() + covr::codecov(token = "${{ secrets.CODECOV_TOKEN }}") shell: Rscript {0} - name: Install package