From 6cd260fd6d0277430ecedde7f4aa5fe2bf09c702 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 25 Sep 2024 08:25:29 +0200 Subject: [PATCH] docs: restructure documentation --- DESCRIPTION | 4 +- R/Spectra.R | 4 + man/addProcessing.Rd | 547 +++++++++++++++++++++++++++++++++ man/combinePeaks.Rd | 110 +++++++ man/combineSpectra.Rd | 240 +++++++++++++++ man/compareSpectra.Rd | 131 ++++++++ man/filterMsLevel.Rd | 689 ++++++++++++++++++++++++++++++++++++++++++ man/spectraData.Rd | 598 ++++++++++++++++++++++++++++++++++++ 8 files changed, 2321 insertions(+), 2 deletions(-) create mode 100644 man/addProcessing.Rd create mode 100644 man/combinePeaks.Rd create mode 100644 man/combineSpectra.Rd create mode 100644 man/compareSpectra.Rd create mode 100644 man/filterMsLevel.Rd create mode 100644 man/spectraData.Rd diff --git a/DESCRIPTION b/DESCRIPTION index a04e4ac3..0270d5db 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,9 +40,9 @@ Authors@R: c(person(given = "RforMassSpectrometry Package Maintainer", Depends: R (>= 4.0.0), S4Vectors, - BiocParallel, - ProtGenerics (>= 1.37.1) + BiocParallel Imports: + ProtGenerics (>= 1.37.1), methods, IRanges, MsCoreUtils (>= 1.7.5), diff --git a/R/Spectra.R b/R/Spectra.R index 179ee58c..045cf88a 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -1053,6 +1053,8 @@ asDataFrame <- function(object, i = seq_along(object), } #' @rdname spectraData +#' +#' @export setMethod("acquisitionNum", "Spectra", function(object) acquisitionNum(object@backend)) @@ -1195,6 +1197,8 @@ setMethod("mz", "Spectra", function(object, f = processingChunkFactor(object), }) #' @rdname spectraData +#' +#' @export setMethod( "peaksData", "Spectra", function(object, columns = c("mz", "intensity"), diff --git a/man/addProcessing.Rd b/man/addProcessing.Rd new file mode 100644 index 00000000..787aeabe --- /dev/null +++ b/man/addProcessing.Rd @@ -0,0 +1,547 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{applyProcessing} +\alias{applyProcessing} +\alias{processingLog} +\alias{scalePeaks} +\alias{addProcessing} +\alias{bin} +\alias{containsMz} +\alias{containsNeutralLoss} +\alias{entropy} +\alias{pickPeaks} +\alias{replaceIntensitiesBelow} +\alias{reset} +\alias{smooth} +\alias{spectrapply} +\alias{addProcessing,Spectra-method} +\alias{bin,Spectra-method} +\alias{containsMz,Spectra-method} +\alias{containsNeutralLoss,Spectra-method} +\alias{entropy,Spectra-method} +\alias{entropy,ANY-method} +\alias{pickPeaks,Spectra-method} +\alias{replaceIntensitiesBelow,Spectra-method} +\alias{reset,Spectra-method} +\alias{smooth,Spectra-method} +\alias{spectrapply,Spectra-method} +\title{Data manipulation and analysis methods} +\usage{ +applyProcessing( + object, + f = processingChunkFactor(object), + BPPARAM = bpparam(), + ... +) + +processingLog(x) + +scalePeaks(x, by = sum, msLevel. = uniqueMsLevels(x)) + +\S4method{addProcessing}{Spectra}(object, FUN, ..., spectraVariables = character()) + +\S4method{bin}{Spectra}( + x, + binSize = 1L, + breaks = NULL, + msLevel. = uniqueMsLevels(x), + FUN = sum, + zero.rm = TRUE +) + +\S4method{containsMz}{Spectra}( + object, + mz = numeric(), + tolerance = 0, + ppm = 20, + which = c("any", "all"), + BPPARAM = bpparam() +) + +\S4method{containsNeutralLoss}{Spectra}( + object, + neutralLoss = 0, + tolerance = 0, + ppm = 20, + BPPARAM = bpparam() +) + +\S4method{entropy}{Spectra}(object, normalized = TRUE) + +\S4method{entropy}{ANY}(object, ...) 
+ +\S4method{pickPeaks}{Spectra}( + object, + halfWindowSize = 2L, + method = c("MAD", "SuperSmoother"), + snr = 0, + k = 0L, + descending = FALSE, + threshold = 0, + msLevel. = uniqueMsLevels(object), + ... +) + +\S4method{replaceIntensitiesBelow}{Spectra}( + object, + threshold = min, + value = 0, + msLevel. = uniqueMsLevels(object) +) + +\S4method{reset}{Spectra}(object, ...) + +\S4method{smooth}{Spectra}( + x, + halfWindowSize = 2L, + method = c("MovingAverage", "WeightedMovingAverage", "SavitzkyGolay"), + msLevel. = uniqueMsLevels(x), + ... +) + +\S4method{spectrapply}{Spectra}( + object, + FUN, + ..., + chunkSize = integer(), + f = factor(), + BPPARAM = SerialParam() +) +} +\arguments{ +\item{object}{A \code{Spectra} object.} + +\item{f}{For \code{spectrapply()} and \code{applyProcessing()}: \code{factor} defining +how \code{object} should be splitted for eventual parallel processing. +Defaults to \code{factor()} for \code{spectrapply()} hence the object is not +splitted while it defaults to \code{f = processingChunkSize(object)} for +\code{applyProcessing()} splitting thus the object by default into chunks +depending on \code{\link[=processingChunkSize]{processingChunkSize()}}.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method +of the \linkS4class{MsBackend}. See also \code{\link[=processingChunkSize]{processingChunkSize()}} for +additional information on parallel processing.} + +\item{...}{Additional arguments passed to internal and downstream functions.} + +\item{x}{A \code{Spectra}.} + +\item{by}{For \code{scalePeaks()}: function to calculate a single \code{numeric} from +intensity values of a spectrum by which all intensities (of +that spectrum) should be divided by. The default \code{by = sum} will +divide intensities of each spectrum by the sum of intensities of that +spectrum.} + +\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which +the function should be applied (defaults to all MS levels of \code{object}.} + +\item{FUN}{For \code{addProcessing()}: function to be applied to the peak matrix +of each spectrum in \code{object}. +For \code{bin()}: function to aggregate intensity values of peaks falling +into the same bin. Defaults to \code{FUN = sum} thus summing up intensities. +For \code{spectrapply()} and \code{chunkapply()}: function to be applied to +each individual or each chunk of \code{Spectra}.} + +\item{spectraVariables}{For \code{addProcessing()}: \code{character} with additional +spectra variables that should be passed along to the function defined +with \code{FUN}. See function description for details.} + +\item{binSize}{For \code{bin()}: \code{numeric(1)} defining the size for the m/z bins. +Defaults to \code{binSize = 1}.} + +\item{breaks}{For \code{bin()}: \code{numeric} defining the m/z breakpoints between +bins.} + +\item{zero.rm}{For \code{bin()}: \code{logical(1)} indicating whether to remove bins +with zero intensity. 
Defaults to \code{TRUE}, meaning the function will
+discard bins created with an intensity of 0 to enhance memory
+efficiency.}
+
+\item{mz}{For \code{containsMz()}: \code{numeric} with the m/z value(s) of the mass
+peaks to check.}
+
+\item{tolerance}{For \code{containsMz()} and \code{neutralLoss()}:
+\code{numeric(1)} allowing to define a constant maximal accepted difference
+between m/z values for peaks to be matched.}
+
+\item{ppm}{For \code{containsMz()} and \code{neutralLoss()}: \code{numeric(1)} defining a
+relative, m/z-dependent, maximal accepted difference between m/z values
+for peaks to be matched.}
+
+\item{which}{For \code{containsMz()}: either \code{"any"} or \code{"all"} defining whether
+any (the default) or all provided \code{mz} have to be present in the
+spectrum.}
+
+\item{neutralLoss}{for \code{containsNeutralLoss()}: \code{numeric(1)} defining the
+value which should be subtracted from the spectrum's precursor m/z.}
+
+\item{normalized}{for \code{entropy()}: \code{logical(1)} whether the normalized
+entropy should be calculated (default). See also \code{\link[=nentropy]{nentropy()}} for
+details.}
+
+\item{halfWindowSize}{For \code{pickPeaks()}: \code{integer(1)}, used in the
+identification of the mass peaks: a local maximum has to be the
+maximum in the window from \code{(i - halfWindowSize):(i + halfWindowSize)}.
+For \code{smooth()}: \code{integer(1)}, used in the smoothing algorithm, the
+window reaches from \code{(i - halfWindowSize):(i + halfWindowSize)}.}
+
+\item{method}{For \code{pickPeaks()}: \code{character(1)}, the noise estimators that
+should be used, currently the \emph{M}edian \emph{A}bsolute \emph{D}eviation
+(\code{method = "MAD"}) and Friedman's Super Smoother
+(\code{method = "SuperSmoother"}) are supported.
+For \code{smooth()}: \code{character(1)}, the smoothing function that should be
+used; currently, the Moving-Average- (\code{method = "MovingAverage"}),
+Weighted-Moving-Average- (\code{method = "WeightedMovingAverage"}) and
+Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported.}
+
+\item{snr}{For \code{pickPeaks()}: \code{double(1)} defining the
+\emph{S}ignal-to-\emph{N}oise-\emph{R}atio. The intensity of a local maximum has to be
+higher than \code{snr * noise} to be considered as a peak.}
+
+\item{k}{For \code{pickPeaks()}: \code{integer(1)}, number of values left and right of
+the peak that should be considered in the weighted mean calculation.}
+
+\item{descending}{For \code{pickPeaks()}: \code{logical}, if \code{TRUE} only values
+between the nearest valleys around the peak centroids are used.}
+
+\item{threshold}{For \code{pickPeaks()}: a \code{numeric(1)} defining the proportion
+of the maximal peak intensity. Only values above the threshold are
+used for the weighted mean calculation.
+For \code{replaceIntensitiesBelow()}: a \code{numeric(1)} defining the threshold
+or a \code{function} to calculate the threshold for each spectrum on its
+intensity values. Defaults to \code{threshold = min}.}
+
+\item{value}{For \code{replaceIntensitiesBelow()}: \code{numeric(1)} defining the
+value with which intensities should be replaced.}
+
+\item{chunkSize}{For \code{spectrapply()}: size of the chunks into which the
+\code{Spectra} should be split. This parameter overrides parameters
+\code{f} and \code{BPPARAM}.}
+}
+\value{
+See the documentation of the individual functions for a description of the
+return value.
+}
+\description{
+Various data analysis functions are available for \code{Spectra} objects.
These
+can be categorized into functions that either return a \code{Spectra} object
+(with the manipulated data) or functions that directly return the
+result from the calculation. For the former category, the data manipulations
+are cached in the result object's \emph{processing queue} and only executed
+on-the-fly when the respective data gets extracted from the \code{Spectra} (see
+section \emph{The processing queue} for more information).
+
+For the second category, the calculations are directly executed and the
+result, usually one value per spectrum, returned. Generally, to reduce
+memory demand, a chunk-wise processing of the data is performed.
+}
+\section{Data analysis methods returning a \code{Spectra}}{
+
+
+The methods listed here return a \code{Spectra} object as a result.
+\itemize{
+\item \code{addProcessing()}: adds an arbitrary function that should be applied to the
+peaks matrix of every spectrum in \code{object}. The function (can be passed
+with parameter \code{FUN}) is expected to take a peaks matrix as input and to
+return a peaks matrix. A peaks matrix is a numeric matrix with two columns,
+the first containing the m/z values of the peaks and the second the
+corresponding intensities. The function has to have \code{...} in its
+definition. Additional arguments can be passed with \code{...}. With parameter
+\code{spectraVariables} it is possible to define additional spectra variables
+from \code{object} that should be passed to the function \code{FUN}. These will be
+passed by their name (e.g. specifying \code{spectraVariables = "precursorMz"}
+will pass the spectra's precursor m/z as a parameter named \code{precursorMz}
+to the function). The only exception is the spectra's MS level, which will
+be passed to the function as a parameter called \code{spectrumMsLevel} (i.e.
+with \code{spectraVariables = "msLevel"} the MS levels of each spectrum will be
+submitted to the function as a parameter called \code{spectrumMsLevel}).
+Examples are provided in the package vignette.
+\item \code{bin()}: aggregates individual spectra into discrete (m/z) bins. Binning is
+performed only on spectra of the specified MS level(s) (parameter
+\code{msLevel}, by default all MS levels of \code{x}). The bins can be defined with
+parameter \code{breaks} which by default are equally sized bins, with size
+being defined by parameter \code{binSize}, from the minimal to the maximal m/z
+of all spectra (of MS level \code{msLevel}) within \code{x}. The same bins are used
+for all spectra in \code{x}. All intensity values for peaks falling into the
+same bin are aggregated using the function provided with parameter \code{FUN}
+(defaults to \code{FUN = sum}, i.e. all intensities are summed up). Note that
+the binning operation is applied to the peak data on-the-fly upon data
+access and it is possible to \emph{revert} the operation with the \code{reset()}
+function (see description of \code{reset()} below).
+\item \code{countIdentifications()}: counts the number of identifications each scan has
+led to. See \code{\link[=countIdentifications]{countIdentifications()}} for more details.
+\item \code{pickPeaks()}: picks peaks on individual spectra using a moving
+window-based approach (window size = \code{2 * halfWindowSize}).
For noisy +spectra there are currently two different noise estimators available, +the \emph{M}edian \emph{A}bsolute \emph{D}eviation (\code{method = "MAD"}) and +Friedman's Super Smoother (\code{method = "SuperSmoother"}), +as implemented in the \code{\link[MsCoreUtils:noise]{MsCoreUtils::noise()}}. +The method supports also to optionally \emph{refine} the m/z value of +the identified centroids by considering data points that belong (most +likely) to the same mass peak. Therefore the m/z value is calculated as an +intensity weighted average of the m/z values within the peak region. +The peak region is defined as the m/z values (and their respective +intensities) of the \code{2 * k} closest signals to the centroid or the closest +valleys (\code{descending = TRUE}) in the \code{2 * k} region. For the latter the \code{k} +has to be chosen general larger. See \code{\link[MsCoreUtils:refineCentroids]{MsCoreUtils::refineCentroids()}} for +details. +If the ratio of the signal to the highest intensity of the peak is below +\code{threshold} it will be ignored for the weighted average. +\item \code{replaceIntensitiesBelow()}: replaces intensities below a specified +threshold with the provided \code{value}. Parameter \code{threshold} can be either +a single numeric value or a function which is applied to all non-\code{NA} +intensities of each spectrum to determine a threshold value for each +spectrum. The default is \code{threshold = min} which replaces all values +which are <= the minimum intensity in a spectrum with \code{value} (the +default for \code{value} is \code{0}). Note that the function specified with +\code{threshold} is expected to have a parameter \code{na.rm} since \code{na.rm = TRUE} +will be passed to the function. If the spectrum is in profile mode, +ranges of successive non-0 peaks <= \code{threshold} are set to 0. +Parameter \code{msLevel.} allows to apply this to only spectra of certain MS +level(s). +\item \code{scalePeaks()}: scales intensities of peaks within each spectrum depending +on parameter \code{by}. With \code{by = sum} (the default) peak intensities are +divided by the sum of peak intensities within each spectrum. The sum of +intensities is thus 1 for each spectrum after scaling. Parameter +\code{msLevel.} allows to apply the scaling of spectra of a certain MS level. +By default (\code{msLevel. = uniqueMsLevels(x)}) intensities for all +spectra will be scaled. +\item \code{smooth()}: smooths individual spectra using a moving window-based approach +(window size = \code{2 * halfWindowSize}). Currently, the +Moving-Average- (\code{method = "MovingAverage"}), +Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")}, +weights depending on the distance of the center and calculated +\code{1/2^(-halfWindowSize:halfWindowSize)}) and +Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported. +For details how to choose the correct \code{halfWindowSize} please see +\code{\link[MsCoreUtils:smooth]{MsCoreUtils::smooth()}}. +} +} + +\section{Data analysis methods returning the result from the calculation}{ + + +The functions listed in this section return immediately the result from the +calculation. To reduce memory demand (and allow parallel processing) the +calculations a chunk-wise processing is generally performed. +\itemize{ +\item \code{chunkapply()}: apply an arbitrary function to chunks of spectra. See +\code{\link[=chunkapply]{chunkapply()}} for details and examples. 
+\item \code{containsMz()}: checks for each of the spectra whether they contain mass +peaks with an m/z equal to \code{mz} (given acceptable difference as defined by +parameters \code{tolerance} and \code{ppm} - see \code{\link[=common]{common()}} for details). Parameter +\code{which} allows to define whether any (\code{which = "any"}, the default) or +all (\code{which = "all"}) of the \code{mz} have to match. The function returns +\code{NA} if \code{mz} is of length 0 or is \code{NA}. +\item \code{containsNeutralLoss()}: checks for each spectrum in \code{object} if it has a +peak with an m/z value equal to its precursor m/z - \code{neutralLoss} (given +acceptable difference as defined by parameters \code{tolerance} and \code{ppm}). +Returns \code{NA} for MS1 spectra (or spectra without a precursor m/z). +\item \code{entropy()}: calculates the entropy of each spectra based on the metrics +suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). +See also \code{\link[=nentropy]{nentropy()}} in the \emph{MsCoreUtils} package for details. +\item \code{estimatePrecursorIntensity()}: defines the precursor intensities for MS2 +spectra using the intensity of the matching MS1 peak from the +closest MS1 spectrum (i.e. the last MS1 spectrum measured before the +respective MS2 spectrum). With \code{method = "interpolation"} it is also +possible to calculate the precursor intensity based on an interpolation of +intensity values (and retention times) of the matching MS1 peaks from the +previous and next MS1 spectrum. See \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}} for +examples and more details. +\item \code{estimatePrecursorMz()}: \strong{for DDA data}: allows to estimate a fragment +spectra's precursor m/z based on the reported precursor m/z and the data +from the previous MS1 spectrum. See \code{\link[=estimatePrecursorMz]{estimatePrecursorMz()}} for details. +\item \code{neutralLoss()}: calculates neutral loss spectra for fragment spectra. See +\code{\link[=neutralLoss]{neutralLoss()}} for detailed documentation. +\item \code{spectrapply()}: applies a given function to each individual spectrum or +sets of a \code{Spectra} object. By default, the \code{Spectra} is split into +individual spectra (i.e. \code{Spectra} of length 1) and the function \code{FUN} +is applied to each of them. An alternative splitting can be defined with +parameter \code{f}. Parameters for \code{FUN} can be passed using \code{...}. +The returned result and its order depend on the function \code{FUN} and how +\code{object} is split (hence on \code{f}, if provided). Parallel processing is +supported and can be configured with parameter \code{BPPARAM}, is however only +suggested for computational intense \code{FUN}. +As an alternative to the (eventual parallel) processing of the full +\code{Spectra}, \code{spectrapply()} supports also a chunk-wise processing. For this, +parameter \code{chunkSize} needs to be specified. \code{object} is then split into +chunks of size \code{chunkSize} which are then (stepwise) processed by \code{FUN}. +This guarantees a lower memory demand (especially for on-disk backends) +since only the data for one chunk needs to be loaded into memory in each +iteration. Note that by specifying \code{chunkSize}, parameters \code{f} and +\code{BPPARAM} will be ignored. +See also \code{chunkapply()} above or examples below for details on chunk-wise +processing. +} +} + +\section{The processing queue}{ + + +Operations that modify mass peak data, i.e. 
the m/z and intensity values of +a \code{Spectra} are generally not applied immediately to the data but are +\emph{cached} within the object's \emph{processing queue}. These operations are then +applied to the data only upon request, for example when m/z and/or +intensity values are extracted. This lazy execution guarantees that the +same functionality can be applied to any \code{Spectra} object, regardless of +the type of backend that is used. Thus, data manipulation operations can +also be applied to data that is \emph{read only}. As a side effect, this enables +also to \emph{undo} operations using the \code{reset()} function. + +Functions related to the processing queue are: +\itemize{ +\item \code{applyProcessing()}: for \code{Spectra} objects that use a \strong{writeable} backend +only: apply all steps from the lazy processing queue to the peak data and +write it back to the data storage. Parameter \code{f} allows to specify how +\code{object} should be split for parallel processing. This should either be +equal to the \code{dataStorage}, or \code{f = rep(1, length(object))} to disable +parallel processing alltogether. Other partitionings might result in +errors (especially if a \code{MsBackendHdf5Peaks} backend is used). +\item \code{processingLog()}: returns a \code{character} vector with the processing log +messages. +\item \code{reset()}: restores the data to its original state (as much as possible): +removes any processing steps from the lazy processing queue and calls +\code{reset()} on the backend which, depending on the backend, can also undo +e.g. data filtering operations. Note that a \verb{reset*(} call after +\code{applyProcessing()} will not have any effect. See examples below for more +information. +} +} + +\examples{ + +## Load a `Spectra` object with LC-MS/MS data. +fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", + package = "msdata") +sps_dda <- Spectra(fl) +sps_dda + + +## -------- FUNCTIONS RETURNING A SPECTRA -------- + +## Replace peak intensities below 40 with a value of 1 +sps_mod <- replaceIntensitiesBelow(sps_dda, threshold = 20, value = 1) +sps_mod + +## Get the intensities of the first spectrum before and after the +## operation +intensity(sps_dda[1]) +intensity(sps_mod[1]) + +## Remove all peaks with an intensity below 5. +sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf)) + +intensity(sps_mod) + +## In addition it is possible to pass a function to `filterIntensity()`: in +## the example below we want to keep only peaks that have an intensity which +## is larger than one third of the maximal peak intensity in that spectrum. +keep_peaks <- function(x, prop = 3) { + x > max(x, na.rm = TRUE) / prop +} +sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks) +intensity(sps_mod) + +## We can also change the proportion by simply passing the `prop` parameter +## to the function. To keep only peaks that have an intensity which is +## larger than half of the maximum intensity: +sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks, prop = 2) +intensity(sps_mod) + +## With the `scalePeaks()` function we can alternatively scale the +## intensities of mass peaks per spectrum to relative intensities. This +## is specifically useful for fragment (MS2) spectra. We below thus +## scale the intensities per spectrum by the total sum of intensities +## (such that the sum of all intensities per spectrum is 1). +## Below we scale the intensities of all MS2 spectra in our data set. 
+sps_mod <- scalePeaks(sps_dda, msLevel = 2L) + +## MS1 spectra were not affected +sps_mod |> + filterMsLevel(1L) |> + intensity() + +## Intensities of MS2 spectra were scaled +sps_mod |> + filterMsLevel(2L) |> + intensity() + +## Since data manipulation operations are by default not directly applied to +## the data but only cached in the internal processing queue, it is also +## possible to remove these data manipulations with the `reset()` function: +tmp <- reset(sps_mod) +tmp +lengths(sps_dda) |> head() +lengths(sps_mod) |> head() +lengths(tmp) |> head() + +## Data manipulation operations cached in the processing queue can also be +## applied to the mass peaks data with the `applyProcessing()` function, if +## the `Spectra` uses a backend that supports that (i.e. allows replacing +## the mass peaks data). Below we first change the backend to a +## `MsBackendMemory()` and then use the `applyProcessing()` to modify the +## mass peaks data +sps_dda <- setBackend(sps_dda, MsBackendMemory()) +sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf)) +sps_mod <- applyProcessing(sps_mod) +sps_mod + +## While we can't *undo* this filtering operation now using the `reset()` +## function, accessing the data would now be faster, because the operation +## does no longer to be applied to the original data before returning to the +## user. + + +## -------- FUNCTIONS RETURNING THE RESULT -------- + +## With the `spectrapply()` function it is possible to apply an +## arbitrary function to each spectrum in a Spectra. +## In the example below we calculate the mean intensity for each spectrum +## in a subset of the sciex_im data. Note that we can access all variables +## of each individual spectrum either with the `$` operator or the +## corresponding method. +res <- spectrapply(sps_dda[1:20], FUN = function(x) mean(x$intensity[[1]])) +head(res) + +## As an alternative, applying a function `FUN` to a `Spectra` can be +## performed *chunk-wise*. The advantage of this is, that only the data for +## one chunk at a time needs to be loaded into memory reducing the memory +## demand. This type of processing can be performed by specifying the size +## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` +## parameter +spectrapply(sps_dda[1:20], lengths, chunkSize = 5L) + +## Precursor intensity estimation. Some manufacturers don't report the +## precursor intensity for MS2 spectra: +sps_dda |> + filterMsLevel(2L) |> + precursorIntensity() + +## This intensity can however be estimated from the previously measured +## MS1 scan with the `estimatePrecursorIntensity()` function: +pi <- estimatePrecursorIntensity(sps_dda) + +## This function returned the result as a `numeric` vector with one +## value per spectrum: +pi + +## We can replace the precursor intensity values of the originating +## object: +sps_dda$precursorIntensity <- pi +sps_dda |> + filterMsLevel(2L) |> + precursorIntensity() + +} +\seealso{ +\itemize{ +\item \code{\link[=compareSpectra]{compareSpectra()}} for calculation of spectra similarity scores. +\item \code{\link[=processingChunkSize]{processingChunkSize()}} for information on parallel and chunk-wise data +processing. +\item \link{Spectra} for a general description of the \code{Spectra} object. 
+} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf, Mar Garcia-Aloy +} diff --git a/man/combinePeaks.Rd b/man/combinePeaks.Rd new file mode 100644 index 00000000..a59b8f24 --- /dev/null +++ b/man/combinePeaks.Rd @@ -0,0 +1,110 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{combinePeaks} +\alias{combinePeaks} +\alias{combinePeaks,Spectra-method} +\title{Aggregating and combining mass peaks data} +\usage{ +\S4method{combinePeaks}{Spectra}( + object, + tolerance = 0, + ppm = 20, + intensityFun = base::mean, + mzFun = base::mean, + weighted = TRUE, + msLevel. = uniqueMsLevels(object), + ... +) +} +\arguments{ +\item{object}{A \code{Spectra} object.} + +\item{tolerance}{\code{numeric(1)} allowing to define a constant maximal +accepted difference between m/z values for peaks to be grouped. Default +is \code{tolerance = 0}.} + +\item{ppm}{\code{numeric(1)} defining a relative, m/z-dependent, maximal +accepted difference between m/z values for peaks to be grouped. Default +is \code{ppm = 20}.} + +\item{intensityFun}{Function to aggregate intensities for all peaks in +each peak group into a single intensity value.} + +\item{mzFun}{Function to aggregate m/z values for all mass peaks within +each peak group into a single m/z value. This parameter is ignored if +\code{weighted = TRUE} (the default).} + +\item{weighted}{\code{logical(1)} whether m/z values of peaks within each peak +group should be aggregated into a single m/z value using an +intensity-weighted mean. Defaults to \code{weighted = TRUE}.} + +\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which +the function should be applied (defaults to all MS levels of \code{object}.} + +\item{...}{ignored.} +} +\description{ +In addition to aggregating content of spectra variables (describe in +\code{\link[=combineSpectra]{combineSpectra()}}) it is also possible to aggregate and combine mass peaks +data from individual spectra within a \code{Spectra}. These \code{combinePeaks()} +function combines mass peaks \strong{within each spectrum} with a difference in +their m/z values that is smaller than the maximal acceptable difference +defined by \code{ppm} and \code{tolerance}. Parameters \code{intensityFun} and \code{mzFun} +allow to define functions to aggregate the intensity and m/z values for +each such group of peaks. With \code{weighted = TRUE} (the default), the m/z +value of the combined peak is calculated using an intensity-weighted mean +and parameter \code{mzFun} is ignored. The \code{\link[MsCoreUtils:group]{MsCoreUtils::group()}} function is +used for the grouping of mass peaks. Parameter \code{msLevel.} allows to define +selected MS levels for which peaks should be combined. This function +returns a \code{Spectra} with the same number of spectra than the input object, +but with possibly combined peaks within each spectrum. +Additional peak variables (other than \code{"mz"} and \code{"intensity"}) are +dropped (i.e. their values are replaced with \code{NA}) for combined peaks +unless they are constant across the combined peaks. See also +\code{\link[=reduceSpectra]{reduceSpectra()}} for a function to select a single \emph{representative} +mass peak for each peak group. +} +\examples{ + +## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +## backend. 
+sciex_file <- dir(system.file("sciex", package = "msdata"), + full.names = TRUE) +sciex <- Spectra(sciex_file, backend = MsBackendMzR()) + +## Combine mass peaks per spectrum with a difference in their m/z value +## that is smaller than 20 ppm. The intensity values of such peaks are +## combined by summing their values, while for the m/z values the median +## is reported +sciex_comb <- combinePeaks(sciex, ppm = 20, + intensityFun = sum, mzFun = median) + +## Comparing the number of mass peaks before and after aggregation +lengths(sciex) |> head() +lengths(sciex_comb) |> head() + +## Plotting the first spectrum before and after aggregation +par(mfrow = c(1, 2)) +plotSpectra(sciex[2L]) +plotSpectra(sciex_comb[2L]) + +## Using `reduceSpectra()` to keep for each group of mass peaks with a +## difference in their m/z values < 20ppm the one with the highest intensity. +sciex_red <- reduceSpectra(sciex, ppm = 20) + +## Comparing the number of mass peaks before and after the operation +lengths(sciex) |> head() +lengths(sciex_red) |> head() +} +\seealso{ +\itemize{ +\item \code{\link[=combineSpectra]{combineSpectra()}} for functions to combine or aggregate \code{Spectra}'s +spectra data. +\item \code{\link[=combinePeaksData]{combinePeaksData()}} for the function to combine the mass peaks data. +\item \code{\link[=reduceSpectra]{reduceSpectra()}} and similar functions to filter mass peaks data. +\item \link{Spectra} for a general description of the \code{Spectra} object. +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/combineSpectra.Rd b/man/combineSpectra.Rd new file mode 100644 index 00000000..d4f7bdb0 --- /dev/null +++ b/man/combineSpectra.Rd @@ -0,0 +1,240 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{concatenateSpectra} +\alias{concatenateSpectra} +\alias{combineSpectra} +\alias{joinSpectraData} +\alias{split} +\alias{c,Spectra-method} +\alias{split,Spectra,ANY-method} +\title{Merging, aggregating and splitting Spectra} +\usage{ +concatenateSpectra(x, ...) + +combineSpectra( + x, + f = x$dataStorage, + p = x$dataStorage, + FUN = combinePeaksData, + ..., + BPPARAM = bpparam() +) + +joinSpectraData(x, y, by.x = "spectrumId", by.y, suffix.y = ".y") + +\S4method{c}{Spectra}(x, ...) + +\S4method{split}{Spectra,ANY}(x, f, drop = FALSE, ...) +} +\arguments{ +\item{x}{A \code{Spectra} object.} + +\item{...}{Additional arguments.} + +\item{f}{For \code{split()}: factor defining how to split \code{x}. See \code{\link[base:split]{base::split()}} +for details. +For \code{combineSpectra()}: \code{factor} defining the grouping of the spectra +that should be combined. Defaults to \code{x$dataStorage}.} + +\item{p}{For \code{combineSpectra()}: \code{factor} defining how to split the input +\code{Spectra} for parallel processing. Defaults to \code{x$dataStorage}, i.e., +depending on the used backend, per-file parallel processing will be +performed.} + +\item{FUN}{For \code{combineSpectra()}: function to combine the (peak matrices) +of the spectra. Defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method +of the \linkS4class{MsBackend}.} + +\item{y}{A \code{DataFrame} with the spectra variables to join/add.} + +\item{by.x}{A \code{character(1)} specifying the spectra variable used +for merging. 
Default is \code{"spectrumId"}.}
+
+\item{by.y}{A \code{character(1)} specifying the column used for
+merging. Set to \code{by.x} if missing.}
+
+\item{suffix.y}{A \code{character(1)} specifying the suffix to be used
+for making the names of columns in the merged spectra variables
+unique. This suffix will be used to amend \code{names(y)}, while
+\code{spectraVariables(x)} will remain unchanged.}
+
+\item{drop}{For \code{split()}: not considered.}
+}
+\description{
+Various functions are available to combine, aggregate or split data from one
+or more \code{Spectra} objects. These are:
+\itemize{
+\item \code{c()} and \code{concatenateSpectra()}: combine several \code{Spectra} objects into
+a single object. The resulting \code{Spectra} contains all data from all
+individual \code{Spectra}, i.e. the union of all their spectra variables.
+Concatenation will fail if the processing queue of any of the \code{Spectra}
+objects is not empty or if different backends are used for the \code{Spectra}
+objects. In such cases it is suggested to first change the backends of
+all \code{Spectra} to the same type of backend (using the \code{\link[=setBackend]{setBackend()}}
+function) and to eventually (if needed) apply the processing queue using
+the \code{\link[=applyProcessing]{applyProcessing()}} function.
+\item \code{combineSpectra()}: combines sets of spectra (defined with parameter \code{f})
+into a single spectrum per set aggregating their MS data (i.e. their
+\emph{peaks data} matrices with the \emph{m/z} and intensity values of their
+mass peaks). The spectra variable values of the first spectrum per set
+are reported for the combined spectrum. The peak matrices of the spectra
+per set are combined using the function specified with parameter \code{FUN}
+which uses by default the \code{\link[=combinePeaksData]{combinePeaksData()}} function. See the
+documentation of \code{\link[=combinePeaksData]{combinePeaksData()}} for details on the aggregation of
+the peak data and the package vignette for examples.
+The sets of spectra can be specified with parameter \code{f} which is expected
+to be a \code{factor} or \code{vector} of length equal to the length of the
+\code{Spectra} specifying the set to which each spectrum belongs. The function
+returns a \code{Spectra} of length equal to the unique levels of \code{f}. The
+optional parameter \code{p} allows to define how the \code{Spectra} should be
+split for potential parallel processing. The default is
+\code{p = x$dataStorage} and hence per-storage-file parallel processing is
+applied for \code{Spectra} with on-disk data representations (such as the
+\code{\link[=MsBackendMzR]{MsBackendMzR()}}). This also prevents spectra from different data
+files/samples from being combined (eventually use e.g. \code{p = x$dataOrigin} or any
+other spectra variables defining the originating samples for a spectrum).
+Before combining the peaks data, all eventually present processing steps are
+applied (by calling \code{\link[=applyProcessing]{applyProcessing()}} on the \code{Spectra}). This function
+will replace the original \emph{m/z} and intensity values of a \code{Spectra}, hence
+it cannot be called on a \code{Spectra} with a \emph{read-only} backend. In such
+cases, the backend should first be changed to a \emph{writeable} backend
+using the \code{\link[=setBackend]{setBackend()}} function (e.g. to a \code{\link[=MsBackendMemory]{MsBackendMemory()}} backend).
+\item \code{joinSpectraData()}: Individual spectra variables can be directly +added with the \verb{$<-} or \verb{[[<-} syntax. The \code{joinSpectraData()} +function allows to merge a \code{DataFrame} to the existing spectra +data of a \code{Spectra}. This function diverges from the \code{\link[=merge]{merge()}} method in +two main ways: +\itemize{ +\item The \code{by.x} and \code{by.y} column names must be of length 1. +\item If variable names are shared in \code{x} and \code{y}, the spectra +variables of \code{x} are not modified. It's only the \code{y} +variables that are appended with the suffix defined in +\code{suffix.y}. This is to avoid modifying any core spectra +variables that would lead to an invalid object. +\item Duplicated Spectra keys (i.e. \code{x[[by.x]]}) are not +allowed. Duplicated keys in the \code{DataFrame} (i.e \code{y[[by.y]]}) +throw a warning and only the last occurrence is kept. These +should be explored and ideally be removed using for +\code{QFeatures::reduceDataFrame()}, \code{PMS::reducePSMs()} or similar +functions. +} +\item \code{split()}: splits the \code{Spectra} object based on parameter \code{f} into a \code{list} +of \code{Spectra} objects. +} +} +\examples{ + +## Create a Spectra providing a `DataFrame` containing a MS data. + +spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) + +s <- Spectra(spd) +s + +## Create a second Spectra from mzML files and use the `MsBackendMzR` +## on-disk backend. +sciex_file <- dir(system.file("sciex", package = "msdata"), + full.names = TRUE) +sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +sciex + +## Subset to the first 100 spectra to reduce running time of the examples +sciex <- sciex[1:100] + + +## -------- COMBINE SPECTRA -------- + +## Combining the `Spectra` object `s` with the MS data from `sciex`. +## Calling directly `c(s, sciex)` would result in an error because +## both backends use a different backend. We thus have to first change +## the backends to the same backend. We change the backend of the `sciex` +## `Spectra` to a `MsBackendMemory`, the backend used by `s`. + +sciex <- setBackend(sciex, MsBackendMemory()) + +## Combine the two `Spectra` +all <- c(s, sciex) +all + +## The new `Spectra` objects contains the union of spectra variables from +## both: +spectraVariables(all) + +## The spectra variables that were not present in `s`: +setdiff(spectraVariables(all), spectraVariables(s)) + +## The values for these were filled with missing values for spectra from +## `s`: +all$peaksCount |> head() + + +## -------- AGGREGATE SPECTRA -------- + +## Sets of spectra can be combined into a single, representative spectrum +## per set using `combineSpectra()`. This aggregates the peaks data (i.e. +## the spectra's m/z and intensity values) while using the values for all +## spectra variables from the first spectrum per set. Below we define the +## sets as all spectra measured in the *same second*, i.e. rounding their +## retention time to the next closer integer value. 
+f <- round(rtime(sciex)) +head(f) + +cmp <- combineSpectra(sciex, f = f) + +## The length of `cmp` is now equal to the length of unique levels in `f`: +length(cmp) + +## The spectra variable value from the first spectrum per set is used in +## the representative/combined spectrum: +cmp$rtime + +## The peaks data was aggregated: the number of mass peaks of the first six +## spectra from the original `Spectra`: +lengths(sciex) |> head() + +## and for the first aggreagated spectra: +lengths(cmp) |> head() + +## The default peaks data aggregation method joins all mass peaks. See +## documentation of the `combinePeaksData()` function for more options. + + +## -------- SPLITTING DATA -------- + +## A `Spectra` can be split into a `list` of `Spectra` objects using the +## `split()` function defining the sets into which the `Spectra` should +## be splitted into with parameter `f`. +sciex_split <- split(sciex, f) + +length(sciex_split) +sciex_split |> head() + + +## -------- ADDING SPECTRA DATA -------- + +## Adding new spectra variables +sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) +spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging + var1 = rnorm(10), + var2 = sample(letters, 10)) +spv + +sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") + +spectraVariables(sciex2) +spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] +} +\seealso{ +\itemize{ +\item \code{\link[=combinePeaks]{combinePeaks()}} for functions to aggregate mass peaks data. +\item \link{Spectra} for a general description of the \code{Spectra} object. +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/compareSpectra.Rd b/man/compareSpectra.Rd new file mode 100644 index 00000000..375671c4 --- /dev/null +++ b/man/compareSpectra.Rd @@ -0,0 +1,131 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{compareSpectra} +\alias{compareSpectra} +\alias{compareSpectra,Spectra,Spectra-method} +\alias{compareSpectra,Spectra,missing-method} +\title{Spectra similarity calculations} +\usage{ +\S4method{compareSpectra}{Spectra,Spectra}( + x, + y, + MAPFUN = joinPeaks, + tolerance = 0, + ppm = 20, + FUN = ndotproduct, + ..., + SIMPLIFY = TRUE +) + +\S4method{compareSpectra}{Spectra,missing}( + x, + y = NULL, + MAPFUN = joinPeaks, + tolerance = 0, + ppm = 20, + FUN = ndotproduct, + ..., + SIMPLIFY = TRUE +) +} +\arguments{ +\item{x}{A \code{Spectra} object.} + +\item{y}{A \code{Spectra} object.} + +\item{MAPFUN}{For \code{compareSpectra()}: function to map/match peaks between +the two compared spectra. See \code{\link[=joinPeaks]{joinPeaks()}} for more information and +possible functions. Defaults to \code{\link[=joinPeaks]{joinPeaks()}}.} + +\item{tolerance}{\code{numeric(1)} allowing to define a constant maximal +accepted difference between m/z values for peaks to be matched. This +parameter is directly passed to \code{MAPFUN}.} + +\item{ppm}{\code{numeric(1)} defining a relative, m/z-dependent, maximal +accepted difference between m/z values for peaks to be matched. This +parameter is directly passed to \code{MAPFUN}.} + +\item{FUN}{function to compare intensities of peaks between two spectra. +Defaults to \code{\link[=ndotproduct]{ndotproduct()}}.} + +\item{...}{Additional arguments passed to the internal functions.} + +\item{SIMPLIFY}{\code{logical(1)} defining whether the result matrix should be +\emph{simplified} to a \code{numeric} if possible (i.e. 
if either \code{x} or \code{y} is +of length 1).} +} +\description{ +\code{compareSpectra()} compares each spectrum in \code{x} with each spectrum in \code{y} +using the function provided with \code{FUN} (defaults to \code{\link[=ndotproduct]{ndotproduct()}}). If +\code{y} is missing, each spectrum in \code{x} is compared with each other spectrum +in \code{x}. +The matching/mapping of peaks between the compared spectra is done with the +\code{MAPFUN} function. The default \code{\link[=joinPeaks]{joinPeaks()}} matches peaks of both spectra +and allows to keep all peaks from the first spectrum (\code{type = "left"}), +from the second (\code{type = "right"}), from both (\code{type = "outer"}) and to +keep only matching peaks (\code{type = "inner"}); see \code{\link[=joinPeaks]{joinPeaks()}} for more +information and examples). The \code{MAPFUN} function should have parameters +\code{x}, \code{y}, \code{xPrecursorMz} and \code{yPrecursorMz} as these values are passed to +the function. + +In addition to \code{joinPeaks()} also \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} is supported for +GNPS-like similarity score calculations. Note that \code{joinPeaksGnps()} should +only be used in combination with \code{FUN = MsCoreUtils::gnps} +(see \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} for more information and details). Use +\code{MAPFUN = joinPeaksNone} to disable internal peak matching/mapping if a +similarity scoring function is used that performs the matching internally. + +\code{FUN} is supposed to be a function to compare intensities of (matched) +peaks of the two spectra that are compared. The function needs to take two +matrices with columns \code{"mz"} and \code{"intensity"} as input and is supposed +to return a single numeric as result. In addition to the two peak matrices +the spectra's precursor m/z values are passed to the function as parameters +\code{xPrecursorMz} (precursor m/z of the \code{x} peak matrix) and \code{yPrecursorMz} +(precursor m/z of the \code{y} peak matrix). Additional parameters to functions +\code{FUN} and \code{MAPFUN} can be passed with \code{...}. Parameters \code{ppm} and +\code{tolerance} are passed to both \code{MAPFUN} and \code{FUN}. +The function returns a \code{matrix} with the results of \code{FUN} for each +comparison, number of rows equal to \code{length(x)} and number of columns +equal \code{length(y)} (i.e. element in row 2 and column 3 is the result from +the comparison of \code{x[2]} with \code{y[3]}). If \code{SIMPLIFY = TRUE} the \code{matrix} +is \emph{simplified} to a \code{numeric} if length of \code{x} or \code{y} is one. See also +the vignette for additional examples, such as using spectral entropy +similarity in the scoring. +} +\examples{ + +## Load a `Spectra` object with LC-MS/MS data. +fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", + package = "msdata") +sps_dda <- Spectra(fl) +sps_dda + +## Restrict to MS2 (fragment) spectra: +sps_ms2 <- filterMsLevel(sps_dda, msLevel = 2L) + +## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using +## the normalized dotproduct method. 
+res <- compareSpectra(sps_ms2[2:3], sps_ms2[10:20]) +## first row contains comparisons of spectrum 2 with spectra 10 to 20 and +## the second row comparisons of spectrum 3 with spectra 10 to 20 +res + +## We next calculate the pairwise similarity for the first 10 spectra +compareSpectra(sps_ms2[1:10]) + +## Use compareSpectra to determine the number of common (matching) peaks +## with a ppm of 10: +## type = "inner" uses a *inner join* to match peaks, i.e. keeps only +## peaks that can be mapped betwen both spectra. The provided FUN returns +## simply the number of matching peaks. +compareSpectra(sps_ms2[2:3], sps_ms2[10:20], ppm = 10, type = "inner", + FUN = function(x, y, ...) nrow(x)) + +## We repeat this calculation between all pairwise combinations +## of the first 20 spectra +compareSpectra(sps_ms2[1:20], ppm = 10, type = "inner", + FUN = function(x, y, ...) nrow(x)) +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/filterMsLevel.Rd b/man/filterMsLevel.Rd new file mode 100644 index 00000000..0ea3698b --- /dev/null +++ b/man/filterMsLevel.Rd @@ -0,0 +1,689 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{deisotopeSpectra} +\alias{deisotopeSpectra} +\alias{reduceSpectra} +\alias{filterPrecursorMaxIntensity} +\alias{filterPrecursorIsotopes} +\alias{filterPrecursorPeaks} +\alias{filterMsLevel} +\alias{[,Spectra-method} +\alias{filterAcquisitionNum} +\alias{filterDataOrigin} +\alias{filterDataStorage} +\alias{filterEmptySpectra} +\alias{filterIsolationWindow} +\alias{filterPolarity} +\alias{filterPrecursorCharge} +\alias{filterPrecursorMzRange} +\alias{filterPrecursorMzValues} +\alias{filterPrecursorScan} +\alias{filterRanges} +\alias{filterRt} +\alias{filterValues} +\alias{dropNaSpectraVariables} +\alias{selectSpectraVariables} +\alias{filterIntensity} +\alias{filterMzRange} +\alias{filterMzValues} +\alias{dropNaSpectraVariables,Spectra-method} +\alias{selectSpectraVariables,Spectra-method} +\alias{filterAcquisitionNum,Spectra-method} +\alias{filterEmptySpectra,Spectra-method} +\alias{filterDataOrigin,Spectra-method} +\alias{filterDataStorage,Spectra-method} +\alias{filterFourierTransformArtefacts,Spectra-method} +\alias{filterIntensity,Spectra-method} +\alias{filterIsolationWindow,Spectra-method} +\alias{filterMsLevel,Spectra-method} +\alias{filterMzRange,Spectra-method} +\alias{filterMzValues,Spectra-method} +\alias{filterPolarity,Spectra-method} +\alias{filterPrecursorMz,Spectra-method} +\alias{filterPrecursorMzRange,Spectra-method} +\alias{filterPrecursorMzValues,Spectra-method} +\alias{filterPrecursorCharge,Spectra-method} +\alias{filterPrecursorScan,Spectra-method} +\alias{filterRt,Spectra-method} +\alias{filterRanges,Spectra-method} +\alias{filterValues,Spectra-method} +\title{Filter and subset Spectra objects} +\usage{ +deisotopeSpectra( + x, + substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"), + tolerance = 0, + ppm = 20, + charge = 1 +) + +reduceSpectra(x, tolerance = 0, ppm = 20) + +filterPrecursorMaxIntensity(x, tolerance = 0, ppm = 20) + +filterPrecursorIsotopes( + x, + tolerance = 0, + ppm = 20, + substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL") +) + +filterPrecursorPeaks( + object, + tolerance = 0, + ppm = 20, + mz = c("==", ">="), + msLevel. 
= uniqueMsLevels(object) +) + +\S4method{dropNaSpectraVariables}{Spectra}(object) + +\S4method{selectSpectraVariables}{Spectra}( + object, + spectraVariables = union(spectraVariables(object), peaksVariables(object)) +) + +\S4method{[}{Spectra}(x, i, j, ..., drop = FALSE) + +\S4method{filterAcquisitionNum}{Spectra}( + object, + n = integer(), + dataStorage = character(), + dataOrigin = character() +) + +\S4method{filterEmptySpectra}{Spectra}(object) + +\S4method{filterDataOrigin}{Spectra}(object, dataOrigin = character()) + +\S4method{filterDataStorage}{Spectra}(object, dataStorage = character()) + +\S4method{filterFourierTransformArtefacts}{Spectra}( + object, + halfWindowSize = 0.05, + threshold = 0.2, + keepIsotopes = TRUE, + maxCharge = 5, + isotopeTolerance = 0.005 +) + +\S4method{filterIntensity}{Spectra}( + object, + intensity = c(0, Inf), + msLevel. = uniqueMsLevels(object), + ... +) + +\S4method{filterIsolationWindow}{Spectra}(object, mz = numeric()) + +\S4method{filterMsLevel}{Spectra}(object, msLevel. = integer()) + +\S4method{filterMzRange}{Spectra}( + object, + mz = numeric(), + msLevel. = uniqueMsLevels(object), + keep = TRUE +) + +\S4method{filterMzValues}{Spectra}( + object, + mz = numeric(), + tolerance = 0, + ppm = 20, + msLevel. = uniqueMsLevels(object), + keep = TRUE +) + +\S4method{filterPolarity}{Spectra}(object, polarity = integer()) + +\S4method{filterPrecursorMz}{Spectra}(object, mz = numeric()) + +\S4method{filterPrecursorMzRange}{Spectra}(object, mz = numeric()) + +\S4method{filterPrecursorMzValues}{Spectra}(object, mz = numeric(), ppm = 20, tolerance = 0) + +\S4method{filterPrecursorCharge}{Spectra}(object, z = integer()) + +\S4method{filterPrecursorScan}{Spectra}(object, acquisitionNum = integer(), f = dataOrigin(object)) + +\S4method{filterRt}{Spectra}(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) + +\S4method{filterRanges}{Spectra}( + object, + spectraVariables = character(), + ranges = numeric(), + match = c("all", "any") +) + +\S4method{filterValues}{Spectra}( + object, + spectraVariables = character(), + values = numeric(), + ppm = 0, + tolerance = 0, + match = c("all", "any") +) +} +\arguments{ +\item{x}{\code{Spectra} object.} + +\item{substDefinition}{For \code{deisotopeSpectra()} and +\code{filterPrecursorIsotopes()}: \code{matrix} or \code{data.frame} with definitions +of isotopic substitutions. Uses by default isotopic substitutions +defined from all compounds in the Human Metabolome Database (HMDB). See +\code{\link[=isotopologues]{isotopologues()}} or \code{\link[=isotopicSubstitutionMatrix]{isotopicSubstitutionMatrix()}} in the +\emph{MetaboCoreUtils} for details.} + +\item{tolerance}{For \code{filterMzValues()} and \code{reduceSpectra()}: +\code{numeric(1)} allowing to define a constant maximal accepted difference +between m/z values for peaks to be matched (or grouped). For +\code{containsMz()} it can also be of length equal \code{mz} to specify a different +tolerance for each m/z value. +For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the +(constant) maximal accepted difference of precursor m/z values of +spectra for grouping them into \emph{precursor groups}. For +\code{filterPrecursorIsotopes()}: passed directly to the \code{\link[=isotopologues]{isotopologues()}} +function. For \code{filterValues()}: \code{numeric} of any length allowing to +define a maximal accepted difference between user input \code{values} and the +\code{spectraVariables} values. 
If it is not equal to the length of the +value provided with parameter \code{spectraVariables}, \code{tolerance[1]} will be +recycled. Default is \code{tolerance = 0}.} + +\item{ppm}{For \code{filterMzValues()} and \code{reduceSpectra()}: \code{numeric(1)} +defining a relative, m/z-dependent, maximal accepted difference between +m/z values for peaks to be matched (or grouped). +For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the relative +maximal accepted difference of precursor m/z values of spectra for +grouping them into \emph{precursor groups}. For \code{filterPrecursorIsotopes()}: +passed directly to the \code{\link[=isotopologues]{isotopologues()}} function. +For \code{filterValues()}: \code{numeric} of any length allowing to define +a maximal accepted difference between user input \code{values} and the +\code{spectraVariables} values. If it is not equal to the length of the +value provided with parameter \code{spectraVariables}, \code{ppm[1]} will be +recycled.} + +\item{charge}{For \code{deisotopeSpectra()}: expected charge of the ionized +compounds. See \code{\link[=isotopologues]{isotopologues()}} for details.} + +\item{object}{\code{Spectra} object.} + +\item{mz}{For \code{filterIsolationWindow()}: \code{numeric(1)} with the m/z value to +filter the object. For \code{filterPrecursorMz()} and \code{filterMzRange()}: +\code{numeric(2)} defining the lower and upper m/z boundary. +For \code{filterMzValues()} and \code{filterPrecursorMzValues()}: \code{numeric} with +the m/z values to match peaks or precursor m/z against. +For \code{filterPrecursorPeaks()}: \code{character(1)} defining whether mass peaks +with an m/z matching the spectrum's precursor m/z (\code{mz = "=="}, +the default) or mass peaks with a m/z that is equal or larger +(\code{mz = ">="}) should be removed.} + +\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which +the function should be applied (defaults to all MS levels of \code{object}. +For \code{filterMsLevel()}: the MS level to which \code{object} should be +subsetted.} + +\item{spectraVariables}{For \code{selectSpectraVariables()}: \code{character} with the +names of the spectra variables to which the backend should be +subsetted. For \code{filterRanges()} and \code{filterValues()}: \code{character} +vector specifying the column(s) from \code{spectraData(object)} on which +to filter the data and that correspond to the the names of the +spectra variables that should be used for the filtering.} + +\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the +object.} + +\item{j}{For \code{[}: not supported.} + +\item{...}{Additional arguments.} + +\item{drop}{For \code{[}: not considered.} + +\item{n}{for \code{filterAcquisitionNum()}: \code{integer} with the acquisition +numbers to filter for.} + +\item{dataStorage}{For \code{filterDataStorage()}: \code{character} to define which +spectra to keep. +For \code{filterAcquisitionNum()}: optionally specify if filtering should +occur only for spectra of selected \code{dataStorage}.} + +\item{dataOrigin}{For \code{filterDataOrigin()}: \code{character} to define which +spectra to keep. 
+For \code{filterAcquisitionNum()}: optionally specify if filtering should +occur only for spectra of selected \code{dataOrigin}.} + +\item{halfWindowSize}{For \code{filterFourierTransformArtefacts()}: \code{numeric(1)} +defining the m/z window left and right of a peak in which to remove +Fourier transform artefacts.} + +\item{threshold}{For \code{filterFourierTransformArtefacts()}: the relative +intensity (to a peak) below which peaks are considered Fourier +artefacts. Defaults to \code{threshold = 0.2} hence removing peaks that +have an intensity below 0.2 times the intensity of the tested peak +(within the selected \code{halfWindowSize}).} + +\item{keepIsotopes}{For \code{filterFourierTransformArtefacts()}: whether isotope +peaks should not be removed as Fourier artefacts.} + +\item{maxCharge}{For \code{filterFourierTransformArtefacts()}: the maximum charge +to be considered for isotopes.} + +\item{isotopeTolerance}{For \code{filterFourierTransformArtefacts()}: the m/z +\code{tolerance} to be used to define whether peaks might be isotopes of +the currently tested peak.} + +\item{intensity}{For \code{filterIntensity()}: \code{numeric} of length 1 or 2 +defining either the lower or the lower and upper intensity limit for the +filtering, or a \code{function} that takes the intensities as input and +returns a \code{logical} (of the same length as the number of peaks in the +spectrum) defining whether the peak should be retained or not. Defaults to +\code{intensity = c(0, Inf)} thus only peaks with \code{NA} intensity are removed.} + +\item{keep}{For \code{filterMzValues()} and \code{filterMzRange()}: \code{logical(1)} +whether the matching peaks should be retained (\code{keep = TRUE}, the +default) or dropped (\code{keep = FALSE}).} + +\item{polarity}{For \code{filterPolarity()}: \code{integer} specifying the polarity to +which \code{object} should be subsetted.} + +\item{z}{For \code{filterPrecursorCharge()}: \code{integer()} with the precursor +charges to be used as filter.} + +\item{acquisitionNum}{For \code{filterPrecursorScan()}: \code{integer} with the +acquisition number of the spectra to which the object should be +subsetted.} + +\item{f}{For \code{filterPrecursorScan()}: defining which spectra +belong to the same original data file (sample). Defaults to +\code{f = dataOrigin(x)}.} + +\item{rt}{For \code{filterRt()}: \code{numeric(2)} defining the retention time range to +be used to subset/filter \code{object}.} + +\item{ranges}{For \code{filterRanges()}: a \code{numeric} vector of paired values +(lower and upper boundary) defining the ranges to filter the \code{object}. +These paired values need to be in the same order as the +\code{spectraVariables} parameter (see below).} + +\item{match}{For \code{filterRanges()} and \code{filterValues()}: \code{character(1)} +defining whether the condition has to match for all provided +\code{ranges}/\code{values} (\code{match = "all"}; the default), or for any of them +(\code{match = "any"}) for spectra to be retained.} + +\item{values}{For \code{filterValues()}: a \code{numeric} vector defining the +values to filter the \code{Spectra} data. These values need to be in the same +order as the \code{spectraVariables} parameter.} +} +\description{ +A variety of functions to filter or subset \code{Spectra} objects are available. +These can be generally separated into two main classes: I) \emph{classical} +subset operations that immediately reduce the number of spectra in the +object and II) filters that reduce the \strong{content} of the object without +changing its length (i.e.
the number of spectra). The latter can be further +subdivided into functions that affect the content of the \code{spectraData} (i.e. +the general spectrum metadata) and those that reduce the content of the +object's \code{peaksData} (i.e. the m/z and intensity values of a spectrum's +mass peaks). + +A description of functions from these 3 different categories are given below +in sections \emph{Subset \code{Spectra}}, \emph{Filter content of \code{spectraData()}} and +\emph{Filter content of \code{peaksData()}}, respectively. +} +\section{Subset \code{Spectra}}{ + + +These functions affect the number of spectra in a \code{Spectra} object creating +a subset of the original object without affecting its content. +\itemize{ +\item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method +\strong{always} returns a \code{Spectra} object. +\item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching +the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or +\code{dataStorage} is also provided, \code{object} is subsetted to the spectra with +an acquisition number equal to \code{n} \strong{in spectra with matching dataOrigin +or dataStorage values} retaining all other spectra. +Returns the filtered \code{Spectra}. +\item \code{filterDataOrigin()}: filters the object retaining spectra matching the +provided \code{dataOrigin}. Parameter \code{dataOrigin} has to be of type +\code{character} and needs to match exactly the data origin value of the +spectra to subset. +Returns the filtered \code{Spectra} object (with spectra ordered according to +the provided \code{dataOrigin} parameter). +\item \code{filterDataStorage()}: filters the object retaining spectra stored in the +specified \code{dataStorage}. Parameter \code{dataStorage} has to be of type +\code{character} and needs to match exactly the data storage value of the +spectra to subset. +Returns the filtered \code{Spectra} object (with spectra ordered according to +the provided \code{dataStorage} parameter). +\item \code{filterEmptySpectra()}: removes empty spectra (i.e. spectra without peaks). +Returns the filtered \code{Spectra} object (with spectra in their +original order). +\item \code{filterIsolationWindow()}: retains spectra that contain \code{mz} in their +isolation window m/z range (i.e. with an \code{isolationWindowLowerMz} <= \code{mz} +and \code{isolationWindowUpperMz} >= \code{mz}. Returns the filtered \code{Spectra} +object (with spectra in their original order). +\item \code{filterMsLevel()}: filters object by MS level keeping only spectra matching +the MS level specified with argument \code{msLevel}. Returns the filtered +\code{Spectra} (with spectra in their original order). +\item \code{filterPolarity()}: filters the object keeping only spectra matching the +provided polarity. Returns the filtered \code{Spectra} (with spectra in their +original order). +\item \code{filterPrecursorCharge()}: retains spectra with the defined precursor +charge(s). +\item \code{filterPrecursorIsotopes()}: groups MS2 spectra based on their precursor +m/z and precursor intensity into predicted isotope groups and keep for each +only the spectrum representing the monoisotopic precursor. MS1 spectra +are returned as is. See documentation for \code{deisotopeSpectra()} below for +details on isotope prediction and parameter description. 
+\item \code{filterPrecursorMaxIntensity()}: filters the \code{Spectra} keeping for groups +of (MS2) spectra with similar precursor m/z values (given parameters +\code{ppm} and \code{tolerance}) the one with the highest precursor intensity. The +function filters only MS2 spectra and returns all MS1 spectra. If +precursor intensities are \code{NA} for all spectra within a spectra group, the +first spectrum of that groups is returned. +Note: some manufacturers don't provide precursor intensities. These can +however also be estimated with \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}}. +\item \code{filterPrecursorMzRange()} (previously \code{filterPrecursorMz()} which is now +deprecated): retains spectra with a precursor m/z within the +provided m/z range. See examples for details on selecting spectra with +a precursor m/z for a target m/z accepting a small difference in \emph{ppm}. +\item \code{filterPrecursorMzValues()}: retains spectra with precursor m/z matching +any of the provided m/z values (given \code{ppm} and \code{tolerance}). Spectra with +missing precursor m/z value (e.g. MS1 spectra) are dropped. +\item \code{filterPrecursorScan()}: retains parent (e.g. MS1) and children scans (e.g. +MS2) of acquisition number \code{acquisitionNum}. Returns the filtered +\code{Spectra} (with spectra in their original order). Parameter \code{f} allows to +define which spectra belong to the same sample or original data file ( +defaults to \code{f = dataOrigin(object)}). +\item \code{filterRanges()}: allows filtering of the \code{Spectra} object based on user +defined \emph{numeric} ranges (parameter \code{ranges}) for one or more available +spectra variables in object (spectra variable names can be specified with +parameter \code{spectraVariables}). Spectra for which the value of a spectra +variable is within it's defined range are retained. If multiple +ranges/spectra variables are defined, the \code{match} parameter can be used +to specify whether all conditions (\code{match = "all"}; the default) or if +any of the conditions must match (\code{match = "any"}; all spectra for which +values are within any of the provided ranges are retained). +\item \code{filterRt()}: retains spectra of MS level \code{msLevel} with retention +times (in seconds) within (\code{>=}) \code{rt[1]} and (\code{<=}) +\code{rt[2]}. Returns the filtered \code{Spectra} (with spectra in their +original order). +\item \code{filterValues()}: allows filtering of the \code{Spectra} object based on +similarities of \emph{numeric} values of one or more \code{spectraVariables(object)} +(parameter \code{spectraVariables}) to provided values (parameter \code{values}) +given acceptable differences (parameters tolerance and ppm). If multiple +values/spectra variables are defined, the \code{match} parameter can be used +to specify whether all conditions (\code{match = "all"}; the default) or if +any of the conditions must match (\code{match = "any"}; all spectra for which +values are within any of the provided ranges are retained). +} +} + +\section{Filter content of \code{spectraData()}}{ + + +The functions described in this section filter the content from a +\code{Spectra}'s spectra data, i.e. affect values of, or complete, spectra +variables. None of these functions reduces the object's number of spectra. +\itemize{ +\item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the +object's \code{spectraData} that contain only missing values (\code{NA}). 
Note that +while columns with only \code{NA}s are removed, a \code{spectraData()} call after +\code{dropNaSpectraVariables()} might still show columns containing \code{NA} values +for \emph{core} spectra variables. The total number of spectra is not changed +by this function. +\item \code{selectSpectraVariables()}: reduces the information within the object to +the selected spectra variables: all data for variables not specified will +be dropped. For mandatory columns (i.e., those listed by +\code{\link[=coreSpectraVariables]{coreSpectraVariables()}}, such as \emph{msLevel}, \emph{rtime} ...) only +the values will be dropped but not the variable itself. Additional (or +user defined) spectra variables will be completely removed. +Returns the filtered \code{Spectra}. +} +} + +\section{Filter content of \code{peaksData()}}{ + + +The functions described in this section filter the content of the +\code{Spectra}'s peaks data, i.e. either the number or the values (\emph{m/z} or +intensity values) of the mass peaks. Also, the actual operation is only +executed once peaks data is accessed (through \code{peaksData()}, +\code{mz()} or \code{intensity()}) or \code{applyProcessing()} is called. +These operations don't affect the number of spectra in the \code{Spectra} object. +\itemize{ +\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the +monoisotopic peak for groups of isotopologues. Isotopologues are +estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the +\emph{MetaboCoreUtils} package. Note that +the default parameters for isotope prediction/detection have been +determined using data from the Human Metabolome Database (HMDB) and +isotopes for elements other than CHNOPS might not be detected. See +parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for +more information. The approach and code to define the parameters for +isotope prediction is described +\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. +\item \code{filterFourierTransformArtefacts()}: removes (Orbitrap) fast Fourier +transform artefact peaks from spectra (see examples below). The function +iterates through all intensity-ordered peaks in a spectrum and removes all +peaks with an m/z within +/- \code{halfWindowSize} of the current peak if their +intensity is lower than \code{threshold} times the current peak's intensity. +Additional parameters \code{keepIsotopes}, \code{maxCharge} and \code{isotopeTolerance} +help to avoid removing potential \verb{[13]C} isotope peaks (\code{maxCharge} +being the maximum charge that should be considered and \code{isotopeTolerance} +the absolute acceptable tolerance for matching their m/z). +See \code{\link[=filterFourierTransformArtefacts]{filterFourierTransformArtefacts()}} for details and background and +\code{deisotopeSpectra()} for an alternative. +\item \code{filterIntensity()}: filters mass peaks in each spectrum keeping only +those with intensities that are within the provided range or match the +criteria of the provided function. For the former, parameter \code{intensity} +has to be a \code{numeric} defining the intensity range, for the latter a +\code{function} that takes the intensity values of the spectrum and returns +a \code{logical} whether the peak should be retained or not (see examples +below for details) - additional parameters to the function can be passed +with \code{...}.
+To remove only peaks with intensities below a certain threshold, say +100, use \code{intensity = c(100, Inf)}. Note that also a single value can be +passed with the \code{intensity} parameter in which case an upper limit of +\code{Inf} is used. +Note that this function also removes peaks with missing intensities +(i.e. an intensity of \code{NA}). Parameter \code{msLevel.} allows to restrict the +filtering to spectra of the specified MS level(s). +\item \code{filterMzRange()}: filters mass peaks in the object keeping or removing +those in each spectrum that are within the provided m/z range. Whether +peaks are retained or removed can be configured with parameter \code{keep} +(default \code{keep = TRUE}). +\item \code{filterMzValues()}: filters mass peaks in the object keeping all +peaks in each spectrum that match the provided m/z value(s) (for +\code{keep = TRUE}, the default) or removing all of them (for \code{keep = FALSE}). +The m/z matching considers also the absolute \code{tolerance} and m/z-relative +\code{ppm} values. \code{tolerance} and \code{ppm} have to be of length 1. +\item \code{filterPeaksRanges()}: filters mass peaks of a \code{Spectra} object using any +set of range-based filters on numeric spectra or peaks variables. See +\code{\link[=filterPeaksRanges]{filterPeaksRanges()}} for more information. +\item \code{filterPrecursorPeaks()}: removes peaks from each spectrum in \code{object} with +an m/z equal to or larger than the m/z of the precursor, depending on the +value of parameter \code{mz}: for \code{mz = "=="} (the default) peaks with matching +m/z (considering an absolute and relative acceptable difference depending +on \code{tolerance} and \code{ppm}, respectively) are removed. For \code{mz = ">="} all +peaks with an m/z larger or equal to the precursor m/z (minus \code{tolerance} +and the \code{ppm} of the precursor m/z) are removed. Parameter \code{msLevel.} +allows to restrict the filter to certain MS levels (by default the filter +is applied to all MS levels). Note that no peaks are removed if the +precursor m/z is \code{NA} (e.g. typically for MS1 spectra). +\item \code{reduceSpectra()}: keeps, for groups of mass peaks with similar m/z values +(given \code{ppm} and \code{tolerance}), only the mass peak with the highest +intensity in each spectrum, removing all other peaks and hence +\emph{reducing} each spectrum to the highest intensity peak per \emph{peak group}. +Peak groups are defined using the \code{\link[=group]{group()}} function from the +\emph{MsCoreUtils} package. See also the \code{\link[=combinePeaks]{combinePeaks()}} function for an +alternative function to combine peaks within each spectrum. +} +} + +\examples{ + +## Load a `Spectra` object with LC-MS/MS data. +fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", + package = "msdata") +sps_dda <- Spectra(fl) +sps_dda
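+ +## As a quick, purely illustrative overview of the experiment, count the +## number of spectra per MS level using the `msLevel()` accessor: +table(msLevel(sps_dda))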
+ + +## -------- SUBSET SPECTRA -------- + +## Subset to the first 3 spectra +tmp <- sps_dda[1:3] +tmp +length(tmp) + +## Subset to all MS2 spectra; this could be done with [, or, more +## efficiently, with the `filterMsLevel()` function: +sps_dda[msLevel(sps_dda) == 2L] +filterMsLevel(sps_dda, 2L) + +## Filter the object keeping only MS2 spectra with a precursor m/z value +## within a specified range: +filterPrecursorMzRange(sps_dda, c(80, 90)) + +## Filter the object to MS2 spectra with a precursor m/z matching a +## pre-defined value (given ppm and tolerance): +filterPrecursorMzValues(sps_dda, 85, ppm = 5, tolerance = 0.1) + +## The `filterRanges()` function allows to filter a `Spectra` based on +## numerical ranges of any of its (numerical) spectra variables. +## First, determine the variable(s) on which to base the filtering: +sv <- c("rtime", "precursorMz", "peaksCount") +## Note that ANY variables can be chosen here, and as many as wanted. + +## Define the ranges (pairs of values with lower and upper boundary) to be +## used for the individual spectra variables. The first two values will be +## used for the first spectra variable (e.g., `"rtime"` here), the next two +## for the second (e.g. `"precursorMz"` here) and so on: +ranges <- c(30, 350, 200, 500, 350, 600) + +## Pass these parameters to the `filterRanges()` function: +filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, + ranges = ranges) +filt_spectra + +## `filterRanges()` can also be used to filter a `Spectra` object with +## multiple ranges for the same `spectraVariable` (e.g., here `"rtime"`): +sv <- c("rtime", "rtime") +ranges <- c(30, 100, 200, 300) +filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, + ranges = ranges, match = "any") +filt_spectra + +## While `filterRanges()` filtered on numeric ranges, `filterValues()` +## allows to filter an object matching spectra variable values to user +## provided values (allowing to configure allowed differences using the +## `ppm` and `tolerance` parameters). +## First determine the variable(s) on which to base the filtering: +sv <- c("rtime", "precursorMz") +## Note that ANY variables can be chosen here, and as many as wanted. + +## Define the values that will be used to filter the spectra based on their +## similarities to their respective `spectraVariables`. +## The first values in the parameters values, tolerance and ppm will be +## used for the first spectra variable (e.g. `"rtime"` here), the next for +## the second (e.g. `"precursorMz"` here) and so on: +values <- c(350, 80) +tolerance <- c(100, 0.1) +ppm <- c(0, 50) + +## Pass these parameters to the `filterValues()` function: +filt_spectra <- filterValues(sps_dda, spectraVariables = sv, + values = values, tolerance = tolerance, ppm = ppm) +filt_spectra
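+ +## Filter functions can also be combined. As a small illustration, the +## calls below first restrict the data set to a retention time window and +## then keep only the MS2 spectra from that window: +sps_rt_ms2 <- filterRt(sps_dda, rt = c(200, 600)) +sps_rt_ms2 <- filterMsLevel(sps_rt_ms2, 2L) +sps_rt_ms2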
+ + +## -------- FILTER SPECTRA DATA -------- + +## Remove spectra variables without content (i.e. with only missing values) +sps_noNA <- dropNaSpectraVariables(sps_dda) + +## This reduced the size of the object slightly +print(object.size(sps_dda), units = "MB") +print(object.size(sps_noNA), units = "MB") + +## With the `selectSpectraVariables()` function it is in addition possible +## to subset the data of a `Spectra` to the selected columns/variables, +## keeping only their data: +tmp <- selectSpectraVariables(sps_dda, c("msLevel", "mz", "intensity", + "scanIndex")) +print(object.size(tmp), units = "MB") + +## All data except the selected variables is now removed. Accessing +## core spectra variables still works, but returns only NA +rtime(tmp) |> head() + + +## -------- FILTER PEAKS DATA -------- + +## `filterMzValues()` filters the mass peaks data of a `Spectra` retaining +## only those mass peaks with an m/z value matching the provided value(s). +sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), tolerance = 0.3) + +## The filtered `Spectra` has the same length +length(sps_dda) +length(sps_sub) + +## But the number of mass peaks changed +lengths(sps_dda) |> head() +lengths(sps_sub) |> head() + +## This function can also be used to remove specific peaks from a spectrum +## by setting `keep = FALSE`. +sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), + tolerance = 0.3, keep = FALSE) +lengths(sps_sub) |> head() + +## With the `filterMzRange()` function it is possible to keep (or remove) +## mass peaks with m/z values within a specified numeric range. +sps_sub <- filterMzRange(sps_dda, mz = c(100, 150)) +lengths(sps_sub) |> head() + +## See also the `filterPeaksRanges()` function for a more flexible framework +## to filter mass peaks. + + +## Removing Fourier transform artefacts seen in Orbitrap data. + +## Loading an Orbitrap spectrum with artefacts. +data(fft_spectrum) +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + +fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) +fft_spectrum +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + +## Using a few example peaks in your data you can optimize the parameters +fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, + halfWindowSize = 0.2, + threshold = 0.005, + keepIsotopes = TRUE, + maxCharge = 5, + isotopeTolerance = 0.005 + ) + +fft_spectrum_filtered +length(mz(fft_spectrum_filtered)[[1]]) +plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + + +## *Reducing* a `Spectra` keeping for groups of mass peaks (characterized +## by similarity of their m/z values) only one representative peak. This +## function helps to clean fragment spectra. +## Filter the data set to MS2 spectra +ms2 <- filterMsLevel(sps_dda, 2L) + +## For groups of fragment peaks with a difference in m/z < 0.1, keep only +## the largest one. +ms2_red <- reduceSpectra(ms2, ppm = 0, tolerance = 0.1) +lengths(ms2) |> tail() +lengths(ms2_red) |> tail() +} +\seealso{ +\itemize{ +\item \code{\link[=combineSpectra]{combineSpectra()}} for functions to combine or aggregate \code{Spectra}.
+\item \code{\link[=combinePeaks]{combinePeaks()}} for functions to combine or aggregate a \code{Spectra}'s +\code{peaksData()} +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf +} diff --git a/man/spectraData.Rd b/man/spectraData.Rd new file mode 100644 index 00000000..49d2bee3 --- /dev/null +++ b/man/spectraData.Rd @@ -0,0 +1,598 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{spectraData} +\alias{spectraData} +\alias{acquisitionNum} +\alias{centroided} +\alias{collisionEnergy} +\alias{dataOrigin} +\alias{dataStorage} +\alias{intensity} +\alias{ionCount} +\alias{isCentroided} +\alias{isEmpty} +\alias{isolationWindowLowerMz} +\alias{isolationWindowUpperMz} +\alias{isolationWindowTargetMz} +\alias{lengths} +\alias{msLevel} +\alias{mz} +\alias{peaksData} +\alias{peaksVariables} +\alias{polarity} +\alias{precursorCharge} +\alias{precursorIntensity} +\alias{precursorMz} +\alias{rtime} +\alias{scanIndex} +\alias{smoothed} +\alias{spectraNames} +\alias{spectraVariables} +\alias{tic} +\alias{uniqueMsLevels} +\alias{asDataFrame} +\alias{acquisitionNum,Spectra-method} +\alias{centroided,Spectra-method} +\alias{centroided<-,Spectra-method} +\alias{collisionEnergy,Spectra-method} +\alias{collisionEnergy<-,Spectra-method} +\alias{coreSpectraVariables} +\alias{dataOrigin,Spectra-method} +\alias{dataOrigin<-,Spectra-method} +\alias{dataStorage,Spectra-method} +\alias{intensity,Spectra-method} +\alias{ionCount,Spectra-method} +\alias{isCentroided,Spectra-method} +\alias{isEmpty,Spectra-method} +\alias{isolationWindowLowerMz,Spectra-method} +\alias{isolationWindowLowerMz<-,Spectra-method} +\alias{isolationWindowTargetMz,Spectra-method} +\alias{isolationWindowTargetMz<-,Spectra-method} +\alias{isolationWindowUpperMz,Spectra-method} +\alias{isolationWindowUpperMz<-,Spectra-method} +\alias{length,Spectra-method} +\alias{lengths,Spectra-method} +\alias{msLevel,Spectra-method} +\alias{mz,Spectra-method} +\alias{peaksData,Spectra-method} +\alias{peaksVariables,Spectra-method} +\alias{polarity,Spectra-method} +\alias{polarity<-,Spectra-method} +\alias{precScanNum,Spectra-method} +\alias{precursorCharge,Spectra-method} +\alias{precursorIntensity,Spectra-method} +\alias{precursorMz,Spectra-method} +\alias{rtime,Spectra-method} +\alias{rtime<-,Spectra-method} +\alias{scanIndex,Spectra-method} +\alias{smoothed,Spectra-method} +\alias{smoothed<-,Spectra-method} +\alias{spectraData,Spectra-method} +\alias{spectraData<-,Spectra-method} +\alias{spectraNames,Spectra-method} +\alias{spectraNames<-,Spectra-method} +\alias{spectraVariables,Spectra-method} +\alias{tic,Spectra-method} +\alias{uniqueMsLevels,Spectra-method} +\alias{$,Spectra-method} +\alias{$<-,Spectra-method} +\alias{[[,Spectra-method} +\alias{[[<-,Spectra-method} +\title{Accessing mass spectrometry data} +\usage{ +asDataFrame( + object, + i = seq_along(object), + spectraVars = spectraVariables(object) +) + +\S4method{acquisitionNum}{Spectra}(object) + +\S4method{centroided}{Spectra}(object) + +\S4method{centroided}{Spectra}(object) <- value + +\S4method{collisionEnergy}{Spectra}(object) + +\S4method{collisionEnergy}{Spectra}(object) <- value + +coreSpectraVariables() + +\S4method{dataOrigin}{Spectra}(object) + +\S4method{dataOrigin}{Spectra}(object) <- value + +\S4method{dataStorage}{Spectra}(object) + +\S4method{intensity}{Spectra}(object, f = processingChunkFactor(object), ...) 
+ +\S4method{ionCount}{Spectra}(object) + +\S4method{isCentroided}{Spectra}(object, ...) + +\S4method{isEmpty}{Spectra}(x) + +\S4method{isolationWindowLowerMz}{Spectra}(object) + +\S4method{isolationWindowLowerMz}{Spectra}(object) <- value + +\S4method{isolationWindowTargetMz}{Spectra}(object) + +\S4method{isolationWindowTargetMz}{Spectra}(object) <- value + +\S4method{isolationWindowUpperMz}{Spectra}(object) + +\S4method{isolationWindowUpperMz}{Spectra}(object) <- value + +\S4method{length}{Spectra}(x) + +\S4method{lengths}{Spectra}(x, use.names = FALSE) + +\S4method{msLevel}{Spectra}(object) + +\S4method{mz}{Spectra}(object, f = processingChunkFactor(object), ...) + +\S4method{peaksData}{Spectra}( + object, + columns = c("mz", "intensity"), + f = processingChunkFactor(object), + ..., + BPPARAM = bpparam() +) + +\S4method{peaksVariables}{Spectra}(object) + +\S4method{polarity}{Spectra}(object) + +\S4method{polarity}{Spectra}(object) <- value + +\S4method{precScanNum}{Spectra}(object) + +\S4method{precursorCharge}{Spectra}(object) + +\S4method{precursorIntensity}{Spectra}(object) + +\S4method{precursorMz}{Spectra}(object) + +\S4method{rtime}{Spectra}(object) + +\S4method{rtime}{Spectra}(object) <- value + +\S4method{scanIndex}{Spectra}(object) + +\S4method{smoothed}{Spectra}(object) + +\S4method{smoothed}{Spectra}(object) <- value + +\S4method{spectraData}{Spectra}(object, columns = spectraVariables(object)) + +\S4method{spectraData}{Spectra}(object) <- value + +\S4method{spectraNames}{Spectra}(object) + +\S4method{spectraNames}{Spectra}(object) <- value + +\S4method{spectraVariables}{Spectra}(object) + +\S4method{tic}{Spectra}(object, initial = TRUE) + +\S4method{uniqueMsLevels}{Spectra}(object, ...) + +\S4method{$}{Spectra}(x, name) + +\S4method{$}{Spectra}(x, name) <- value + +\S4method{[[}{Spectra}(x, i, j, ...) + +\S4method{[[}{Spectra}(x, i, j, ...) <- value +} +\arguments{ +\item{object}{A \code{Spectra} object.} + +\item{i}{For \code{asDataFrame()}: A \code{numeric} indicating which scans to coerce +to a \code{DataFrame} (default is \code{seq_along(object)}).} + +\item{spectraVars}{\code{character()} indicating what spectra variables to add to +the \code{DataFrame}. Default is \code{spectraVariables(object)}, i.e. all +available variables.} + +\item{value}{A vector with values to replace the respective spectra +variable. Needs to be of the correct data type for the spectra variable.} + +\item{f}{For \code{intensity()}, \code{mz()} and \code{peaksData()}: factor defining how +data should be loaded and processed chunk-wise. Defaults to +\code{\link[=processingChunkFactor]{processingChunkFactor()}}.} + +\item{...}{Additional arguments.} + +\item{x}{A \code{Spectra} object.} + +\item{use.names}{For \code{lengths()}: ignored.} + +\item{columns}{For \code{spectraData()} accessor: optional \code{character} with +column names (spectra variables) that should be included in the +returned \code{DataFrame}. By default, all columns are returned. +For \code{peaksData()} accessor: optional \code{character} with requested columns +in the individual \code{matrix} of the returned \code{list}. Defaults to +\code{c("mz", "intensity")} but any values returned by \code{peaksVariables(object)} +with \code{object} being the \code{Spectra} object are supported.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information.
See also \code{\link[=processingChunkSize]{processingChunkSize()}} for more information +on parallel processing.} + +\item{initial}{For \code{tic()}: \code{logical(1)} whether the initially +reported total ion current should be reported, or whether the +total ion current should be (re)calculated on the actual data +(\code{initial = FALSE}, same as \code{ionCount()}).} + +\item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return +or set.} + +\item{j}{For \code{[}: not supported.} +} +\description{ +As detailed in the documentation of the \link{Spectra} class, a \code{Spectra} object +is a container for mass spectrometry (MS) data that includes both the mass +peaks data (or \emph{peaks data}, generally \emph{m/z} and intensity values) as well +as spectra metadata (so called \emph{spectra variables}). Spectra variables +generally define one value per spectrum, while for peaks variables one value +per mass peak is defined and hence multiple values per spectrum (depending +on the number of mass peaks of a spectrum). + +Data can be extracted from a \code{Spectra} object using dedicated accessor +functions or also using the \code{$} operator. Depending on the backend class +used by the \code{Spectra} to represent the data, data can also be added or +replaced (again, using dedicated functions or using \verb{$<-}). +} +\section{Spectra variables}{ + + +A common set of \emph{core spectra variables} are defined for \code{Spectra}. These +have a pre-defined data type and each \code{Spectra} will return a value for +these if requested. If no value for a spectra variable is defined, a missing +value (of the correct data type) is returned. The list of core spectra +variables and their respective data type is: +\itemize{ +\item \emph{acquisitionNum} \code{integer(1)}: the index of acquisition of a spectrum +during an MS run. +\item \emph{centroided} \code{logical(1)}: whether the spectrum is in profile or centroid +mode. +\item \emph{collisionEnergy} \code{numeric(1)}: collision energy used to create an MSn +spectrum. +\item \emph{dataOrigin} \code{character(1)}: the \emph{origin} of the spectrum's data, e.g. the +mzML file from which it was read. +\item \emph{dataStorage} \code{character(1)}: the (current) storage location of the +spectrum data. This value depends on the backend used to handle and +provide the data. For an \emph{in-memory} backend like the \code{MsBackendDataFrame} +this will be \code{""}, for an on-disk backend such as the +\code{MsBackendHdf5Peaks} it will be the name of the HDF5 file where the +spectrum's peak data is stored. +\item \emph{isolationWindowLowerMz} \code{numeric(1)}: lower m/z for the isolation +window in which the (MSn) spectrum was measured. +\item \emph{isolationWindowTargetMz} \code{numeric(1)}: the target m/z for the isolation +window in which the (MSn) spectrum was measured. +\item \emph{isolationWindowUpperMz} \code{numeric(1)}: upper m/z for the isolation window +in which the (MSn) spectrum was measured. +\item \emph{msLevel} \code{integer(1)}: the MS level of the spectrum. +\item \emph{polarity} \code{integer(1)}: the polarity of the spectrum (\code{0} and \code{1} +representing negative and positive polarity, respectively). +\item \emph{precScanNum} \code{integer(1)}: the scan (acquisition) number of the precursor +for an MSn spectrum. +\item \emph{precursorCharge} \code{integer(1)}: the charge of the precursor of an MSn +spectrum. +\item \emph{precursorIntensity} \code{numeric(1)}: the intensity of the precursor of an +MSn spectrum. 
+\item \emph{precursorMz} \code{numeric(1)}: the m/z of the precursor of an MSn spectrum. +\item \emph{rtime} \code{numeric(1)}: the retention time of a spectrum. +\item \emph{scanIndex} \code{integer(1)}: the index of a spectrum within a (raw) file. +\item \emph{smoothed} \code{logical(1)}: whether the spectrum was smoothed. +} + +For each of these spectra variables a dedicated accessor function is defined +(such as \code{msLevel()} or \code{rtime()}) that allows to extract the values of +that spectra variable for all spectra in a \code{Spectra} object. Also, +replacement functions are defined, but not all backends might support +replacing values for spectra variables. As described above, additional +spectra variables can be defined or added. The \code{spectraVariables()} function +can be used to list the names of all spectra variables available in a +\code{Spectra} object. + +Values for multiple spectra variables, or for all of them, can be +extracted with the \code{spectraData()} function. +} + +\section{Peaks variables}{ + + +A \code{Spectra} object also provides mass peak data, with the \emph{m/z} and +intensity values being the \emph{core} peaks variables: +\itemize{ +\item \emph{intensity} \code{numeric}: intensity values for the spectrum's peaks. +\item \emph{mz} \code{numeric}: the m/z values for the spectrum's peaks. +} + +Values for these can be extracted with the \code{mz()} and \code{intensity()} +functions, or the \code{peaksData()} function. The former functions return a +\code{NumericList} with the respective values, while the latter returns a \code{List} +with \code{numeric} two-column matrices. The list of peaks matrices can also +be extracted using \code{as(x, "list")} or \code{as(x, "SimpleList")} with \code{x} being +a \code{Spectra} object. + +Some \code{Spectra}/backends also provide values for additional peaks variables. +The set of available peaks variables can be extracted with the +\code{peaksVariables()} function. +} + +\section{Functions to access MS data}{ + + +The set of available functions to extract data from, or set data in, a +\code{Spectra} object is listed below (in alphabetical order). Note that there +are also other functions to extract information from a \code{Spectra} object +documented in \code{\link[=addProcessing]{addProcessing()}}. +\itemize{ +\item \code{$}, \verb{$<-}: gets (or sets) a spectra variable for all spectra in \code{object}. +See examples for details. Note that replacing values of a peaks variable +is not supported with a non-empty processing queue, i.e. if any filtering +or data manipulations on the peaks data were performed. In these cases +\code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all cached data +operations. +\item \code{[[}, \verb{[[<-}: access or set/add a single spectra variable (column) in the +backend. +\item \code{acquisitionNum()}: returns the acquisition number of each +spectrum. Returns an \code{integer} of length equal to the number of +spectra (with \code{NA_integer_} if not available). +\item \code{asDataFrame()}: converts the \code{Spectra} to a \code{DataFrame} (in long format) +containing all data. Returns a \code{DataFrame}. +\item \code{centroided()}, \verb{centroided<-}: gets or sets the centroiding +information of the spectra. \code{centroided()} returns a \code{logical} +vector of length equal to the number of spectra with \code{TRUE} if a +spectrum is centroided, \code{FALSE} if it is in profile mode and \code{NA} +if it is undefined. See also \code{isCentroided()} for estimating from +the spectrum data whether the spectrum is centroided.
\code{value} +for \verb{centroided<-} is either a single \code{logical} or a \code{logical} of +length equal to the number of spectra in \code{object}. +\item \code{collisionEnergy()}, \verb{collisionEnergy<-}: gets or sets the +collision energy for all spectra in \code{object}. \code{collisionEnergy()} +returns a \code{numeric} with length equal to the number of spectra +(\code{NA_real_} if not present/defined), \verb{collisionEnergy<-} takes a +\code{numeric} of length equal to the number of spectra in \code{object}. +\item \code{coreSpectraVariables()}: returns the \emph{core} spectra variables along with +their expected data type. +\item \code{dataOrigin()}, \verb{dataOrigin<-}: gets or sets the \emph{data origin} for each +spectrum. \code{dataOrigin()} returns a \code{character} vector (same length than +\code{object}) with the origin of the spectra. \verb{dataOrigin<-} expects a +\code{character} vector (same length than \code{object}) with the replacement +values for the data origin of each spectrum. +\item \code{dataStorage()}: returns a \code{character} vector (same length than \code{object}) +with the data storage location of each spectrum. +\item \code{intensity()}: gets the intensity values from the spectra. Returns +a \code{\link[=NumericList]{NumericList()}} of \code{numeric} vectors (intensity values for each +spectrum). The length of the list is equal to the number of +\code{spectra} in \code{object}. +\item \code{ionCount()}: returns a \code{numeric} with the sum of intensities for +each spectrum. If the spectrum is empty (see \code{isEmpty()}), +\code{NA_real_} is returned. +\item \code{isCentroided()}: a heuristic approach assessing if the spectra in +\code{object} are in profile or centroided mode. The function takes +the \code{qtl}th quantile top peaks, then calculates the difference +between adjacent m/z value and returns \code{TRUE} if the first +quartile is greater than \code{k}. (See \code{Spectra:::.isCentroided()} for +the code.) +\item \code{isEmpty()}: checks whether a spectrum in \code{object} is empty +(i.e. does not contain any peaks). Returns a \code{logical} vector of +length equal number of spectra. +\item \code{isolationWindowLowerMz()}, \verb{isolationWindowLowerMz<-}: gets or sets the +lower m/z boundary of the isolation window. +\item \code{isolationWindowTargetMz()}, \verb{isolationWindowTargetMz<-}: gets or sets the +target m/z of the isolation window. +\item \code{isolationWindowUpperMz()}, \verb{isolationWindowUpperMz<-}: gets or sets the +upper m/z boundary of the isolation window. +\item \code{length()}: gets the number of spectra in the object. +\item \code{lengths()}: gets the number of peaks (m/z-intensity values) per +spectrum. Returns an \code{integer} vector (length equal to the +number of spectra). For empty spectra, \code{0} is returned. +\item \code{msLevel()}: gets the spectra's MS level. Returns an integer vector (names +being spectrum names, length equal to the number of spectra) with the MS +level for each spectrum. +\item \code{mz()}: gets the mass-to-charge ratios (m/z) from the +spectra. Returns a \code{\link[=NumericList]{NumericList()}} or length equal to the number of +spectra, each element a \code{numeric} vector with the m/z values of +one spectrum. +\item \code{peaksData()}: gets the \emph{peaks} data for all spectra in \code{object}. Peaks +data consist of the m/z and intensity values as well as possible additional +annotations (variables) of all peaks of each spectrum. 
The function +returns a \code{\link[=SimpleList]{SimpleList()}} of two-dimensional arrays (either \code{matrix} or +\code{data.frame}), with each array providing the values for the requested +\emph{peak variables} (by default \code{"mz"} and \code{"intensity"}). Optional parameter +\code{columns} is passed to the backend's \code{peaksData()} function to allow +the selection of specific (or additional) peaks variables (columns) that +should be extracted (if available). Importantly, +it is \strong{not} guaranteed that each backend supports this parameter (while +each backend must support extraction of \code{"mz"} and \code{"intensity"} columns). +Parameter \code{columns} defaults to \code{c("mz", "intensity")} but any value +returned by \code{peaksVariables(object)} is supported. +Note also that it is possible to extract the peak data with +\code{as(x, "list")} and \code{as(x, "SimpleList")} as a \code{list} and \code{SimpleList}, +respectively. Note however that, in contrast to \code{peaksData()}, \code{as()} +does not support the parameter \code{columns}. +\item \code{peaksVariables()}: lists the available variables for mass peaks provided +by the backend. Default peak variables are \code{"mz"} and \code{"intensity"} (which +all backends need to support and provide), but some backends might provide +additional variables. +These variables correspond to the column names of the peak data array +returned by \code{peaksData()}. +\item \code{polarity()}, \verb{polarity<-}: gets or sets the polarity for each +spectrum. \code{polarity()} returns an \code{integer} vector (length equal +to the number of spectra), with \code{0} and \code{1} representing negative +and positive polarities, respectively. \verb{polarity<-} expects an +\code{integer} vector of length 1 or equal to the number of spectra. +\item \code{precursorCharge()}, \code{precursorIntensity()}, \code{precursorMz()}, +\code{precScanNum()}, \code{precAcquisitionNum()}: gets the charge (\code{integer}), +intensity (\code{numeric}), m/z (\code{numeric}), scan index (\code{integer}) +and acquisition number (\code{integer}) of the precursor for MSn spectra +(MS level 2 and above) from the object. Returns a vector of length equal to +the number of spectra in \code{object}. \code{NA} is reported for MS1 +spectra or if no precursor information is available. +\item \code{rtime()}, \verb{rtime<-}: gets or sets the retention times (in seconds) +for each spectrum. \code{rtime()} returns a \code{numeric} vector (length +equal to the number of spectra) with the retention time for each +spectrum. \verb{rtime<-} expects a numeric vector with length equal +to the number of spectra. +\item \code{scanIndex()}: returns an \code{integer} vector with the \emph{scan index} +for each spectrum. This represents the relative index of the +spectrum within each file. Note that this can be different from the +\code{acquisitionNum} of the spectrum which represents the index of the +spectrum during acquisition/measurement (as reported in the mzML file). +\item \code{smoothed()}, \verb{smoothed<-}: gets or sets whether a spectrum is +\emph{smoothed}. \code{smoothed()} returns a \code{logical} vector of length equal +to the number of spectra. \verb{smoothed<-} takes a \code{logical} vector +of length 1 or equal to the number of spectra in \code{object}. +\item \code{spectraData()}: gets general spectrum metadata (annotation, also called +header). \code{spectraData()} returns a \code{DataFrame}. Note that this +method does by default \strong{not} return m/z or intensity values.
+\item \verb{spectraData<-}: \strong{replaces} the full spectra data of the \code{Spectra} +object with the one provided with \code{value}. The \verb{spectraData<-} function +expects a \code{DataFrame} to be passed as value with the same number of rows +as there are spectra in \code{object}. Note that replacing values of +peaks variables is not supported with a non-empty processing queue, i.e. +if any filtering or data manipulations on the peaks data were performed. +In these cases \code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all +cached data operations and empty the processing queue. +\item \code{spectraNames()}, \verb{spectraNames<-}: gets or sets the spectra names. +\item \code{spectraVariables()}: returns a \code{character} vector with the +available spectra variables (columns, fields or attributes of each +spectrum) available in \code{object}. Note that \code{spectraVariables()} does not +list the \emph{peak variables} (\code{"mz"}, \code{"intensity"} and any additional +annotations for each MS peak). Peak variables are returned by +\code{peaksVariables()}. +\item \code{tic()}: gets the total ion current/count (sum of signal of a +spectrum) for all spectra in \code{object}. By default, the value +reported in the original raw data file is returned. For an empty +spectrum, \code{0} is returned. +\item \code{uniqueMsLevels()}: gets the unique MS levels available in \code{object}. This +function is supposed to be more efficient than \code{unique(msLevel(object))}. +} +} + +\examples{ + +## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +## backend. +sciex_file <- dir(system.file("sciex", package = "msdata"), + full.names = TRUE) +sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +sciex + +## Get the number of spectra in the data set +length(sciex) + +## Get the number of mass peaks per spectrum - limit to the first 6 +lengths(sciex) |> head() + +## Get the MS level for each spectrum - limit to the first 6 spectra +msLevel(sciex) |> head() + +## Alternatively, we could also use $ to access a specific spectra variable. +## This could also be used to add additional spectra variables to the +## object (see further below). +sciex$msLevel |> head() + +## Get the intensity and m/z values. +intensity(sciex) +mz(sciex) + +## Convert a subset of the Spectra object to a long DataFrame. +asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) + +## Create a Spectra providing a `DataFrame` containing the spectrum data. + +spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) + +s <- Spectra(spd) +s + +## List all available spectra variables (i.e. spectrum data and metadata). +spectraVariables(s) + +## For all *core* spectra variables accessor functions are available. These +## return NA if the variable was not set. +centroided(s) +dataStorage(s) +rtime(s) +precursorMz(s) + +## The core spectra variables are: +coreSpectraVariables() + +## Add an additional metadata column. +s$spectrum_id <- c("sp_1", "sp_2") + +## List spectra variables, "spectrum_id" is now also listed +spectraVariables(s) + +## Get the values for the new spectra variable +s$spectrum_id + +## Extract specific spectra variables. +spectraData(s, columns = c("spectrum_id", "msLevel"))
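+ +## Values of spectra variables can also be replaced (provided the backend +## supports it; the default in-memory backend used here does). As an +## illustration, update the retention times of the two spectra: +rtime(s) <- c(2.1, 2.2) +rtime(s)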
+ + +## -------- PEAKS VARIABLES AND DATA -------- + +## Get the peak data (m/z and intensity values). +pks <- peaksData(s) +pks +pks[[1]] +pks[[2]] + +## Note that we could get the same result by coercing the `Spectra` to +## a `list` or `SimpleList`: +as(s, "list") +as(s, "SimpleList") + +## Or use `mz()` and `intensity()` to extract the m/z and intensity values +## separately +mz(s) +intensity(s) + +## Some `MsBackend` classes provide support for arbitrary peaks variables +## (in addition to the mandatory `"mz"` and `"intensity"` values). Below +## we create a simple data frame with an additional peak variable `"pk_ann"` +## and create a `Spectra` with an `MsBackendMemory` for that data. +## Importantly, the number of values (per spectrum) needs to be the same +## for all peak variables. + +tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5)) +tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1)) +tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45)) +tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P")) + +## Create the Spectra. With parameter `peaksVariables` we can define +## the columns in `tmp` that contain peaks variables. +sps <- Spectra(tmp, source = MsBackendMemory(), + peaksVariables = c("mz", "intensity", "pk_ann")) +peaksVariables(sps) + +## Extract just the m/z and intensity values +peaksData(sps)[[1L]] + +## Extract the full peaks data +peaksData(sps, columns = peaksVariables(sps))[[1L]] + +## Access just the pk_ann variable +sps$pk_ann + + +} +\seealso{ +\itemize{ +\item \code{\link[=addProcessing]{addProcessing()}} for functions to analyze \code{Spectra}. +\item \link{Spectra} for a general description of the \code{Spectra} object. +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail +}