diff --git a/.Rbuildignore b/.Rbuildignore index 22a5d1be..37442c70 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,6 +1,7 @@ ^\.github$ .editorconfig .travis.yml +man/figures* local_data favicon logo.png diff --git a/.editorconfig b/.editorconfig index 71842659..0cebcc70 100644 --- a/.editorconfig +++ b/.editorconfig @@ -6,7 +6,7 @@ root = true charset = utf-8 end_of_line = lf trim_trailing_whitespace = true -insert_final_newline = false +insert_final_newline = true [*.R] indent_style = space @@ -22,4 +22,4 @@ indent_style = tab [*.yml] indent_style = space -indent_size = 2 +indent_size = 2 \ No newline at end of file diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index c7c036d7..5c1ebc66 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -22,7 +22,8 @@ on: push: - pull_request: + paths-ignore: + - 'README.md' name: R-CMD-check-bioc @@ -52,9 +53,9 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - - { os: macOS-latest, r: 'next', bioc: '3.19'} - - { os: windows-latest, r: 'next', bioc: '3.19'} + - { os: ubuntu-latest, r: 'devel', bioc: 'devel', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + - { os: macOS-latest, r: '4.4', bioc: '3.20'} + - { os: windows-latest, r: '4.4', bioc: '3.20'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} @@ -125,32 +126,7 @@ jobs: - name: Install macOS system dependencies if: matrix.config.os == 'macOS-latest' run: | - ## Enable installing XML from source if needed - brew install libxml2 - echo "XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV - - ## Required to install magick as noted at - ## 
https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 - brew install imagemagick@6 - - ## For textshaping, required by ragg, and required by pkgdown - brew install harfbuzz fribidi - - brew install libgit2 - ## Helps compile RCurl from source - ## brew uninstall curl - - ## required for ncdf4 - can not use the homebrew one because that uses GCC - ## Use pre-compiled libraries from https://mac.r-project.org/libs-4/ - curl -O https://mac.r-project.org/libs-4/netcdf-4.7.4-darwin.17-x86_64.tar.gz - tar fvxzm netcdf-4.7.4-darwin.17-x86_64.tar.gz -C / - rm netcdf-4.7.4-darwin.17-x86_64.tar.gz - curl -O https://mac.r-project.org/libs-4/hdf5-1.12.0-darwin.17-x86_64.tar.gz - tar fvxzm hdf5-1.12.0-darwin.17-x86_64.tar.gz -C / - rm hdf5-1.12.0-darwin.17-x86_64.tar.gz - curl -O https://mac.r-project.org/libs-4/szip-2.1.1-darwin.17-x86_64.tar.gz - tar fvxzm szip-2.1.1-darwin.17-x86_64.tar.gz -C / - rm szip-2.1.1-darwin.17-x86_64.tar.gz + shell: Rscript {0} - name: Install Windows system dependencies if: runner.os == 'Windows' @@ -263,7 +239,7 @@ jobs: - name: Test coverage if: github.ref == 'refs/heads/main' && env.run_covr == 'true' && runner.os == 'Linux' run: | - covr::codecov() + covr::codecov(token = "${{ secrets.CODECOV_TOKEN }}") shell: Rscript {0} - name: Install package diff --git a/DESCRIPTION b/DESCRIPTION index 6608f837..b1a33c7d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Spectra Title: Spectra Infrastructure for Mass Spectrometry Data -Version: 1.15.0 +Version: 1.17.2 Description: The Spectra package defines an efficient infrastructure for storing and handling mass spectrometry spectra and functionality to subset, process, visualize and compare spectra data. 
It provides different @@ -40,9 +40,9 @@ Authors@R: c(person(given = "RforMassSpectrometry Package Maintainer", Depends: R (>= 4.0.0), S4Vectors, - BiocParallel, - ProtGenerics (>= 1.35.4) + BiocParallel Imports: + ProtGenerics (>= 1.37.1), methods, IRanges, MsCoreUtils (>= 1.7.5), diff --git a/NAMESPACE b/NAMESPACE index 3e9d518a..d70ef776 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(MsBackendMemory) export(MsBackendMzR) export(PrecursorMzParam) export(applyProcessing) +export(asDataFrame) export(chunkapply) export(combinePeaksData) export(combineSpectra) @@ -16,8 +17,8 @@ export(concatenateSpectra) export(coreSpectraVariables) export(countIdentifications) export(deisotopeSpectra) -export(estimatePrecursorIntensity) export(estimatePrecursorMz) +export(filterPeaksRanges) export(filterPrecursorIsotopes) export(filterPrecursorMaxIntensity) export(filterPrecursorPeaks) @@ -50,13 +51,16 @@ exportMethods("centroided<-") exportMethods("collisionEnergy<-") exportMethods("dataOrigin<-") exportMethods("dataStorage<-") +exportMethods("dataStorageBasePath<-") exportMethods("intensity<-") exportMethods("isolationWindowLowerMz<-") exportMethods("isolationWindowTargetMz<-") exportMethods("isolationWindowUpperMz<-") +exportMethods("msLevel<-") exportMethods("mz<-") exportMethods("peaksData<-") exportMethods("polarity<-") +exportMethods("precursorMz<-") exportMethods("rtime<-") exportMethods("smoothed<-") exportMethods("spectraData<-") @@ -68,6 +72,7 @@ exportMethods(backendBpparam) exportMethods(backendInitialize) exportMethods(backendMerge) exportMethods(backendParallelFactor) +exportMethods(backendRequiredSpectraVariables) exportMethods(bin) exportMethods(c) exportMethods(cbind2) @@ -79,9 +84,12 @@ exportMethods(containsMz) exportMethods(containsNeutralLoss) exportMethods(dataOrigin) exportMethods(dataStorage) +exportMethods(dataStorageBasePath) exportMethods(dropNaSpectraVariables) exportMethods(entropy) +exportMethods(estimatePrecursorIntensity) 
exportMethods(export) +exportMethods(extractByIndex) exportMethods(filterAcquisitionNum) exportMethods(filterDataOrigin) exportMethods(filterDataStorage) @@ -158,6 +166,7 @@ importFrom(MsCoreUtils,coefMA) importFrom(MsCoreUtils,coefSG) importFrom(MsCoreUtils,coefWMA) importFrom(MsCoreUtils,common) +importFrom(MsCoreUtils,common_path) importFrom(MsCoreUtils,entropy) importFrom(MsCoreUtils,group) importFrom(MsCoreUtils,i2index) @@ -201,6 +210,7 @@ importFrom(methods,.hasSlot) importFrom(methods,.valueClassTest) importFrom(methods,as) importFrom(methods,callNextMethod) +importFrom(methods,existsMethod) importFrom(methods,is) importFrom(methods,new) importFrom(methods,setAs) @@ -225,9 +235,11 @@ importMethodsFrom(ProtGenerics,"intensity<-") importMethodsFrom(ProtGenerics,"isolationWindowLowerMz<-") importMethodsFrom(ProtGenerics,"isolationWindowTargetMz<-") importMethodsFrom(ProtGenerics,"isolationWindowUpperMz<-") +importMethodsFrom(ProtGenerics,"msLevel<-") importMethodsFrom(ProtGenerics,"mz<-") importMethodsFrom(ProtGenerics,"peaksData<-") importMethodsFrom(ProtGenerics,"polarity<-") +importMethodsFrom(ProtGenerics,"precursorMz<-") importMethodsFrom(ProtGenerics,"rtime<-") importMethodsFrom(ProtGenerics,"smoothed<-") importMethodsFrom(ProtGenerics,"spectraData<-") @@ -244,6 +256,7 @@ importMethodsFrom(ProtGenerics,collisionEnergy) importMethodsFrom(ProtGenerics,compareSpectra) importMethodsFrom(ProtGenerics,dataOrigin) importMethodsFrom(ProtGenerics,dataStorage) +importMethodsFrom(ProtGenerics,estimatePrecursorIntensity) importMethodsFrom(ProtGenerics,filterAcquisitionNum) importMethodsFrom(ProtGenerics,filterDataOrigin) importMethodsFrom(ProtGenerics,filterDataStorage) diff --git a/NEWS.md b/NEWS.md index e76fb0bb..2e469261 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,80 @@ -# Spectra 1.15 +# Spectra 1.17 -## Changes in 1.15.0 +## Changes in 1.17.2 - Add `cbind2()` method to easily add multiple `spectraVariables` to the `spectraData` +## Changes in 1.17.1 + +- 
Refactor `containsMz()` to support chunk-wise processing. + +# Spectra 1.15 + +## Changes in 1.15.13 + +- Add `precursorMz<-` method [issue #336](https://github.com/rformassspectrometry/Spectra/issues/336). + +## Changes in 1.15.12 + +- Add generic `backendRequiredSpectraVariables()` to allow definition of + mandatory spectra variables for a backend. + +## Changes in 1.15.11 + +- Add reference to `MsBackendMetaboLights`. + +## Changes in 1.15.10 + +- Add new `extractByIndex()` generic and implementation for `MsBackend`. Fixes + [issue #5](https://github.com/rformassspectrometry/MsBackendMetaboLights/issues/5). + +## Changes in 1.15.9 + +- Restructure and reorganize documentation for `Spectra`. + +## Changes in 1.15.8 + +- Refactor the `Spectra()` constructor method: better support for + initialization of backends that define their own specific parameters. + +## Changes in 1.15.7 + +- Change `estimatePrecursorIntensity()` to a method to avoid overrides/clashes + with the same-named implementation in *xcms*. + +## Changes in 1.15.6 + +- Fix in `selectSpectraVariables()` for `MsBackendMzR`: ensure peaks variables + `"mz"` and `"intensity"` are not by default removed. + +## Changes in 1.15.5 + +- Add new `filterPeaksRanges()` function to filter mass peaks by ranges on + numeric spectra or peak variables. + +## Changes in 1.15.3 + +- For evaluation of the `Spectra`'s processing queue: call functions from the + *MetaboCoreUtils* directly through their namespace (`MetaboCoreUtils::`) to avoid + errors if performed in parallel on Windows machines or if called on a + re-loaded object. +- New `asDataFrame()` function to convert a (small) `Spectra` object + into a long `DataFrame`. + +## Changes in 1.15.2 + +- Add `dataStorageBasePath()` and `dataStorageBasePath<-` methods to allow + updating/adapting the path of the data storage files of backends supporting + that [issue #321](https://github.com/rformassspectrometry/Spectra/issues/321). 
+ +## Changes in 1.15.1 + +- Improve documentation for `combineSpectra()` and `combinePeaks()` [issue + #320](https://github.com/rformassspectrometry/Spectra/issues/320). + +# Spectra 1.13 + ## Changes in 1.13.8 - Add `estimatePrecursorMz()` function to *estimate* the precursor m/z for DDA diff --git a/R/AllGenerics.R b/R/AllGenerics.R index f68500ad..856cb69e 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -1,42 +1,40 @@ #' @include hidden_aliases.R NULL +setGeneric("backendRequiredSpectraVariables", function(object, ...) + standardGeneric("backendRequiredSpectraVariables")) #' @rdname hidden_aliases setMethod("bin", "numeric", MsCoreUtils::bin) setGeneric("combinePeaks", function(object, ...) standardGeneric("combinePeaks")) -#' @rdname hidden_aliases setGeneric("containsMz", function(object, ...) standardGeneric("containsMz")) -#' @rdname hidden_aliases setGeneric("containsNeutralLoss", function(object, ...) standardGeneric("containsNeutralLoss")) -#' @rdname hidden_aliases +setGeneric("dataStorageBasePath", function(object, ...) + standardGeneric("dataStorageBasePath")) +setGeneric("dataStorageBasePath<-", function(object, ..., value) + standardGeneric("dataStorageBasePath<-")) setGeneric("dropNaSpectraVariables", function(object, ...) standardGeneric("dropNaSpectraVariables")) -#' @rdname hidden_aliases setGeneric("entropy", function(object, ...) standardGeneric("entropy")) -#' @rdname hidden_aliases setGeneric("export", function(object, ...) standardGeneric("export")) +setGeneric("extractByIndex", function(object, i) + standardGeneric("extractByIndex")) setGeneric("filterFourierTransformArtefacts", function(object, ...) standardGeneric("filterFourierTransformArtefacts")) -#' @rdname neutralLoss setGeneric("neutralLoss", function(object, param, ...) standardGeneric("neutralLoss")) -#' @rdname hidden_aliases setGeneric("pickPeaks", function(object, ...) standardGeneric("pickPeaks")) setGeneric("plotSpectraMirror", function(x, y, ...) 
standardGeneric("plotSpectraMirror")) -#' @rdname hidden_aliases setGeneric("replaceIntensitiesBelow", function(object, threshold = min, ...) standardGeneric("replaceIntensitiesBelow")) -#' @rdname hidden_aliases setGeneric("reset", function(object, ...) standardGeneric("reset")) -#' @rdname hidden_aliases setGeneric("selectSpectraVariables", function(object, ...) standardGeneric("selectSpectraVariables")) setGeneric("Spectra", function(object, ...) standardGeneric("Spectra")) diff --git a/R/MsBackend.R b/R/MsBackend.R index 74945cbf..010dc963 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -11,6 +11,14 @@ #' @aliases backendInitialize #' @aliases backendParallelFactor,MsBackendMzR-method #' @aliases backendParallelFactor,MsBackendHdf5Peaks-method +#' @aliases dataStorageBasePath +#' @aliases dataStorageBasePath,MsBackendMzR-method +#' @aliases dataStorageBasePath<- +#' @aliases dataStorageBasePath<-,MsBackendMzR-method +#' @aliases extractByIndex +#' @aliases msLevel<-,MsBackend-method +#' @aliases backendRequiredSpectraVariables +#' @aliases backendRequiredSpectraVariables,MsBackend-method #' #' @description #' @@ -222,7 +230,9 @@ #' allowed. Parameter `i` should support `integer` indices and `logical` #' and should throw an error if `i` is out of bounds. The #' `MsCoreUtils::i2index` could be used to check the input `i`. -#' For `i = integer()` an empty backend should be returned. +#' For `i = integer()` an empty backend should be returned. Implementation +#' of this method is optional, as the default calls the `extractByIndex()` +#' method (which has to be implemented as the main subsetting method). #' #' - `$`, `$<-`: access or set/add a single spectrum variable (column) in the #' backend. Using a `value` of `NULL` should allow deleting the specified #' variable. @@ -276,6 +286,13 @@ #' `MsBackendMzR` on the other hand returns `factor(dataStorage(object))` #' hence suggesting to split the object by data file. 
#' +#' - `backendRequiredSpectraVariables()`: returns a `character` with spectra +#' variable names that are mandatory for a specific backend. The default +#' returns an empty `character()`. The implementation for `MsBackendMzR` +#' returns `c("dataStorage", "scanIndex")` as these two spectra variables +#' are required to load the MS data on-the-fly. This method needs only to +#' be implemented if a backend requires specific variables to be defined. +#' #' - `dataOrigin()`: gets a `character` of length equal to the number of #' spectra in `object` with the *data origin* of each spectrum. This could #' e.g. be the mzML file from which the data was read. @@ -284,6 +301,16 @@ #' spectra in `object` with the data storage of each spectrum. Note that #' missing values (`NA_character_`) are not supported for `dataStorage`. #' +#' - `dataStorageBasePath()`, `dataStorageBasePath<-`: gets or sets the common +#' *base* path of the directory containing all data files. If supported, +#' the function is expected to return (or accept) a `character` of length 1. +#' Most backends (such as for example the `MsBackendMemory`) will not support +#' this function and `dataStorageBasePath()` will return `NA_character_`. +#' For `MsBackendMzR`, this function allows to get or change the path to the +#' directory containing the original data files, which is required if e.g. +#' a serialized `MsBackendMzR` instance gets copied to another computer or +#' file system. +#' #' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the #' object's `spectraData` that contain only missing values (`NA`). Note that #' while columns with only `NA`s are removed, a `spectraData()` call after @@ -322,6 +349,17 @@ #' *mzML* or *mzXML* format. See the documentation for the `MsBackendMzR` #' class below for more information. #' +#' - `extractByIndex()`: function to subset a backend to selected elements +#' defined by the provided index. 
Similar to `[`, this method should allow +#' extracting (or to subset) the data in any order. In contrast to `[`, +#' however, `i` is expected to be an `integer` (while `[` should also +#' support `logical` and eventually `character`). While being apparently +#' redundant to `[`, this method avoids package namespace errors/problems +#' that can result in implementations of `[` being not found by R (which +#' can happen sometimes in parallel processing using the [SnowParam()]). This +#' method is used internally by `Spectra` to extract/subset its backend. +#' Implementation of this method is mandatory. +#' #' - `filterAcquisitionNum()`: filters the object keeping only spectra matching #' the provided acquisition numbers (argument `n`). If `dataOrigin` or #' `dataStorage` is also provided, `object` is subsetted to the spectra with @@ -469,6 +507,8 @@ #' vector (of length equal to the number of spectra) with the MS #' level for each spectrum (or `NA_integer_` if not available). #' +#' - `msLevel<-`: replaces the spectra's MS level. +#' #' - `mz()`: gets the mass-to-charge ratios (m/z) from the #' spectra. Returns a [NumericList()] or length equal to the number of #' spectra, each element a `numeric` vector with the m/z values of @@ -697,7 +737,7 @@ #' #' The parameters are: #' - `object`: an instance of the `MsBackendMzR` class. -#' - `x`: the [Spectra-class] object to be exported. +#' - `x`: the [Spectra] object to be exported. #' - `file`: `character` with the (full) output file name(s). Should be #' of length 1 or equal `length(x)`. If a single file is specified, all #' spectra are exported to that file. Alternatively it is possible to specify @@ -710,7 +750,7 @@ #' backend and if `dataOrigin(x)` contains the original MS data file names. #' - `BPPARAM`: parallel processing settings. #' -#' See examples in [Spectra-class] or the vignette for more details and +#' See examples in [Spectra] or the vignette for more details and #' examples. 
#' #' The `MsBackendMzR` ignores parameter `columns` of the `peaksData()` @@ -769,7 +809,7 @@ #' #' @return See documentation of respective function. #' -#' @author Johannes Rainer, Sebastian Gibb, Laurent Gatto +#' @author Johannes Rainer, Sebastian Gibb, Laurent Gatto, Philippine Louail #' #' @md #' @@ -897,6 +937,8 @@ setValidity("MsBackend", function(object) { #' @exportMethod backendBpparam #' #' @rdname MsBackend +#' +#' @export setMethod("backendBpparam", signature = "MsBackend", function(object, BPPARAM = bpparam()) { BPPARAM @@ -907,6 +949,8 @@ setMethod("backendBpparam", signature = "MsBackend", #' @importMethodsFrom ProtGenerics backendInitialize #' #' @rdname MsBackend +#' +#' @export setMethod("backendInitialize", signature = "MsBackend", function(object, ...) { validObject(object) object @@ -922,6 +966,8 @@ setMethod("backendMerge", "list", function(object, ...) { #' @exportMethod backendMerge #' #' @rdname MsBackend +#' +#' @export setMethod("backendMerge", "MsBackend", function(object, ...) { stop("Not implemented for ", class(object), ".") }) @@ -931,11 +977,21 @@ setMethod("backendMerge", "MsBackend", function(object, ...) { #' @exportMethod backendParallelFactor #' #' @rdname MsBackend +#' +#' @export setMethod("backendParallelFactor", "MsBackend", function(object, ...) { factor() }) +#' @export +setMethod("backendRequiredSpectraVariables", "MsBackend", + function(object, ...) { + character() + }) + #' @rdname MsBackend +#' +#' @export setMethod("export", "MsBackend", function(object, ...) { stop(class(object), " does not support export of data; please provide a ", "backend that supports data export with parameter 'backend'.") @@ -946,6 +1002,8 @@ setMethod("export", "MsBackend", function(object, ...) 
{ #' @importMethodsFrom ProtGenerics acquisitionNum #' #' @rdname MsBackend +#' +#' @export setMethod("acquisitionNum", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -955,6 +1013,8 @@ setMethod("acquisitionNum", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics peaksData #' #' @rdname MsBackend +#' +#' @export setMethod("peaksData", "MsBackend", function(object, columns = c("mz", "intensity")) { stop("Not implemented for ", class(object), ".") @@ -965,6 +1025,8 @@ setMethod("peaksData", "MsBackend", function(object, #' @importMethodsFrom ProtGenerics peaksVariables #' #' @rdname MsBackend +#' +#' @export setMethod("peaksVariables", "MsBackend", function(object) { c("mz", "intensity") }) @@ -996,6 +1058,8 @@ setMethod("cbind2", signature = c("MsBackend", "dataframeOrDataFrameOrmatrix"), #' @importMethodsFrom ProtGenerics centroided #' #' @rdname MsBackend +#' +#' @export setMethod("centroided", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1005,6 +1069,8 @@ setMethod("centroided", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics centroided<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("centroided", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1014,6 +1080,8 @@ setReplaceMethod("centroided", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics collisionEnergy #' #' @rdname MsBackend +#' +#' @export setMethod("collisionEnergy", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1023,6 +1091,8 @@ setMethod("collisionEnergy", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics collisionEnergy<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("collisionEnergy", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1032,6 +1102,8 @@ setReplaceMethod("collisionEnergy", 
"MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics dataOrigin #' #' @rdname MsBackend +#' +#' @export setMethod("dataOrigin", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1041,6 +1113,8 @@ setMethod("dataOrigin", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics dataOrigin<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("dataOrigin", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1050,6 +1124,8 @@ setReplaceMethod("dataOrigin", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics dataStorage #' #' @rdname MsBackend +#' +#' @export setMethod("dataStorage", "MsBackend", function(object) { stop("Method 'dataStorage' is not implemented for ", class(object), ".") }) @@ -1059,6 +1135,8 @@ setMethod("dataStorage", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics dataStorage<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("dataStorage", "MsBackend", function(object, value) { stop("Method 'dataStorage' is not implemented for ", class(object), ".") }) @@ -1066,9 +1144,12 @@ setReplaceMethod("dataStorage", "MsBackend", function(object, value) { #' @exportMethod dropNaSpectraVariables #' #' @rdname MsBackend +#' +#' @export setMethod("dropNaSpectraVariables", "MsBackend", function(object) { svs <- spectraVariables(object) - svs <- svs[!(svs %in% c("mz", "intensity"))] + req_cols <- c(backendRequiredSpectraVariables(object), c("mz", "intensity")) + svs <- svs[!(svs %in% req_cols)] spd <- spectraData(object, columns = svs) keep <- !vapply1l(spd, function(z) { allna <- all(is.na(z)) @@ -1076,7 +1157,25 @@ setMethod("dropNaSpectraVariables", "MsBackend", function(object) { FALSE else allna }) - selectSpectraVariables(object, c(svs[keep], "mz", "intensity")) + selectSpectraVariables(object, c(svs[keep], req_cols)) +}) + +#' @rdname MsBackend +#' +#' @importFrom methods existsMethod +#' +#' 
@export +setMethod("extractByIndex", c("MsBackend", "ANY"), function(object, i) { + if (existsMethod("[", class(object)[1L])) + object[i = i] + else stop("'extractByIndex' not implemented for ", class(object)[1L], ".") +}) + +#' @rdname MsBackend +#' +#' @export +setMethod("extractByIndex", c("MsBackend", "missing"), function(object, i) { + object }) #' @exportMethod filterAcquisitionNum @@ -1084,6 +1183,8 @@ setMethod("dropNaSpectraVariables", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics filterAcquisitionNum #' #' @rdname MsBackend +#' +#' @export setMethod("filterAcquisitionNum", "MsBackend", function(object, n, file, ...) { stop("Not implemented for ", class(object), ".") }) @@ -1093,6 +1194,8 @@ setMethod("filterAcquisitionNum", "MsBackend", function(object, n, file, ...) { #' @importMethodsFrom ProtGenerics filterDataOrigin #' #' @rdname MsBackend +#' +#' @export setMethod("filterDataOrigin", "MsBackend", function(object, dataOrigin = character()) { if (length(dataOrigin)) { @@ -1108,6 +1211,8 @@ setMethod("filterDataOrigin", "MsBackend", #' @importMethodsFrom ProtGenerics filterDataStorage #' #' @rdname MsBackend +#' +#' @export setMethod("filterDataStorage", "MsBackend", function(object, dataStorage = character()) { if (length(dataStorage)) { @@ -1123,6 +1228,8 @@ setMethod("filterDataStorage", "MsBackend", #' @importMethodsFrom ProtGenerics filterEmptySpectra #' #' @rdname MsBackend +#' +#' @export setMethod("filterEmptySpectra", "MsBackend", function(object, ...) { if (!length(object)) return(object) object[as.logical(lengths(object))] @@ -1133,6 +1240,8 @@ setMethod("filterEmptySpectra", "MsBackend", function(object, ...) { #' @importMethodsFrom ProtGenerics filterIsolationWindow #' #' @rdname MsBackend +#' +#' @export setMethod("filterIsolationWindow", "MsBackend", function(object, mz = numeric(), ...) 
{ if (length(mz)) { @@ -1149,6 +1258,8 @@ setMethod("filterIsolationWindow", "MsBackend", #' @importMethodsFrom ProtGenerics filterMsLevel #' #' @rdname MsBackend +#' +#' @export setMethod("filterMsLevel", "MsBackend", function(object, msLevel = integer()) { if (length(msLevel)) { @@ -1161,6 +1272,8 @@ setMethod("filterMsLevel", "MsBackend", #' @importMethodsFrom ProtGenerics filterPolarity #' #' @rdname MsBackend +#' +#' @export setMethod("filterPolarity", "MsBackend", function(object, polarity = integer()) { if (length(polarity)) @@ -1173,6 +1286,8 @@ setMethod("filterPolarity", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorMzRange #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorMzRange", "MsBackend", function(object, mz = numeric()) { if (length(mz)) { @@ -1185,6 +1300,8 @@ setMethod("filterPrecursorMzRange", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorMz #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorMz", "MsBackend", function(object, mz = numeric()) { filterPrecursorMzRange(object, mz) @@ -1195,6 +1312,8 @@ setMethod("filterPrecursorMz", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorMzValues #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorMzValues", "MsBackend", function(object, mz = numeric(), ppm = 20, tolerance = 0) { if (length(mz)) { @@ -1208,6 +1327,8 @@ setMethod("filterPrecursorMzValues", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorCharge #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorCharge", "MsBackend", function(object, z = integer()) { if (length(z)) { @@ -1221,6 +1342,8 @@ setMethod("filterPrecursorCharge", "MsBackend", #' @importMethodsFrom ProtGenerics filterPrecursorScan #' #' @rdname MsBackend +#' +#' @export setMethod("filterPrecursorScan", "MsBackend", function(object, acquisitionNum = integer(), f = dataOrigin(object)) { if (length(acquisitionNum) && length(f)) { @@ -1241,6 +1364,8 @@ 
setMethod("filterPrecursorScan", "MsBackend", #' @importFrom MsCoreUtils between #' #' @rdname MsBackend +#' +#' @export setMethod("filterRanges", "MsBackend", function(object, spectraVariables = character(), ranges = numeric(), match = c("all", "any")){ @@ -1282,6 +1407,8 @@ setMethod("filterRanges", "MsBackend", #' @importMethodsFrom ProtGenerics filterRt #' #' @rdname MsBackend +#' +#' @export setMethod("filterRt", "MsBackend", function(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) { if (length(rt)) { @@ -1299,6 +1426,8 @@ setMethod("filterRt", "MsBackend", #' @importFrom MsCoreUtils ppm #' #' @rdname MsBackend +#' +#' @export setMethod("filterValues", "MsBackend", function(object, spectraVariables = character(), values = numeric(), ppm = 0, tolerance = 0, match = c("all", "any")){ @@ -1344,6 +1473,8 @@ setMethod("filterValues", "MsBackend", #' @importMethodsFrom ProtGenerics intensity #' #' @rdname MsBackend +#' +#' @export setMethod("intensity", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1353,6 +1484,8 @@ setMethod("intensity", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics intensity<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("intensity", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1364,6 +1497,8 @@ setReplaceMethod("intensity", "MsBackend", function(object, value) { #' @importFrom MsCoreUtils vapply1d #' #' @rdname MsBackend +#' +#' @export setMethod("ionCount", "MsBackend", function(object) { vapply1d(intensity(object), sum, na.rm = TRUE) }) @@ -1374,6 +1509,8 @@ setMethod("ionCount", "MsBackend", function(object) { #' @importFrom MsCoreUtils vapply1l #' #' @rdname MsBackend +#' +#' @export setMethod("isCentroided", "MsBackend", function(object, ...) { vapply1l(peaksData(object), .peaks_is_centroided) }) @@ -1383,6 +1520,8 @@ setMethod("isCentroided", "MsBackend", function(object, ...) 
{ #' @rdname MsBackend #' #' @importMethodsFrom S4Vectors isEmpty +#' +#' @export setMethod("isEmpty", "MsBackend", function(x) { stop("Not implemented for ", class(x), ".") }) @@ -1392,6 +1531,8 @@ setMethod("isEmpty", "MsBackend", function(x) { #' @importMethodsFrom ProtGenerics isolationWindowLowerMz #' #' @rdname MsBackend +#' +#' @export setMethod("isolationWindowLowerMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1401,6 +1542,8 @@ setMethod("isolationWindowLowerMz", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics isolationWindowLowerMz<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("isolationWindowLowerMz", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") @@ -1411,6 +1554,8 @@ setReplaceMethod("isolationWindowLowerMz", "MsBackend", function(object, #' @importMethodsFrom ProtGenerics isolationWindowTargetMz #' #' @rdname MsBackend +#' +#' @export setMethod("isolationWindowTargetMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1420,6 +1565,8 @@ setMethod("isolationWindowTargetMz", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics isolationWindowTargetMz<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("isolationWindowTargetMz", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") @@ -1430,6 +1577,8 @@ setReplaceMethod("isolationWindowTargetMz", "MsBackend", function(object, #' @importMethodsFrom ProtGenerics isolationWindowUpperMz #' #' @rdname MsBackend +#' +#' @export setMethod("isolationWindowUpperMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1439,6 +1588,8 @@ setMethod("isolationWindowUpperMz", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics isolationWindowUpperMz<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("isolationWindowUpperMz", "MsBackend", 
function(object, value) { stop("Not implemented for ", class(object), ".") @@ -1449,6 +1600,8 @@ setReplaceMethod("isolationWindowUpperMz", "MsBackend", function(object, #' @importMethodsFrom ProtGenerics isReadOnly #' #' @rdname MsBackend +#' +#' @export setMethod("isReadOnly", "MsBackend", function(object) { object@readonly }) @@ -1456,6 +1609,8 @@ setMethod("isReadOnly", "MsBackend", function(object) { #' @exportMethod length #' #' @rdname MsBackend +#' +#' @export setMethod("length", "MsBackend", function(x) { stop("Not implemented for ", class(x), ".") }) @@ -1465,15 +1620,28 @@ setMethod("length", "MsBackend", function(x) { #' @importMethodsFrom ProtGenerics msLevel #' #' @rdname MsBackend +#' +#' @export setMethod("msLevel", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) +#' @importMethodsFrom ProtGenerics msLevel<- +#' +#' @rdname MsBackend +#' +#' @export +setReplaceMethod("msLevel", "MsBackend", function(object, value) { + stop("Not implemented for ", class(object), ".") +}) + #' @exportMethod mz #' #' @importMethodsFrom ProtGenerics mz #' #' @rdname MsBackend +#' +#' @export setMethod("mz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1483,6 +1651,8 @@ setMethod("mz", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics mz<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("mz", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1497,6 +1667,8 @@ setMethod("lengths", "MsBackend", function(x, use.names = FALSE) { #' @importMethodsFrom ProtGenerics polarity #' #' @rdname MsBackend +#' +#' @export setMethod("polarity", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1506,6 +1678,8 @@ setMethod("polarity", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics polarity<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("polarity", "MsBackend", 
function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1515,6 +1689,8 @@ setReplaceMethod("polarity", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics precScanNum #' #' @rdname MsBackend +#' +#' @export setMethod("precScanNum", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1524,6 +1700,8 @@ setMethod("precScanNum", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics precursorCharge #' #' @rdname MsBackend +#' +#' @export setMethod("precursorCharge", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1533,6 +1711,8 @@ setMethod("precursorCharge", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics precursorIntensity #' #' @rdname MsBackend +#' +#' @export setMethod("precursorIntensity", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1542,14 +1722,31 @@ setMethod("precursorIntensity", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics precursorMz #' #' @rdname MsBackend +#' +#' @export setMethod("precursorMz", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) +#' @exportMethod precursorMz<- +#' +#' @importMethodsFrom ProtGenerics precursorMz<- +#' +#' @rdname MsBackend +#' +#' @export +setReplaceMethod("precursorMz", "MsBackend", function(object, ..., value) { + object$precursorMz <- value + object +}) + #' @exportMethod peaksData<- #' #' @importMethodsFrom ProtGenerics peaksData<- +#' #' @rdname MsBackend +#' +#' @export setReplaceMethod("peaksData", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1557,6 +1754,8 @@ setReplaceMethod("peaksData", "MsBackend", function(object, value) { #' @exportMethod reset #' #' @rdname MsBackend +#' +#' @export setMethod("reset", "MsBackend", function(object) { object }) @@ -1566,6 +1765,8 @@ setMethod("reset", "MsBackend", 
function(object) { #' @importMethodsFrom ProtGenerics rtime #' #' @rdname MsBackend +#' +#' @export setMethod("rtime", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1575,6 +1776,8 @@ setMethod("rtime", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics rtime<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("rtime", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1584,6 +1787,8 @@ setReplaceMethod("rtime", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics scanIndex #' #' @rdname MsBackend +#' +#' @export setMethod("scanIndex", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1591,6 +1796,8 @@ setMethod("scanIndex", "MsBackend", function(object) { #' @exportMethod selectSpectraVariables #' #' @rdname MsBackend +#' +#' @export setMethod( "selectSpectraVariables", "MsBackend", function(object, spectraVariables = spectraVariables(object)) { @@ -1602,6 +1809,8 @@ setMethod( #' @importMethodsFrom ProtGenerics smoothed #' #' @rdname MsBackend +#' +#' @export setMethod("smoothed", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1613,6 +1822,8 @@ setMethod("smoothed", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics smoothed<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("smoothed", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1620,6 +1831,8 @@ setReplaceMethod("smoothed", "MsBackend", function(object, value) { #' @exportMethod spectraData #' #' @rdname MsBackend +#' +#' @export setMethod( "spectraData", "MsBackend", function(object, columns = spectraVariables(object)) { @@ -1629,6 +1842,8 @@ setMethod( #' @exportMethod spectraData<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("spectraData", "MsBackend", function(object, value) { stop("Not implemented for ", 
class(object), ".") }) @@ -1638,6 +1853,8 @@ setReplaceMethod("spectraData", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics spectraNames #' #' @rdname MsBackend +#' +#' @export setMethod("spectraNames", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1647,6 +1864,8 @@ setMethod("spectraNames", "MsBackend", function(object) { #' @importMethodsFrom ProtGenerics spectraNames<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("spectraNames", "MsBackend", function(object, value) { stop("Not implemented for ", class(object), ".") }) @@ -1656,6 +1875,8 @@ setReplaceMethod("spectraNames", "MsBackend", function(object, value) { #' @importMethodsFrom ProtGenerics spectraVariables #' #' @rdname MsBackend +#' +#' @export setMethod("spectraVariables", "MsBackend", function(object) { stop("Not implemented for ", class(object), ".") }) @@ -1665,6 +1886,8 @@ setMethod("spectraVariables", "MsBackend", function(object) { #' @importMethodsFrom S4Vectors split #' #' @rdname MsBackend +#' +#' @export setMethod("split", "MsBackend", function(x, f, drop = FALSE, ...) { split.default(x, f, drop = drop, ...) }) @@ -1674,6 +1897,8 @@ setMethod("split", "MsBackend", function(x, f, drop = FALSE, ...) { #' @exportMethod supportsSetBackend #' #' @rdname MsBackend +#' +#' @export setMethod("supportsSetBackend", "MsBackend", function(object, ...) { !isReadOnly(object) }) @@ -1683,6 +1908,8 @@ setMethod("supportsSetBackend", "MsBackend", function(object, ...) 
{ #' @importMethodsFrom ProtGenerics tic #' #' @rdname MsBackend +#' +#' @export setMethod("tic", "MsBackend", function(object, initial = TRUE) { stop("Not implemented for ", class(object), ".") }) @@ -1690,13 +1917,17 @@ setMethod("tic", "MsBackend", function(object, initial = TRUE) { #' @exportMethod [ #' #' @rdname MsBackend +#' +#' @export setMethod("[", "MsBackend", function(x, i, j, ..., drop = FALSE) { - stop("Not implemented for ", class(x), ".") + extractByIndex(x, i2index(i, length = length(x))) }) #' @exportMethod $ #' #' @rdname MsBackend +#' +#' @export setMethod("$", "MsBackend", function(x, name) { stop("Not implemented for ", class(x), ".") }) @@ -1704,6 +1935,8 @@ setMethod("$", "MsBackend", function(x, name) { #' @exportMethod $<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("$", "MsBackend", function(x, name, value) { stop("Not implemented for ", class(x), ".") }) @@ -1711,6 +1944,8 @@ setReplaceMethod("$", "MsBackend", function(x, name, value) { #' @exportMethod [[ #' #' @rdname MsBackend +#' +#' @export setMethod("[[", "MsBackend", function(x, i, j, ...) { if (!is.character(i)) stop("'i' is supposed to be a character defining the spectra ", @@ -1723,6 +1958,8 @@ setMethod("[[", "MsBackend", function(x, i, j, ...) { #' @exportMethod [[<- #' #' @rdname MsBackend +#' +#' @export setReplaceMethod("[[", "MsBackend", function(x, i, j, ..., value) { if (!is.character(i)) stop("'i' is supposed to be a character defining the spectra ", @@ -1737,6 +1974,29 @@ setReplaceMethod("[[", "MsBackend", function(x, i, j, ..., value) { #' @importMethodsFrom ProtGenerics uniqueMsLevels #' #' @rdname MsBackend +#' +#' @export setMethod("uniqueMsLevels", "MsBackend", function(object, ...) 
{ unique(msLevel(object)) }) + +#' @exportMethod dataStorageBasePath +#' +#' @rdname MsBackend +#' +#' @export +setMethod("dataStorageBasePath", "MsBackend", function(object) { + NA_character_ +}) + +#' @exportMethod dataStorageBasePath<- +#' +#' @rdname MsBackend +#' +#' @export +setReplaceMethod( + "dataStorageBasePath", "MsBackend", function(object, value) { + warning(class(object)[1L], " does not support changing", + " 'dataStorageBasePath'.") + object + }) diff --git a/R/MsBackendCached.R b/R/MsBackendCached.R index 5628037d..e2f4d4d2 100644 --- a/R/MsBackendCached.R +++ b/R/MsBackendCached.R @@ -294,6 +294,15 @@ setMethod("dataStorage", "MsBackendCached", function(object) { rep("", length(object)) }) +#' @rdname MsBackendCached +setMethod("extractByIndex", c("MsBackendCached", "ANY"), + function(object, i) { + slot(object, "localData", check = FALSE) <- + object@localData[i, , drop = FALSE] + object@nspectra <- nrow(object@localData) + object +}) + #' @rdname MsBackendCached setMethod("length", "MsBackendCached", function(x) { x@nspectra @@ -428,7 +437,7 @@ setMethod("show", "MsBackendCached", function(object) { cat(class(object), "with", n, "spectra\n") if (n) { idx <- unique(c(1L:min(6L, n), max(1L, n-5L):n)) - spd <- spectraData(object[idx, ], + spd <- spectraData(extractByIndex(object, idx), c("msLevel", "precursorMz", "polarity")) if (!length(rownames(spd))) rownames(spd) <- idx @@ -455,7 +464,6 @@ setMethod("centroided", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("centroided", "MsBackendCached", function(object, value) { object$centroided <- value - validObject(object) object }) @@ -467,7 +475,6 @@ setMethod("collisionEnergy", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("collisionEnergy", "MsBackendCached", function(object, value) { object$collisionEnergy <- value - validObject(object) object }) @@ -479,7 +486,6 @@ setMethod("dataOrigin", "MsBackendCached", function(object) { 
#' @rdname MsBackendCached setReplaceMethod("dataOrigin", "MsBackendCached", function(object, value) { object$dataOrigin <- value - validObject(object) object }) @@ -516,7 +522,6 @@ setMethod("isolationWindowLowerMz", "MsBackendCached", function(object) { setReplaceMethod("isolationWindowLowerMz", "MsBackendCached", function(object, value) { object$isolationWindowLowerMz <- value - validObject(object) object }) @@ -529,7 +534,6 @@ setMethod("isolationWindowTargetMz", "MsBackendCached", function(object) { setReplaceMethod("isolationWindowTargetMz", "MsBackendCached", function(object, value) { object$isolationWindowTargetMz <- value - validObject(object) object }) @@ -542,7 +546,6 @@ setMethod("isolationWindowUpperMz", "MsBackendCached", function(object) { setReplaceMethod("isolationWindowUpperMz", "MsBackendCached", function(object, value) { object$isolationWindowUpperMz <- value - validObject(object) object }) @@ -565,7 +568,6 @@ setMethod("polarity", "MsBackendCached", function(object) { setReplaceMethod("polarity", "MsBackendCached", function(object, value) { if (is.numeric(value)) value <- as.integer(value) object$polarity <- value - validObject(object) object }) @@ -592,7 +594,6 @@ setMethod("rtime", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("rtime", "MsBackendCached", function(object, value) { object$rtime <- value - validObject(object) object }) @@ -609,6 +610,5 @@ setMethod("smoothed", "MsBackendCached", function(object) { #' @rdname MsBackendCached setReplaceMethod("smoothed", "MsBackendCached", function(object, value) { object$smoothed <- value - validObject(object) object }) diff --git a/R/MsBackendDataFrame.R b/R/MsBackendDataFrame.R index a97a36fc..b83b0b72 100644 --- a/R/MsBackendDataFrame.R +++ b/R/MsBackendDataFrame.R @@ -22,7 +22,8 @@ setClass("MsBackendDataFrame", version = "0.2")) setValidity("MsBackendDataFrame", function(object) { - msg <- .valid_spectra_data_required_columns(object@spectraData) + msg <- 
.valid_spectra_data_required_columns( + object@spectraData, backendRequiredSpectraVariables(object)) if (length(msg)) return(msg) msg <- c( @@ -92,6 +93,12 @@ setMethod("backendMerge", "MsBackendDataFrame", function(object, ...) { res }) +#' @rdname hidden_aliases +setMethod("backendRequiredSpectraVariables", "MsBackendDataFrame", + function(object, ...) { + "dataStorage" + }) + ## Data accessors #' @rdname hidden_aliases @@ -181,6 +188,14 @@ setReplaceMethod("dataStorage", "MsBackendDataFrame", function(object, value) { object }) +#' @rdname hidden_aliases +setMethod("extractByIndex", c("MsBackendDataFrame", "ANY"), + function(object, i) { + slot(object, "spectraData", check = FALSE) <- + extractROWS(object@spectraData, i) + object + }) + #' @rdname hidden_aliases setMethod("intensity", "MsBackendDataFrame", function(object) { if (any(colnames(object@spectraData) == "intensity")) @@ -405,16 +420,18 @@ setMethod("selectSpectraVariables", "MsBackendDataFrame", paste(spectraVariables[!(spectraVariables %in% spectraVariables(object))], collapse = ", "), " not available") + bv <- backendRequiredSpectraVariables(object) + if (!all(bv %in% spectraVariables)) + stop("Spectra variables ", + paste(bv[!bv %in% spectraVariables], collapse = ","), + " are required by the backend") keep <- spectraVariables[spectraVariables %in% - colnames(object@spectraData)] + colnames(object@spectraData)] if (length(keep)) object@spectraData <- object@spectraData[, keep, drop = FALSE] - msg <- .valid_spectra_data_required_columns(object@spectraData) - if (length(msg)) - stop(msg) object@peaksVariables <- intersect(object@peaksVariables, - colnames(object@spectraData)) + spectraVariables) validObject(object) object }) @@ -544,6 +561,8 @@ setReplaceMethod("$", "MsBackendDataFrame", function(x, name, value) { #' @importFrom MsCoreUtils i2index #' #' @rdname hidden_aliases +#' +#' @export setMethod("[", "MsBackendDataFrame", function(x, i, j, ..., drop = FALSE) { .subset_backend_data_frame(x, i) 
}) @@ -583,5 +602,5 @@ setMethod("filterAcquisitionNum", "MsBackendDataFrame", "acquisition number(s) for sub-setting") sel_file <- .sel_file(object, dataStorage, dataOrigin) sel_acq <- acquisitionNum(object) %in% n & sel_file - object[sel_acq | !sel_file] + extractByIndex(object, which(sel_acq | !sel_file)) }) diff --git a/R/MsBackendHdf5Peaks.R b/R/MsBackendHdf5Peaks.R index e5482803..27f14753 100644 --- a/R/MsBackendHdf5Peaks.R +++ b/R/MsBackendHdf5Peaks.R @@ -26,8 +26,8 @@ setClass("MsBackendHdf5Peaks", prototype = prototype(version = "0.1", readonly = FALSE)) setValidity("MsBackendHdf5Peaks", function(object) { - msg <- .valid_spectra_data_required_columns(object@spectraData, - c("dataStorage", "scanIndex")) + msg <- .valid_spectra_data_required_columns( + object@spectraData, backendRequiredSpectraVariables(object)) fls <- unique(object@spectraData$dataStorage) msg <- c(msg, .valid_ms_backend_mod_count(object@modCount, fls)) msg <- c(msg, .valid_ms_backend_files_exist(fls)) @@ -36,6 +36,12 @@ setValidity("MsBackendHdf5Peaks", function(object) { else msg }) +#' @rdname hidden_aliases +setMethod("backendRequiredSpectraVariables", "MsBackendHdf5Peaks", + function(object, ...) { + c("dataStorage", "scanIndex") + }) + #' @rdname hidden_aliases #' #' @importFrom fs path_sanitize @@ -291,6 +297,20 @@ setMethod("[", "MsBackendHdf5Peaks", function(x, i, j, ..., drop = FALSE) { x }) +#' @rdname hidden_aliases +#' +#' @aliases [,MsBackendHdf5Peaks-method +setMethod("extractByIndex", c("MsBackendHdf5Peaks", "ANY"), + function(object, i) { + fls <- unique(object@spectraData$dataStorage) + slot(object, "spectraData", check = FALSE) <- + extractROWS(object@spectraData, i) + slot(object, "modCount", check = FALSE) <- + object@modCount[match( + unique(object@spectraData$dataStorage), fls)] + object +}) + #' @rdname hidden_aliases setMethod("backendMerge", "MsBackendHdf5Peaks", function(object, ...) 
{ object <- unname(c(object, ...)) diff --git a/R/MsBackendMemory.R b/R/MsBackendMemory.R index 3f6770c2..52b6a75a 100644 --- a/R/MsBackendMemory.R +++ b/R/MsBackendMemory.R @@ -122,6 +122,12 @@ setMethod("backendMerge", "MsBackendMemory", function(object, ...) { res }) +#' @rdname hidden_aliases +setMethod("backendRequiredSpectraVariables", "MsBackendMemory", + function(object, ...) { + "dataStorage" + }) + ## Data accessors #' @rdname hidden_aliases @@ -192,6 +198,18 @@ setReplaceMethod("dataStorage", "MsBackendMemory", function(object, value) { object }) +#' @rdname hidden_aliases +setMethod("extractByIndex", c("MsBackendMemory", "ANY"), function(object, i) { + slot(object, "spectraData", check = FALSE) <- + object@spectraData[i, , drop = FALSE] + if (length(object@peaksData)) + slot(object, "peaksData", check = FALSE) <- object@peaksData[i] + if (length(object@peaksDataFrame)) + slot(object, "peaksDataFrame", check = FALSE) <- + object@peaksDataFrame[i] + object +}) + #' @rdname hidden_aliases setMethod("intensity", "MsBackendMemory", function(object) { if (length(object)) { @@ -502,7 +520,8 @@ setMethod("selectSpectraVariables", "MsBackendMemory", z[, keep, drop = FALSE]) } } - msg <- .valid_spectra_data_required_columns(object@spectraData) + msg <- .valid_spectra_data_required_columns( + object@spectraData, backendRequiredSpectraVariables(object)) if (length(msg)) stop(msg) validObject(object) diff --git a/R/MsBackendMzR.R b/R/MsBackendMzR.R index 74b00308..a7930e0d 100644 --- a/R/MsBackendMzR.R +++ b/R/MsBackendMzR.R @@ -24,14 +24,20 @@ setClass("MsBackendMzR", prototype = prototype(version = "0.1", readonly = TRUE)) setValidity("MsBackendMzR", function(object) { - msg <- .valid_spectra_data_required_columns(object@spectraData, - c("dataStorage", "scanIndex")) + msg <- .valid_spectra_data_required_columns( + object@spectraData, backendRequiredSpectraVariables(object)) msg <- c(msg, .valid_ms_backend_files_exist( unique(object@spectraData$dataStorage))) if 
(length(msg)) msg else TRUE }) +#' @rdname hidden_aliases +setMethod("backendRequiredSpectraVariables", "MsBackendMzR", + function(object, ...) { + c("dataStorage", "scanIndex") + }) + #' @rdname hidden_aliases #' #' @importFrom methods callNextMethod @@ -43,12 +49,14 @@ setValidity("MsBackendMzR", function(object) { #' @importFrom BiocParallel bpparam setMethod("backendInitialize", "MsBackendMzR", function(object, files, ..., BPPARAM = bpparam()) { - if (missing(files) || !length(files)) + if (missing(files)) stop("Parameter 'files' is mandatory for 'MsBackendMzR'") if (!is.character(files)) stop("Parameter 'files' is expected to be a character vector", " with the files names from where data should be", " imported") + if (!length(files)) + return(object) files <- normalizePath(files, mustWork = FALSE) msg <- .valid_ms_backend_files_exist(files) if (length(msg)) @@ -214,3 +222,21 @@ setMethod("export", "MsBackendMzR", function(object, x, file = tempfile(), setMethod("backendParallelFactor", "MsBackendMzR", function(object) { factor(dataStorage(object), levels = unique(dataStorage(object))) }) + +#' @importFrom MsCoreUtils common_path +setMethod("dataStorageBasePath", "MsBackendMzR", function(object) { + common_path(dataStorage(object)) +}) + +setReplaceMethod( + "dataStorageBasePath", "MsBackendMzR", function(object, value) { + ds <- dataStorage(object) + ds <- gsub("\\", "/", ds, fixed = TRUE) + value <- gsub("\\", "/", value, fixed = TRUE) + cp <- common_path(ds) + ds <- sub(cp, value, ds, fixed = TRUE) + if (!all(file.exists(unique(ds)))) + stop("Provided path does not contain all data files.") + dataStorage(object) <- normalizePath(ds) + object + }) diff --git a/R/Spectra-estimatePrecursorMz.R b/R/Spectra-estimatePrecursorMz.R index 72743d57..ad6ff630 100644 --- a/R/Spectra-estimatePrecursorMz.R +++ b/R/Spectra-estimatePrecursorMz.R @@ -55,6 +55,10 @@ #' #' @author Mar Garcia-Aloy, Johannes Rainer #' +#' @seealso +#' +#' [addProcessing()] for other data 
analysis and manipulation functions. +#' #' @export #' #' @examples diff --git a/R/Spectra-functions.R b/R/Spectra-functions.R index 12a82aea..93d9f2db 100644 --- a/R/Spectra-functions.R +++ b/R/Spectra-functions.R @@ -63,7 +63,13 @@ NULL #' @description #' #' This function applies the processing queue and an arbitrary function to -#' the peaks matrix of each spectrum of the `Spectra` object `object`. +#' the peaks matrix of each spectrum of the `Spectra` object `object`. It has +#' build-in parallel and/or chunk-wise processing enabled through parameter +#' `f`, that allows to define how the `Spectra` (or rather its backend) needs +#' to be splitted. The default `f = .parallel_processing_factor(object)` splits +#' the backend by chunk (if a finite chunk size is defined for the `Spectra`) +#' or by it's optimal parallel processing factor. See the description of +#' the `.parallel_processing_factor()` function below for information. #' #' @param object `Spectra` object. #' @@ -78,7 +84,8 @@ NULL #' #' @param f `factor` or `vector` that can be coerced to one defining how the #' data should be split for parallel processing. Set to `NULL` or -#' `factor()` to disable splitting and parallel processing. +#' `factor()` to disable splitting and parallel processing. See function +#' description above for details and information. #' #' @param columns `character` defining the columns that should be returned. #' This will be passed to the backend's `peaksData` function. @@ -207,7 +214,7 @@ NULL #' @export applyProcessing #' -#' @rdname Spectra +#' @rdname addProcessing applyProcessing <- function(object, f = processingChunkFactor(object), BPPARAM = bpparam(), ...) 
{ queue <- object@processingQueue @@ -236,8 +243,9 @@ applyProcessing <- function(object, f = processingChunkFactor(object), }, queue = queue, pv = pv, svars = svars, BPPARAM = BPPARAM) bknds <- backendMerge(bknds) if (is.unsorted(f)) - bknds <- bknds[order(unlist(split(seq_along(bknds), f), - use.names = FALSE))] + bknds <- extractByIndex( + bknds, order(unlist(split(seq_along(bknds), f), + use.names = FALSE))) object@backend <- bknds } else { if (length(svars)) @@ -538,14 +546,14 @@ applyProcessing <- function(object, f = processingChunkFactor(object), #' @export concatenateSpectra #' -#' @rdname Spectra +#' @rdname combineSpectra concatenateSpectra <- function(x, ...) { .concatenate_spectra(unlist(unname(list(unname(x), ...)))) } #' @export combineSpectra #' -#' @rdname Spectra +#' @rdname combineSpectra combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage, FUN = combinePeaksData, ..., BPPARAM = bpparam()) { if (!is.factor(f)) @@ -570,39 +578,8 @@ combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage, #' @description #' -#' Internal function to check if any (or all) of the provided `mz` values are -#' in the spectras' m/z. -#' -#' @param x `Spectra` object -#' -#' @param mz `numeric` of m/z value(s) to check in each spectrum of `x`. -#' -#' @param tolarance `numeric(1)` with the tolerance. -#' -#' @param ppm `numeric(1)` with the ppm. -#' -#' @param condFun `function` such as `any` or `all`. -#' -#' @param parallel `BiocParallel` parameter object. -#' -#' @return `logical` same length than `x`. -#' -#' @author Johannes Rainer -#' -#' @importFrom MsCoreUtils common -#' -#' @noRd -.has_mz <- function(x, mz = numeric(), tolerance = 0, ppm = 20, condFun = any, - parallel = SerialParam()) { - mzs <- mz(x, BPPARAM = parallel) - vapply(mzs, FUN = function(z) - condFun(common(mz, z, tolerance = tolerance, ppm = ppm)), logical(1)) -} - -#' @description -#' -#' Same as `.has_mz` only that a different `mz` is used for each spectrum in -#' `x`. 
Length of `mz` is thus expected to be equal to length of `x`. +#' Check for presence of an m/z value in each spectrum. Each spectrum gets +#' its own m/z. #' #' @param mz `numeric` **same length as `x`**. #' @@ -622,7 +599,7 @@ combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage, #' @export joinSpectraData #' -#' @rdname Spectra +#' @rdname combineSpectra joinSpectraData <- function(x, y, by.x = "spectrumId", by.y, @@ -685,87 +662,11 @@ joinSpectraData <- function(x, y, #' @export #' -#' @rdname Spectra +#' @rdname addProcessing processingLog <- function(x) { x@processing } -#' @title Estimate Precursor Intensities -#' -#' @description -#' -#' Some MS instrument manufacturers don't provide precursor intensities for -#' fragment spectra. These can however be estimated, given that also MS1 -#' spectra are available. The `estimatePrecursorIntensity()` funtion defines the -#' precursor intensities for MS2 spectra using the intensity of the matching -#' MS1 peak from the closest MS1 spectrum (i.e. the last MS1 spectrum measured -#' before the respective MS2 spectrum). With `method = "interpolation"` it is -#' also possible to calculate the precursor intensity based on an interpolation -#' of intensity values (and retention times) of the matching MS1 peaks from the -#' previous and next MS1 spectrum. See below for an example. -#' -#' @param x `Spectra` with MS1 and MS2 spectra. -#' -#' @param ppm `numeric(1)` with the maximal allowed relative difference of m/z -#' values between the precursor m/z of a spectrum and the m/z of the -#' respective ion on the MS1 scan. -#' -#' @param tolerance `numeric(1)` with the maximal allowed difference of m/z -#' values between the precursor m/z of a spectrum and the m/z of the -#' respective ion on the MS1 scan. 
-#' -#' @param method `character(1)` defining whether the precursor intensity -#' should be estimated on the previous MS1 spectrum (`method = "previous"`, -#' the default) or based on an interpolation on the previous and next -#' MS1 spectrum (`method = "interpolation"`). -#' -#' @param msLevel. `integer(1)` the MS level for which precursor intensities -#' should be estimated. Defaults to `2L`. -#' -#' @param f `factor` (or vector to be coerced to `factor`) defining which -#' spectra belong to the same original data file (sample). -#' Defaults to `f = dataOrigin(x)`. -#' -#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more -#' information. This is passed directly to the [backendInitialize()] method -#' of the [MsBackend-class]. -#' -#' @author Johannes Rainer with feedback and suggestions from Corey Broeckling -#' -#' @export -#' -#' @rdname estimatePrecursorIntensity -#' -#' @examples -#' -#' #' ## Calculating the precursor intensity for MS2 spectra: -#' ## -#' ## Some MS instrument manufacturer don't report the precursor intensities -#' ## for MS2 spectra. The `estimatePrecursorIntensity` function can be used -#' ## in these cases to calculate the precursor intensity on MS1 data. Below -#' ## we load an mzML file from a vendor providing precursor intensities and -#' ## compare the estimated and reported precursor intensities. -#' tmt <- Spectra(msdata::proteomics(full.names = TRUE)[5], -#' backend = MsBackendMzR()) -#' pmi <- estimatePrecursorIntensity(tmt) -#' plot(pmi, precursorIntensity(tmt)) -#' -#' ## We can also replace the original precursor intensity values with the -#' ## newly calculated ones -#' tmt$precursorIntensity <- pmi -estimatePrecursorIntensity <- function(x, ppm = 20, tolerance = 0, - method = c("previous", "interpolation"), - msLevel. 
= 2L, f = dataOrigin(x), - BPPARAM = bpparam()) { - if (is.factor(f)) - f <- as.character(f) - f <- factor(f, levels = unique(f)) - BPPARAM <- backendBpparam(x@backend, BPPARAM) - unlist(bplapply(split(x, f), FUN = .estimate_precursor_intensity, ppm = ppm, - tolerance = tolerance, method = method, msLevel = msLevel., - BPPARAM = BPPARAM), use.names = FALSE) -} - #' estimate precursor intensities based on MS1 peak intensity. This function #' assumes that `x` is a `Spectra` with data **from a single file/sample**. #' @@ -907,9 +808,7 @@ chunkapply <- function(x, FUN, ..., chunkSize = 1000L, chunks = factor()) { as.factor(rep(1:ceiling(len / chunkSize), each = chunkSize)[seq_len(len)]) } -#' @rdname Spectra -#' -#' @author Nir Shahaf, Johannes Rainer +#' @rdname filterMsLevel #' #' @export deisotopeSpectra <- @@ -921,9 +820,7 @@ deisotopeSpectra <- substDefinition = im, charge = charge) } -#' @rdname Spectra -#' -#' @author Nir Shahaf, Johannes Rainer +#' @rdname filterMsLevel #' #' @export reduceSpectra <- function(x, tolerance = 0, ppm = 20) { @@ -932,9 +829,7 @@ reduceSpectra <- function(x, tolerance = 0, ppm = 20) { addProcessing(x, .peaks_reduce, tolerance = tolerance, ppm = ppm) } -#' @rdname Spectra -#' -#' @author Nir Shahaf +#' @rdname filterMsLevel #' #' @export filterPrecursorMaxIntensity <- function(x, tolerance = 0, ppm = 20) { @@ -967,9 +862,7 @@ filterPrecursorMaxIntensity <- function(x, tolerance = 0, ppm = 20) { x } -#' @rdname Spectra -#' -#' @author Nir Shahaf +#' @rdname filterMsLevel #' #' @export filterPrecursorIsotopes <- @@ -1002,9 +895,7 @@ filterPrecursorIsotopes <- x } -#' @rdname Spectra -#' -#' @author Johannes Rainer +#' @rdname addProcessing #' #' @export scalePeaks <- function(x, by = sum, msLevel. = uniqueMsLevels(x)) { @@ -1017,7 +908,7 @@ scalePeaks <- function(x, by = sum, msLevel. 
= uniqueMsLevels(x)) { x } -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @export filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, @@ -1068,6 +959,11 @@ filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, #' per file parallel processing if `f` or `chunkSize` is not defined. #' Other on-disk backends: only if requested by the user. #' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. +#' +#' @param object `Spectra` object. +#' #' @param x `Spectra` object. #' #' @param chunkSize `integer` defining the size of chunks into which `x` should @@ -1143,6 +1039,11 @@ filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, #' For these, the `backendBpparam()` function will always return a #' `SerialParam()` independently on how parallel processing was defined. #' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. +#' +#' @param object `Spectra` object. +#' #' @param x `Spectra`. #' #' @param value `integer(1)` defining the chunk size. @@ -1182,3 +1083,189 @@ processingChunkFactor <- function(x) { stop("'x' is supposed to be a 'Spectra' object") .parallel_processing_factor(x) } + +#' @title Filter peaks based on spectra and peaks variable ranges +#' +#' @description +#' +#' The `filterPeaksRanges()` function allows to filter the peaks matrices of a +#' [Spectra] object using any set of range-based filters on numeric spectra +#' variables or peaks variables. These ranges can be passed to the function +#' using the `...` as ` = ` pairs. `` +#' has to be an available spectra or peaks variable. `` can be a +#' `numeric` of length 2 defining the lower and upper boundary, or a `numeric` +#' two-column matrix (multi-row matrices are also supported, see further +#' below). `filterPeaksRanges(s, mz = c(200, 300))` would for example reduce +#' the peaks matrices of the `Spectra` object `s` to mass peaks with an m/z +#' value between 200 and 300. 
`filterPeaksRanges()` returns the original +#' `Spectra` object with the filter operation added to the processing queue. +#' Thus, the filter gets **only** applied when the peaks data gets extracted +#' with `mz()`, `intensity()` or `peaksData()`. If ranges for both spectra +#' **and** peaks variables are defined, the function evaluates first whether +#' the spectra variable value for a spectrum is within the provided range and, +#' if so, applies also the peaks variable-based filter (otherwise an empty +#' peaks matrix is returned). +#' +#' If more than one spectra variable and/or peaks variable are defined, their +#' filter results are combined with a logical AND: a peak matrix is only +#' returned for a spectrum if all values of spectra variables are within the +#' provided (respective) ranges for spectra variables, and this matrix is +#' further filtered to contain only those peaks which values are within the +#' provided peaks variable ranges. +#' +#' **Filtering with multiple ranges** per spectra and peaks variables is also +#' supported: ranges can also be provided as multi-row numeric (two-column) +#' matrices. In this case, the above described procedure is applied for each +#' row separately and their results are combined with a logical OR, i.e. +#' peaks matrices are returned that match any of the conditions/filters +#' of a row. The number of rows of the provided ranges (being it for spectra +#' or peaks variables) have to match. +#' +#' **Missing value handling**: any comparison which involves a missing value +#' (being it a spectra variable value, a peaks variable value or a value +#' in one of the provided ranges) is treated as a logical `FALSE`. For +#' example, if the retention time of a spectrum is `NA` and the data is +#' filtered using a retention time range, an empty peaks matrix is returned +#' (for `keep = TRUE`, for `keep = FALSE` the full peaks matrix is returned). 
+#' +#' @note +#' +#' In contrast to some other *filter* functions, this function does not provide +#' a `msLevel` parameter that allows to define the MS level of spectra on which +#' the filter should be applied. The filter(s) will always be applied to +#' **all** spectra (irrespectively of their MS level). Through combination of +#' multiple filter ranges it is however possible to apply MS level-dependent +#' filters (see examples below for details). +#' +#' The filter will not be applied immediately to the data but only executed when +#' the mass peak data is accessed (through `peaksData()`, `mz()` or +#' `intensity()`) or by calling `applyProcessing()`. +#' +#' @param object A [Spectra] object. +#' +#' @param ... the ranges for the spectra and/or peaks variables. Has to be +#' provided as ` = ` pairs with `` being the name of a +#' spectra or peaks variable (of numeric data type) and `` being +#' either a `numeric` of length 2 or a `numeric` two column matrix (see +#' function description above for details). +#' +#' @param keep `logical(1)` whether to keep (default) or remove peaks that +#' match the provided range(s). 
+#' +#' @author Johannes Rainer +#' +#' @name filterPeaksRanges +#' +#' @export +#' +#' @examples +#' +#' ## Define a test Spectra +#' d <- data.frame(rtime = c(123.2, 134.2), msLevel = c(1L, 2L)) +#' d$mz <- list(c(100.1, 100.2, 100.3, 200.1, 200.2, 300.3), +#' c(100.3, 100.4, 200.2, 400.3, 400.4)) +#' ## Use the index of the mass peak within the spectrum as index for +#' ## better illustration of filtering results +#' d$intensity <- list(c(1:6), 1:5) +#' s <- Spectra(d) +#' s +#' +#' ## Filter peaks removing all mass peaks with an m/z between 200 and 300 +#' res <- filterPeaksRanges(s, mz = c(200, 300), keep = FALSE) +#' res +#' +#' ## The Spectra object has still the same length and spectra variables +#' length(res) +#' res$rtime +#' +#' ## The filter gets applied when mass peak data gets extracted, using either +#' ## `mz()`, `intensity()` or `peaksData()`. The filtered peaks data does +#' ## not contain any mass peaks with m/z values between 200 and 300: +#' peaksData(res)[[1L]] +#' peaksData(res)[[2L]] +#' +#' ## We next combine spectra and filter variables. We want to keep only mass +#' ## peaks of MS2 spectra that have an m/z between 100 and 110. +#' res <- filterPeaksRanges(s, mz = c(100, 110), msLevel = c(2, 2)) +#' res +#' length(res) +#' +#' ## Only data for peaks are returned for which the spectra's MS level is +#' ## between 2 and 2 and with an m/z between 100 and 110. The peaks data for +#' ## the first spectrum, that has MS level 1, is thus empty: +#' peaksData(res)[[1L]] +#' +#' ## While the peaks matrix for the second spectrum (with MS level 2) contains +#' ## the mass peaks with m/z between 100 and 110. +#' peaksData(res)[[2L]] +#' +#' ## To keep also the peaks data for the first spectrum, we need to define +#' ## an additional set of ranges, which we define using a second row in each +#' ## ranges matrix. We use the same filter as above, i.e. 
keeping only mass +#' ## peaks with an m/z between 100 and 110 for spectra with MS level 2, but +#' ## add an additional row for MS level 1 spectra keeping mass peaks with an +#' ## m/z between 0 and 2000. Filter results of different rows are combined +#' ## using a logical OR, i.e. peaks matrices with mass peaks are returned +#' ## matching either the first, or the second row. +#' res <- filterPeaksRanges(s, mz = rbind(c(100, 110), c(0, 1000)), +#' msLevel = rbind(c(2, 2), c(1, 1))) +#' +#' ## The results for the MS level 2 spectrum are the same as before, but with +#' ## the additional row we keep the full peaks matrix of the MS1 spectrum: +#' peaksData(res)[[1L]] +#' peaksData(res)[[2L]] +#' +#' ## As a last example we define a filter that keeps all mass peaks with an +#' ## m/z either between 100 and 200, or between 300 and 400. +#' res <- filterPeaksRanges(s, mz = rbind(c(100, 200), c(300, 400))) +#' peaksData(res)[[1L]] +#' peaksData(res)[[2L]] +#' +#' ## Such filters could thus be defined to restrict/filter the MS data to +#' ## specific e.g. retention time and m/z ranges. +filterPeaksRanges <- function(object, ..., keep = TRUE) { + if (!inherits(object, "Spectra")) + stop("'object' is expected to be a 'Spectra' object.") + dots <- list(...) + variables <- names(dots) + if (!length(variables)) + return(object) + ## check that: + ## - variables are in spectraVariables + pvars <- peaksVariables(object) + svars <- spectraVariables(object) + if (!all(variables %in% c(svars, pvars))) + stop("Provided filter variable(s): ", + paste0("\"", variables[!variables %in% c(svars, pvars)], "\"", + collapse = ", "), " are not valid spectra variables. 
", + "Use 'spectraVariables(object)' and 'peaksVariables()' to list ", + "available variables.") + ## - range parameters are defined correctly + err <- paste0("Range parameters have to be either a 'numeric' of length ", + "2 or a 'numeric' matrix with two columns.") + dots <- lapply(dots, function(z) { + if (is.null(nrow(z))) { + if (length(z) != 2) + stop(err) + z <- matrix(z, ncol = 2) + } + if (!is.matrix(z) | !is.numeric(z)) stop(err) + z + }) + ## - number for rows of matrices matches. + nr <- unlist(lapply(dots, nrow), use.names = FALSE) + if (any(nr != nr[1L])) + stop("Number of rows of the range matrices have to match.") + ## OK, now proceed to split by svar and pvar and pass to the peaks function. + pvars <- intersect(variables, pvars) + svars <- intersect(variables, svars) + object <- addProcessing(object, .peaks_filter_ranges, ranges = dots, + svars = svars, pvars = pvars, + spectraVariables = c(svars, "msLevel"), keep = keep) + if (keep) keep_or_remove <- "select" + else keep_or_remove <- "remove" + object@processing <- .logging( + object@processing, "Filter: ", keep_or_remove, " peaks based on ", + "user-provided ranges for ", length(variables), " variables") + object +} diff --git a/R/Spectra-neutralLoss.R b/R/Spectra-neutralLoss.R index 53f3b2b5..dc9cf32c 100644 --- a/R/Spectra-neutralLoss.R +++ b/R/Spectra-neutralLoss.R @@ -87,6 +87,10 @@ setClassUnion("functionOrNull", c("function", "NULL")) #' Analysis in METLIN. Journal of the American Society for Mass Spectrometry. #' \doi{10.1021/jasms.1c00343} #' +#' @seealso +#' +#' [addProcessing()] for other data analysis and manipulation functions. +#' #' @examples #' #' ## Create a simple example Spectra object with some MS1, MS2 and MS3 spectra. 
diff --git a/R/Spectra.R index 87479125..0d9217ef 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -1,68 +1,106 @@ #' @include hidden_aliases.R NULL +################################################################################ +## +## Spectra class, creation, data representation, export +## +################################################################################ + #' @title The Spectra class to manage and access MS data #' -#' @aliases Spectra-class [,Spectra-method -#' @aliases uniqueMsLevels uniqueMsLevels,Spectra-method -#' @aliases combinePeaks -#' #' @name Spectra #' +#' @aliases Spectra-class +#' @aliases Spectra +#' @aliases setBackend +#' @aliases export +#' #' @description #' -#' The `Spectra` class encapsules spectral mass spectrometry data and -#' related metadata. +#' The `Spectra` class encapsules spectral mass spectrometry (MS) data and +#' related metadata. The MS data is represented by a *backend* extending the +#' virtual [MsBackend] class which provides the data to the `Spectra` object. +#' The `Spectra` class implements only data accessor, filtering and analysis +#' methods for the MS data and relies on its *backend* to provide the MS data. +#' This allows to change data representations of a `Spectra` object depending +#' on the user's needs and properties of the data. Different backends and +#' their properties are explained in the [MsBackend] documentation. +#' +#' Documentation on other topics and functionality of `Spectra` can be found in: +#' +#' - [spectraData()] for accessing and using MS data through `Spectra` objects. +#' - [filterMsLevel()] to subset and filter `Spectra` objects. +#' - [plotSpectra()] for visualization of `Spectra` objects. +#' - [processingChunkSize()] for information on parallel and chunk-wise data +#' processing. +#' - [combineSpectra()] for merging, aggregating and splitting of `Spectra` +#' objects. 
+#' - [addProcessing()] for data analysis functions. +#' - [compareSpectra()] for spectra similarity calculations. #' -#' It supports multiple data backends, e.g. in-memory ([MsBackendMemory], -#' [MsBackendDataFrame()]), on-disk as mzML ([MsBackendMzR()]) or HDF5 -#' ([MsBackendHdf5Peaks()]). +#' @param backend For `Spectra()`: [MsBackend-class] to be used as backend. See +#' section on creation of `Spectra` objects for details. For `setBackend()`: +#' instance of [MsBackend-class] that supports `setBackend()` (i.e. for +#' which `supportsSetBackend()` returns `TRUE`). Such backends have a +#' parameter `data` in their `backendInitialize()` function that support +#' passing the full spectra data to the initialize method. See section on +#' creation of `Spectra` objects for details. +#' For `export()`: [MsBackend-class] to be used to export the data. #' -#' @details +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. #' -#' The `Spectra` class uses by default a lazy data manipulation strategy, -#' i.e. data manipulations such as performed with `replaceIntensitiesBelow()` -#' are not applied immediately to the data, but applied on-the-fly to the -#' spectrum data once it is retrieved. For some backends that allow to write -#' data back to the data storage (such as the [MsBackendMemory()], -#' [MsBackendDataFrame()] and [MsBackendHdf5Peaks()]) it is possible to apply -#' to queue with the `applyProcessing` function. See the *Data manipulation and -#' analysis *methods* section below for more details. +#' @param f For `setBackend()`: factor defining how to split the data +#' for parallelized copying of the spectra data to the new backend. For +#' some backends changing this parameter can lead to errors. Defaults to +#' [processingChunkFactor()]. 
#' -#' For more information on parallel or chunk-wise processing (especially -#' helpful for very large data sets) see [processingChunkSize()]. +#' @param metadata For `Spectra()`: optional `list` with metadata information. #' -#' To apply arbitrary functions to a `Spectra` use the `spectrapply()` function -#' (or directly [chunkapply()] for chunk-wise processing). See description of -#' the `spectrapply()` function below for details. +#' @param object For `Spectra()`: an object to instantiate the `Spectra` +#' object and initialize it with data. See section on creation of +#' `Spectra` objects for details. For all other methods a `Spectra` object. #' -#' For details on plotting spectra, see [plotSpectra()]. +#' @param processingQueue For `Spectra()`: optional `list` of +#' [ProcessingStep-class] objects. #' -#' Clarifications regarding scan/acquisition numbers and indices: +#' @param source For `Spectra()`: instance of [MsBackend-class] that can be +#' used to import spectrum data from the provided files. See section +#' *Creation of objects* for more details. #' -#' - A `spectrumId` (or `spectrumID`) is a vendor specific field in -#' the mzML file that contains some information about the -#' run/spectrum, e.g.: `controllerType=0 controllerNumber=1 -#' scan=5281 file=2` +#' @param value For `dataStorageBasePath()`: A `character` vector that defines +#' the base directory where the data storage files can be found. #' -#' - `acquisitionNum` is a more a less sanitize spectrum id generated -#' from the `spectrumId` field by `mzR` (see -#' [here](https://github.com/sneumann/mzR/blob/master/src/pwiz/data/msdata/MSData.cpp#L552-L580)). +#' @param ... Additional arguments. 
#' -#' - `scanIndex` is the `mzR` generated sequence number of the -#' spectrum in the raw file (which doesn't have to be the same as -#' the `acquisitionNum`) +#' @section Data stored in a `Spectra` object: +#' +#' The `Spectra` object is a container for MS data that includes mass peak +#' data (*m/z* and related intensity values, also referred to as *peaks data* +#' in the context of `Spectra`) and metadata of individual spectra (so called +#' *spectra variables*). While a core set of spectra variables (the +#' `coreSpectraVariables()`) are guaranteed to be provided by a +#' `Spectra`, it is possible to add arbitrary additional spectra variables to +#' a `Spectra` object. +#' +#' The `Spectra` object is designed to contain MS data of a (large) set of mass +#' spectra. The data is organized *linearly* and can be thought of a list of +#' mass spectra, i.e. each element in the `Spectra` is one spectrum. #' -#' See also [this issue](https://github.com/lgatto/MSnbase/issues/525). #' -#' @section Creation of objects, conversion, changing the backend and export: +#' @section Creation of objects: #' #' `Spectra` classes can be created with the `Spectra()` constructor function #' which supports the following formats: #' #' - parameter `object` is a `data.frame` or `DataFrame` containing the -#' spectrum data. The provided `backend` (by default a -#' [MsBackendMemory-class]) will be initialized with that data. +#' full spectrum data (spectra variables in columns as well as columns +#' with the individual MS peak data, *m/z* and intensity). The provided +#' `backend` (by default a [MsBackendMemory-class]) will be initialized +#' with that data. #' #' - parameter `object` is a [MsBackend-class] (assumed to be already #' initialized). @@ -79,45 +117,80 @@ NULL #' #' With `...` additional arguments can be passed to the backend's #' [backendInitialize()] method. Parameter `backend` allows to specify which -#' [MsBackend-class] should be used for data storage. 
-#' -#' The backend of a `Spectra` object can be changed with the `setBackend()` -#' method that takes an instance of the new backend as second parameter -#' `backend`. A call to `setBackend(sps, backend = MsBackendDataFrame())` +#' [MsBackend-class] should be used for data representation and storage. +#' +#' +#' @section Data representation of a `Spectra`: +#' +#' The MS data which can be accessed through the `Spectra` object is +#' *represented* by its backend, which means that this backend defines how +#' and where the data is stored (e.g. in memory or on disk). The `Spectra` +#' object relies on the backend to provide the MS data whenever it needs it +#' for data processing. +#' Different backends with different properties, such as minimal memory +#' requirement or fast data access, are defined in the *Spectra* package or +#' one of the MsBackend* packages. More information on backends and their +#' properties is provided in the documentation of [MsBackend]. +#' +#' On-disk backends keep only a limited amount of data in memory retrieving +#' most of the data (usually the MS peak data) upon request on-the-fly from +#' their on-disk data representations. Moving the on-disk data storage of such +#' a backend or a serialized object to a different location in the file +#' system will cause data corruption. The `dataStorageBasePath()` and +#' `dataStorageBasePath<-` functions allow in such cases (and if the backend +#' classes support this operation), to get or change the *base* +#' path to the directory of the backend's data storage. In-memory backends +#' such as [MsBackendMemory] or [MsBackendDataFrame] keeping all MS data in +#' memory don't support, nor need, this function, but for [MsBackendMzR] this +#' function can be used to update/adapt the path to the directory containing +#' the original data files. 
Thus, for `Spectra` objects (using this backend) +#' that were moved to another file system or computer, these functions allow to +#' adjust/adapt the base file path. +#' +#' +#' @section Changing data representation of a `Spectra`: +#' +#' The data representation, i.e. the backend of a `Spectra` object can be +#' changed with the `setBackend()` method that takes an instance of the new +#' backend as second parameter `backend`. A call to +#' `setBackend(sps, backend = MsBackendDataFrame())` #' would for example change the backend of `sps` to the *in-memory* #' `MsBackendDataFrame`. Changing to a backend is only supported if that #' backend has a `data` parameter in its `backendInitialize()` method and if #' `supportsSetBackend()` returns `TRUE` for that backend. `setBackend()` will -#' transfer the full spectra data from the originating backend as a -#' `DataFrame` to the new backend. -#' Most *read-only* backends do not support `setBackend()`. It is for example -#' not possible to change the backend to a *read-only* backend (such as -#' the [MsBackendMzR()] backend). +#' transfer the full spectra data from the originating backend as a `DataFrame` +#' to the new backend. +#' +#' Generally, it is not possible to change **to** a read-only backend such as +#' the [MsBackendMzR()] backend. #' #' The definition of the function is: #' `setBackend(object, backend, ..., f = dataStorage(object), #' BPPARAM = bpparam())` and its parameters are: #' -#' - parameter `object`: the `Spectra` object. +#' - `object`: the `Spectra` object. #' -#' - parameter `backend`: an instance of the new backend, e.g. -#' `[MsBackendMemory()]`. +#' - `backend`: an instance of the new backend, e.g. `[MsBackendMemory()]`. #' -#' - parameter `f`: factor allowing to parallelize the change of the backends. -#' By default the process of copying the spectra data from the original to the +#' - `f`: factor allowing to parallelize the change of the backends. 
By +#' default the process of copying the spectra data from the original to the #' new backend is performed separately (and in parallel) for each file. Users #' are advised to use the default setting. #' -#' - parameter `...`: optional additional arguments passed to the -#' [backendInitialize()] method of the new `backend`. +#' - `...`: optional additional arguments passed to the [backendInitialize()] +#' method of the new `backend`. #' -#' - parameter `BPPARAM`: setup for the parallel processing. See [bpparam()] for +#' - `BPPARAM`: setup for the parallel processing. See [bpparam()] for #' details. #' +#' +#' @section Exporting data from a `Spectra` object: +#' #' Data from a `Spectra` object can be **exported** to a file with the -#' `export()` function. The actual export of the data has to be performed by +#' `export()` function. The actual export of the data is performed by #' the `export` method of the [MsBackend] class defined with the mandatory -#' parameter `backend`. Note however that not all backend classes support +#' parameter `backend` which defines also the format in which the data +#' is exported. Note however that not all backend classes support #' export of data. From the `MsBackend` classes in the `Spectra` package #' currently only the `MsBackendMzR` backend supports data export (to #' mzML/mzXML file(s)); see the help page of the [MsBackend-class] for @@ -137,2441 +210,3118 @@ NULL #' parameter `backend`. #' #' -#' @section Accessing spectra data: -#' -#' - `$`, `$<-`: gets (or sets) a spectra variable for all spectra in `object`. -#' See examples for details. Note that replacing values of a peaks variable -#' is not supported with a non-empty processing queue, i.e. if any filtering -#' or data manipulations on the peaks data was performed. In these cases -#' [applyProcessing()] needs to be called first to apply all cached data -#' operations. -#' -#' - `[[`, `[[<-`: access or set/add a single spectrum variable (column) in the -#' backend. 
+#' @details #' -#' - `acquisitionNum()`: returns the acquisition number of each -#' spectrum. Returns an `integer` of length equal to the number of -#' spectra (with `NA_integer_` if not available). +#' The `Spectra` class uses by default a lazy data manipulation strategy, +#' i.e. data manipulations such as performed with `replaceIntensitiesBelow()` +#' are not applied immediately to the data, but applied on-the-fly to the +#' spectrum data once it is retrieved. This enables data manipulation +#' operations also for *read only* data representations. For some backends that +#' allow to write data back to the data storage (such as the +#' [MsBackendMemory()], [MsBackendDataFrame()] and [MsBackendHdf5Peaks()]) it +#' is possible to apply to queue with the [applyProcessing()] function (see +#' the [applyProcessing()] function for details). #' -#' - `centroided()`, `centroided<-`: gets or sets the centroiding -#' information of the spectra. `centroided()` returns a `logical` -#' vector of length equal to the number of spectra with `TRUE` if a -#' spectrum is centroided, `FALSE` if it is in profile mode and `NA` -#' if it is undefined. See also `isCentroided()` for estimating from -#' the spectrum data whether the spectrum is centroided. `value` -#' for `centroided<-` is either a single `logical` or a `logical` of -#' length equal to the number of spectra in `object`. +#' Clarifications regarding scan/acquisition numbers and indices: #' -#' - `collisionEnergy()`, `collisionEnergy<-`: gets or sets the -#' collision energy for all spectra in `object`. `collisionEnergy()` -#' returns a `numeric` with length equal to the number of spectra -#' (`NA_real_` if not present/defined), `collisionEnergy<-` takes a -#' `numeric` of length equal to the number of spectra in `object`. 
+#' - A `spectrumId` (or `spectrumID`) is a vendor specific field in +#' the mzML file that contains some information about the +#' run/spectrum, e.g.: `controllerType=0 controllerNumber=1 +#' scan=5281 file=2` #' -#' - `coreSpectraVariables()`: returns the *core* spectra variables along with -#' their expected data type. +#' - `acquisitionNum` is a more a less sanitize spectrum id generated +#' from the `spectrumId` field by `mzR` (see +#' [here](https://github.com/sneumann/mzR/blob/master/src/pwiz/data/msdata/MSData.cpp#L552-L580)). #' -#' - `dataOrigin()`, `dataOrigin<-`: gets or sets the *data origin* for each -#' spectrum. `dataOrigin()` returns a `character` vector (same length than -#' `object`) with the origin of the spectra. `dataOrigin<-` expects a -#' `character` vector (same length than `object`) with the replacement -#' values for the data origin of each spectrum. +#' - `scanIndex` is the `mzR` generated sequence number of the +#' spectrum in the raw file (which doesn't have to be the same as +#' the `acquisitionNum`) #' -#' - `dataStorage()`: returns a `character` vector (same length than `object`) -#' with the data storage location of each spectrum. +#' See also [this issue](https://github.com/lgatto/MSnbase/issues/525). #' -#' - `intensity()`: gets the intensity values from the spectra. Returns -#' a [NumericList()] of `numeric` vectors (intensity values for each -#' spectrum). The length of the list is equal to the number of -#' `spectra` in `object`. +#' @md #' -#' - `ionCount()`: returns a `numeric` with the sum of intensities for -#' each spectrum. If the spectrum is empty (see `isEmpty()`), -#' `NA_real_` is returned. +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail #' -#' - `isCentroided()`: a heuristic approach assessing if the spectra in -#' `object` are in profile or centroided mode. 
The function takes -#' the `qtl`th quantile top peaks, then calculates the difference -#' between adjacent m/z value and returns `TRUE` if the first -#' quartile is greater than `k`. (See `Spectra:::.isCentroided()` for -#' the code.) +#' @exportClass Spectra #' -#' - `isEmpty()`: checks whether a spectrum in `object` is empty -#' (i.e. does not contain any peaks). Returns a `logical` vector of -#' length equal number of spectra. +#' @exportMethod Spectra #' -#' - `isolationWindowLowerMz()`, `isolationWindowLowerMz<-`: gets or sets the -#' lower m/z boundary of the isolation window. +#' @examples #' -#' - `isolationWindowTargetMz()`, `isolationWindowTargetMz<-`: gets or sets the -#' target m/z of the isolation window. +#' ## -------- CREATION OF SPECTRA OBJECTS -------- #' -#' - `isolationWindowUpperMz()`, `isolationWindowUpperMz<-`: gets or sets the -#' upper m/z boundary of the isolation window. +#' ## Create a Spectra providing a `DataFrame` containing the spectrum data. #' -#' - `containsMz()`: checks for each of the spectra whether they contain mass -#' peaks with an m/z equal to `mz` (given acceptable difference as defined by -#' parameters `tolerance` and `ppm` - see [common()] for details). Parameter -#' `which` allows to define whether any (`which = "any"`, the default) or -#' all (`which = "all"`) of the `mz` have to match. The function returns -#' `NA` if `mz` is of length 0 or is `NA`. +#' spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +#' spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +#' spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) #' -#' - `containsNeutralLoss()`: checks for each spectrum in `object` if it has a -#' peak with an m/z value equal to its precursor m/z - `neutralLoss` (given -#' acceptable difference as defined by parameters `tolerance` and `ppm`). -#' Returns `NA` for MS1 spectra (or spectra without a precursor m/z). 
+#' data <- Spectra(spd) +#' data #' -#' - `length()`: gets the number of spectra in the object. +#' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +#' ## backend. +#' sciex_file <- dir(system.file("sciex", package = "msdata"), +#' full.names = TRUE) +#' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +#' sciex #' -#' - `lengths()`: gets the number of peaks (m/z-intensity values) per -#' spectrum. Returns an `integer` vector (length equal to the -#' number of spectra). For empty spectra, `0` is returned. #' -#' - `msLevel()`: gets the spectra's MS level. Returns an integer vector (names -#' being spectrum names, length equal to the number of spectra) with the MS -#' level for each spectrum. +#' ## -------- CHANGING DATA REPRESENTATIONS -------- #' -#' - `mz()`: gets the mass-to-charge ratios (m/z) from the -#' spectra. Returns a [NumericList()] or length equal to the number of -#' spectra, each element a `numeric` vector with the m/z values of -#' one spectrum. +#' ## The MS data is on disk and will be read into memory on-demand. We can +#' ## however change the backend to a MsBackendMemory backend which will +#' ## keep all of the data in memory. +#' sciex_im <- setBackend(sciex, MsBackendMemory()) +#' sciex_im #' -#' - `peaksData()`: gets the *peaks* data for all spectra in `object`. Peaks -#' data consist of the m/z and intensity values as well as possible additional -#' annotations (variables) of all peaks of each spectrum. The function -#' returns a [SimpleList()] of two dimensional arrays (either `matrix` or -#' `data.frame`), with each array providing the values for the requested -#' *peak variables* (by default `"mz"` and `"intensity"`). Optional parameter -#' `columns` is passed to the backend's `peaksData()` function to allow -#' the selection of specific (or additional) peaks variables (columns) that -#' should be extracted (if available). 
Importantly, -#' it is **not** guaranteed that each backend supports this parameter (while -#' each backend must support extraction of `"mz"` and `"intensity"` columns). -#' Parameter `columns` defaults to `c("mz", "intensity")` but any value -#' returned by `peaksVariables(object)` is supported. -#' Note also that it is possible to extract the peak data with -#' `as(x, "list")` and `as(x, "SimpleList")` as a `list` and `SimpleList`, -#' respectively. Note however that, in contrast to `peaksData()`, `as()` -#' does not support the parameter `columns`. +#' ## The `MsBackendMemory()` supports the `setBackend()` method: +#' supportsSetBackend(MsBackendMemory()) #' -#' - `peaksVariables()`: lists the available variables for mass peaks provided -#' by the backend. Default peak variables are `"mz"` and `"intensity"` (which -#' all backends need to support and provide), but some backends might provide -#' additional variables. -#' These variables correspond to the column names of the peak data array -#' returned by `peaksData()`. +#' ## Thus, it is possible to change to that backend with `setBackend()`. Most +#' ## read-only backends however don't support that, such as the +#' ## `MsBackendMzR` and `setBackend()` would fail to change to that backend. +#' supportsSetBackend(MsBackendMzR()) #' -#' - `polarity()`, `polarity<-`: gets or sets the polarity for each -#' spectrum. `polarity()` returns an `integer` vector (length equal -#' to the number of spectra), with `0` and `1` representing negative -#' and positive polarities, respectively. `polarity<-` expects an -#' `integer` vector of length 1 or equal to the number of spectra. +#' ## The on-disk object `sciex` is light-weight, because it does not keep the +#' ## MS peak data in memory. The `sciex_im` object in contrast keeps all the +#' ## data in memory and its size is thus much larger. 
+#' object.size(sciex) +#' object.size(sciex_im) #' -#' - `precursorCharge()`, `precursorIntensity()`, `precursorMz()`, -#' `precScanNum()`, `precAcquisitionNum()`: gets the charge (`integer`), -#' intensity (`numeric`), m/z (`numeric`), scan index (`integer`) -#' and acquisition number (`interger`) of the precursor for MS level > -#' 2 spectra from the object. Returns a vector of length equal to -#' the number of spectra in `object`. `NA` are reported for MS1 -#' spectra of if no precursor information is available. +#' ## The spectra variable `dataStorage` returns for each spectrum the location +#' ## where the data is stored. For in-memory objects: +#' head(dataStorage(sciex_im)) #' -#' - `rtime()`, `rtime<-`: gets or sets the retention times (in seconds) -#' for each spectrum. `rtime()` returns a `numeric` vector (length -#' equal to the number of spectra) with the retention time for each -#' spectrum. `rtime<-` expects a numeric vector with length equal -#' to the number of spectra. +#' ## While objects that use an on-disk backend will list the files where the +#' ## data is stored. +#' head(dataStorage(sciex)) #' -#' - `scanIndex()`: returns an `integer` vector with the *scan index* -#' for each spectrum. This represents the relative index of the -#' spectrum within each file. Note that this can be different to the -#' `acquisitionNum` of the spectrum which represents the index of the -#' spectrum during acquisition/measurement (as reported in the mzML file). +#' ## The spectra variable `dataOrigin` returns for each spectrum the *origin* +#' ## of the data. If the data is read from e.g. mzML files, this will be the +#' ## original mzML file name: +#' head(dataOrigin(sciex)) +#' head(dataOrigin(sciex_im)) #' -#' - `smoothed()`,`smoothed<-`: gets or sets whether a spectrum is -#' *smoothed*. `smoothed()` returns a `logical` vector of length equal -#' to the number of spectra. 
`smoothed<-` takes a `logical` vector -#' of length 1 or equal to the number of spectra in `object`. #' -#' - `spectraData()`: gets general spectrum metadata (annotation, also called -#' header). `spectraData()` returns a `DataFrame`. Note that this -#' method does by default **not** return m/z or intensity values. +#' ## -------- DATA EXPORT -------- #' -#' - `spectraData<-`: **replaces** the full spectra data of the `Spectra` -#' object with the one provided with `value`. The `spectraData<-` function -#' expects a `DataFrame` to be passed as value with the same number of rows -#' as there a spectra in `object`. Note that replacing values of -#' peaks variables is not supported with a non-empty processing queue, i.e. -#' if any filtering or data manipulations on the peaks data was performed. -#' In these cases [applyProcessing()] needs to be called first to apply all -#' cached data operations and empty the processing queue. +#' ## Some `MsBackend` classes provide an `export()` method to export the data +#' ## to the file format supported by the backend. +#' ## The `MsBackendMzR` for example allows to export MS data to mzML or +#' ## mzXML file(s), the `MsBackendMgf` (defined in the MsBackendMgf R package) +#' ## would allow to export the data in mgf file format. +#' ## Below we export the MS data in `data`. We call the `export()` method on +#' ## this object, specify the backend that should be used to export the data +#' ## (and which also defines the output format) and provide a file name. +#' fl <- tempfile() +#' export(data, MsBackendMzR(), file = fl) #' -#' - `spectraNames()`, `spectraNames<-`: gets or sets the spectra names. +#' ## This exported our data in mzML format. Below we read the first 6 lines +#' ## from that file. +#' readLines(fl, n = 6) #' -#' - `spectraVariables()`: returns a `character` vector with the -#' available spectra variables (columns, fields or attributes of each -#' spectrum) available in `object`. 
Note that `spectraVariables()` does not -#' list the *peak variables* (`"mz"`, `"intensity"` and eventual additional -#' annotations for each MS peak). Peak variables are returned by -#' `peaksVariables()`. +#' ## If only a single file name is provided, all spectra are exported to that +#' ## file. To export data with the `MsBackendMzR` backend to different files, a +#' ## file name for each individual spectrum has to be provided. +#' ## Below we export each spectrum to its own file. +#' fls <- c(tempfile(), tempfile()) +#' export(data, MsBackendMzR(), file = fls) #' -#' - `tic()`: gets the total ion current/count (sum of signal of a -#' spectrum) for all spectra in `object`. By default, the value -#' reported in the original raw data file is returned. For an empty -#' spectrum, `0` is returned. +#' ## Reading the data from the first file +#' res <- Spectra(backendInitialize(MsBackendMzR(), fls[1])) #' -#' - `uniqueMsLevels()`: get the unique MS levels available in `object`. This -#' function is supposed to be more efficient than `unique(msLevel(object))`. +#' mz(res) +#' mz(data) +NULL + +#' The Spectra class #' -#' @section Data subsetting, filtering and merging: +#' The [Spectra] class encapsulates data and meta-data for mass +#' spectrometry experiments. #' -#' Subsetting and filtering of `Spectra` objects can be performed with the below -#' listed methods. +#' @slot backend A derivate of [MsBackend-class] holding/controlling the spectra +#' data. #' -#' - `[`: subsets the spectra keeping only selected elements (`i`). The method -#' **always** returns a `Spectra` object. +#' @slot processingQueue `list` of `ProcessingStep` objects. #' -#' - `cbind2()`: Appends multiple spectra variables from a `data.frame`, -#' `DataFrame` or `matrix` to the `Spectra` object at once. It does so -#' *blindly* (e.g. do not check rownames compatibility) and is therefore at -#' the risk of the user. 
For a more controlled way of adding spectra -#' variables, the `joinSpectraData()` should be used. It will return a -#' `Spectra` object with the appended spectra variables. `cbind2()` does -#' check however that the number of rows of the `data.frame` or `DataFrame` -#' matches the number of spectra in the `Spectra` object. +#' @slot processingQueueVariables `character` of spectraVariables that should +#' be passed to the processing step function. #' -#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the -#' monoisotopic peak for groups of isotopologues. Isotopologues are -#' estimated using the [isotopologues()] function from the -#' *MetaboCoreUtils* package. Note that -#' the default parameters for isotope prediction/detection have been -#' determined using data from the Human Metabolome Database (HMDB) and -#' isotopes for elements other than CHNOPS might not be detected. See -#' parameter `substDefinition` in the documentation of [isotopologues()] for -#' more information. The approach and code to define the parameters for -#' isotope prediction is described -#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). +#' @slot processing A `character` storing logging information. #' -#' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the -#' object's `spectraData` that contain only missing values (`NA`). Note that -#' while columns with only `NA`s are removed, a `spectraData()` call after -#' `dropNaSpectraVariables()` might still show columns containing `NA` values -#' for *core* spectra variables. +#' @slot metadata A `list` storing experiment metadata. #' -#' - `filterAcquisitionNum()`: filters the object keeping only spectra matching -#' the provided acquisition numbers (argument `n`). 
If `dataOrigin` or -#' `dataStorage` is also provided, `object` is subsetted to the spectra with -#' an acquisition number equal to `n` **in spectra with matching dataOrigin -#' or dataStorage values** retaining all other spectra. -#' Returns the filtered `Spectra`. +#' @slot version A `character(1)` containing the class version. #' -#' - `filterDataOrigin()`: filters the object retaining spectra matching the -#' provided `dataOrigin`. Parameter `dataOrigin` has to be of type -#' `character` and needs to match exactly the data origin value of the -#' spectra to subset. -#' Returns the filtered `Spectra` object (with spectra ordered according to -#' the provided `dataOrigin` parameter). +#' @docType class #' -#' - `filterDataStorage()`: filters the object retaining spectra stored in the -#' specified `dataStorage`. Parameter `dataStorage` has to be of type -#' `character` and needs to match exactly the data storage value of the -#' spectra to subset. -#' Returns the filtered `Spectra` object (with spectra ordered according to -#' the provided `dataStorage` parameter). +#' @author Sebastian Gibb \email{mail@@sebastiangibb.de} #' -#' - `filterEmptySpectra()`: removes empty spectra (i.e. spectra without peaks). -#' Returns the filtered `Spectra` object (with spectra in their -#' original order). +#' @importClassesFrom S4Vectors DataFrame #' -#' - `filterFourierTransformArtefacts()`: removes (Orbitrap) fast fourier -#' artefact peaks from spectra (see examples below). The function iterates -#' through all intensity ordered peaks in a spectrum and removes all peaks -#' with an m/z within +/- `halfWindowSize` of the current peak if their -#' intensity is lower than `threshold` times the current peak's intensity. 
-#' Additional parameters `keepIsotopes`, `maxCharge` and `isotopeTolerance` -#' allow to avoid removing of potential `[13]C` isotope peaks (`maxCharge` -#' being the maximum charge that should be considered and `isotopeTolerance` -#' the absolute acceptable tolerance for matching their m/z). -#' See [filterFourierTransformArtefacts()] for details and background and -#' `deisitopeSpectra()` for an alternative. +#' @importMethodsFrom S4Vectors lapply #' -#' - `filterIntensity()`: filters each spectrum keeping only peaks with -#' intensities that are within the provided range or match the criteria of -#' the provided function. For the former, parameter `intensity` has to be a -#' `numeric` defining the intensity range, for the latter a `function` that -#' takes the intensity values of the spectrum and returns a `logical` whether -#' the peak should be retained or not (see examples below for details) - -#' additional parameters to the function can be passed with `...`. To -#' remove only peaks with intensities below a certain threshold, say 100, use -#' `intensity = c(100, Inf)`. Note: also a single value can be passed with -#' the `intensity` parameter in which case an upper limit of `Inf` is used. -#' Note that this function removes also peaks with missing intensities -#' (i.e. an intensity of `NA`). Parameter `msLevel.` allows to restrict the -#' filtering to spectra of the specified MS level(s). +#' @importFrom S4Vectors DataFrame #' -#' - `filterIsolationWindow()`: retains spectra that contain `mz` in their -#' isolation window m/z range (i.e. with an `isolationWindowLowerMz` <= `mz` -#' and `isolationWindowUpperMz` >= `mz`. Returns the filtered `Spectra` -#' object (with spectra in their original order). 
+#' @noRd +setClass( + "Spectra", + slots = c( + backend = "MsBackend", + processingQueue = "list", + processingQueueVariables = "character", + ## logging + processing = "character", + ## metadata + metadata = "list", + processingChunkSize = "numeric", + version = "character" + ), + prototype = prototype(version = "0.3", + processingChunkSize = Inf) +) + +setValidity("Spectra", function(object) { + msg <- .valid_processing_queue(object@processingQueue) + if (length(msg)) msg + else TRUE +}) + +#' @rdname hidden_aliases #' -#' - `filterMsLevel()`: filters object by MS level keeping only spectra matching -#' the MS level specified with argument `msLevel`. Returns the filtered -#' `Spectra` (with spectra in their original order). +#' @importMethodsFrom methods show #' -#' - `filterMzRange()`: filters the object keeping or removing peaks in each -#' spectrum that are within the provided m/z range. Whether peaks are -#' retained or removed can be configured with parameter `keep` (default -#' `keep = TRUE`). +#' @importFrom utils capture.output #' -#' - `filterMzValues()`: filters the object keeping **all** peaks in each -#' spectrum that match the provided m/z value(s) (for `keep = TRUE`, the -#' default) or removing **all** of them (for `keep = FALSE`). The m/z -#' matching considers also the absolute `tolerance` and m/z-relative -#' `ppm` values. `tolerance` and `ppm` have to be of length 1. 
+#' @exportMethod show +setMethod("show", "Spectra", + function(object) { + cat("MSn data (", class(object)[1L], ") with ", + length(object@backend), " spectra in a ", class(object@backend), + " backend:\n", sep = "") + if (length(object@backend)) { + txt <- capture.output(show(object@backend)) + cat(txt[-1], sep = "\n") + } + if (length(object@processingQueue)) + cat("Lazy evaluation queue:", length(object@processingQueue), + "processing step(s)\n") + lp <- length(object@processing) + if (lp) { + lps <- object@processing + if (lp > 3) { + lps <- lps[1:3] + lps <- c(lps, paste0("...", lp - 3, " more processings. ", + "Use 'processingLog' to list all.")) + } + cat("Processing:\n", paste(lps, collapse="\n "), "\n") + } + }) + +#' @rdname Spectra +setMethod("Spectra", "missing", function(object, processingQueue = list(), + metadata = list(), ..., + backend = MsBackendMemory(), + BPPARAM = bpparam()) { + if (length(backend)) + new("Spectra", metadata = metadata, processingQueue = processingQueue, + backend = backend) + else callNextMethod() +}) + +#' @rdname Spectra +setMethod("Spectra", "MsBackend", function(object, processingQueue = list(), + metadata = list(), ..., + BPPARAM = bpparam()) { + new("Spectra", metadata = metadata, processingQueue = processingQueue, + backend = object) +}) + +#' @rdname Spectra #' -#' - `filterPolarity()`: filters the object keeping only spectra matching the -#' provided polarity. Returns the filtered `Spectra` (with spectra in their -#' original order). 
+#' @importFrom methods callNextMethod +setMethod("Spectra", "character", function(object, processingQueue = list(), + metadata = list(), + source = MsBackendMzR(), + backend = source, + ..., BPPARAM = bpparam()) { + sp <- .create_spectra(object, processingQueue = processingQueue, + metadata = metadata, backend = source, + ..., BPPARAM = BPPARAM) + if (class(source)[1L] != class(backend)[1L]) + setBackend(sp, backend, ..., BPPARAM = backendBpparam(backend, BPPARAM)) + else sp +}) + +#' @rdname Spectra +setMethod("Spectra", "ANY", function(object, processingQueue = list(), + metadata = list(), + source = MsBackendMemory(), + backend = source, + ..., BPPARAM = bpparam()) { + sp <- .create_spectra(object, processingQueue = processingQueue, + metadata = metadata, backend = source, + ..., BPPARAM = BPPARAM) + if (class(source)[1L] != class(backend)[1L]) + setBackend(sp, backend, ..., BPPARAM = backendBpparam(backend, BPPARAM)) + else sp +}) + +.create_spectra <- function(object, processingQueue = list(), metadata = list(), + backend = MsBackendMemory(), ..., + BPPARAM = bpparam()) { + if (missing(object)) + backend <- backendInitialize( + backend, ..., BPPARAM = backendBpparam(backend, BPPARAM)) + else backend <- backendInitialize( + backend, object, ..., BPPARAM = backendBpparam(backend, BPPARAM)) + new("Spectra", metadata = metadata, processingQueue = processingQueue, + backend = backend) +} + +#' @rdname Spectra #' -#' - `filterPrecursorCharge()`: retains spectra with the defined precursor -#' charge(s). +#' @importMethodsFrom ProtGenerics setBackend #' -#' - `filterPrecursorIsotopes()`: groups MS2 spectra based on their precursor -#' m/z and precursor intensity into predicted isotope groups and keep for each -#' only the spectrum representing the monoisotopic precursor. MS1 spectra -#' are returned as is. See documentation for `deisotopeSpectra()` below for -#' details on isotope prediction and parameter description. 
+#' @exportMethod setBackend +setMethod( + "setBackend", c("Spectra", "MsBackend"), + function(object, backend, f = processingChunkFactor(object), ..., + BPPARAM = bpparam()) { + backend_class <- class(object@backend)[1L] + BPPARAM <- backendBpparam(object@backend, BPPARAM) + BPPARAM <- backendBpparam(backend, BPPARAM) + if (!supportsSetBackend(backend)) + stop(class(backend), " does not support 'setBackend'") + if (!length(object)) { + bknds <- backendInitialize( + backend, data = spectraData(object@backend), ...) + } else { + if (!is.factor(f)) + f <- force(factor(f, levels = unique(f))) + if (length(f) && (length(levels(f)) > 1)) { + if (length(f) != length(object)) + stop("length of 'f' has to match the length of 'object'") + bknds <- bplapply( + split(object@backend, f = f), + function(z, ...) { + backendInitialize(backend, + data = spectraData(z), ..., + BPPARAM = SerialParam()) + }, ..., BPPARAM = BPPARAM) + bknds <- backendMerge(bknds) + ## That below ensures the backend is returned in its original + ## order - unsplit does unfortunately not work. + if (is.unsorted(f)) + bknds <- extractByIndex( + bknds, order(unlist(split(seq_along(bknds), f), + use.names = FALSE))) + } else { + bknds <- backendInitialize( + backend, data = spectraData(object@backend), ...) + } + } + object@backend <- bknds + object@processing <- .logging(object@processing, + "Switch backend from ", + backend_class, " to ", + class(object@backend)) + object + }) + +#' @rdname Spectra #' -#' - `filterPrecursorMaxIntensity()`: filters the `Spectra` keeping for groups -#' of (MS2) spectra with similar precursor m/z values (given parameters -#' `ppm` and `tolerance`) the one with the highest precursor intensity. The -#' function filters only MS2 spectra and returns all MS1 spectra. If -#' precursor intensities are `NA` for all spectra within a spectra group, the -#' first spectrum of that groups is returned. -#' Note: some manufacturers don't provide precursor intensities. 
These can -#' however also be estimated with [estimatePrecursorIntensity()]. +#' @export +setMethod("export", "Spectra", + function(object, backend, ...) { + if (missing(backend)) + stop("Parameter 'backend' is required.") + export(backend, object, ...) + }) + +#' @rdname Spectra +setMethod("dataStorageBasePath", "Spectra", function(object) { + dataStorageBasePath(object@backend) +}) + +#' @rdname Spectra +setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { + dataStorageBasePath(object@backend) <- value + object +}) + +################################################################################ +## +## Accessing and adding/setting/changing MS data. +## +################################################################################ + +#' @title Accessing mass spectrometry data +#' +#' @name spectraData +#' +#' @aliases acquisitionNum +#' @aliases centroided +#' @aliases collisionEnergy +#' @aliases dataOrigin +#' @aliases dataStorage +#' @aliases intensity +#' @aliases ionCount +#' @aliases isCentroided +#' @aliases isEmpty +#' @aliases isolationWindowLowerMz +#' @aliases isolationWindowUpperMz +#' @aliases isolationWindowTargetMz +#' @aliases lengths +#' @aliases msLevel +#' @aliases mz +#' @aliases peaksData +#' @aliases peaksVariables +#' @aliases polarity +#' @aliases precursorCharge +#' @aliases precursorIntensity +#' @aliases precursorMz +#' @aliases rtime +#' @aliases scanIndex +#' @aliases smoothed +#' @aliases spectraData +#' @aliases spectraNames +#' @aliases spectraVariables +#' @aliases tic +#' @aliases uniqueMsLevels #' -#' - `filterPrecursorMzRange()` (previously `filterPrecursorMz()` which is now -#' deprecated): retains spectra with a precursor m/z within the -#' provided m/z range. See examples for details on selecting spectra with -#' a precursor m/z for a target m/z accepting a small difference in *ppm*. 
+#' @description #' -#' - `filterPrecursorMzValues()`: retains spectra with precursor m/z matching -#' any of the provided m/z values (given `ppm` and `tolerance`). Spectra with -#' missing precursor m/z value (e.g. MS1 spectra) are dropped. +#' As detailed in the documentation of the [Spectra] class, a `Spectra` object +#' is a container for mass spectrometry (MS) data that includes both the mass +#' peaks data (or *peaks data*, generally *m/z* and intensity values) as well +#' as spectra metadata (so called *spectra variables*). Spectra variables +#' generally define one value per spectrum, while for peaks variables one value +#' per mass peak is defined and hence multiple values per spectrum (depending +#' on the number of mass peaks of a spectrum). #' -#' - `filterPrecursorPeaks()`: removes peaks from each spectrum in `object` with -#' an m/z equal or larger than the m/z of the precursor, depending on the -#' value of parameter `mz`: for `mz = ==" (the default) peaks with matching -#' m/z (considering an absolute and relative acceptable difference depending -#' on `tolerance` and `ppm`, respectively) are removed. For `mz = ">="` all -#' peaks with an m/z larger or equal to the precursor m/z (minus `tolerance` -#' and the `ppm` of the precursor m/z) are removed. Parameter `msLevel.` -#' allows to restrict the filter to certain MS levels (by default the filter -#' is applied to all MS levels). Note that no peaks are removed if the -#' precursor m/z is `NA` (e.g. typically for MS1 spectra). +#' Data can be extracted from a `Spectra` object using dedicated accessor +#' functions or also using the `$` operator. Depending on the backend class +#' used by the `Spectra` to represent the data, data can also be added or +#' replaced (again, using dedicated functions or using `$<-`). #' -#' - `filterPrecursorScan()`: retains parent (e.g. MS1) and children scans (e.g. -#' MS2) of acquisition number `acquisitionNum`. 
Returns the filtered
-#' `Spectra` (with spectra in their original order). Parameter `f` allows to
-#' define which spectra belong to the same sample or original data file (
-#' defaults to `f = dataOrigin(object)`).
#'
-#' - `filterRt()`: retains spectra of MS level `msLevel` with retention
-#' times (in seconds) within (`>=`) `rt[1]` and (`<=`)
-#' `rt[2]`. Returns the filtered `Spectra` (with spectra in their
-#' original order).
+#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more
+#' information. See also [processingChunkSize()] for more information
+#' on parallel processing.
#'
-#' - `filterRanges()`: allows filtering of the `Spectra` object based on user
-#' defined *numeric* ranges (parameter `ranges`) for one or more available
-#' spectra variables in object (spectra variable names can be specified with
-#' parameter `spectraVariables`). Spectra for which the value of a spectra
-#' variable is within it's defined range are retained. If multiple
-#' ranges/spectra variables are defined, the `match` parameter can be used
-#' to specify whether all conditions (`match = "all"`; the default) or if
-#' any of the conditions must match (`match = "any"`; all spectra for which
-#' values are within any of the provided ranges are retained).
+#' @param columns For `spectraData()` accessor: optional `character` with
+#' column names (spectra variables) that should be included in the
+#' returned `DataFrame`. By default, all columns are returned.
+#' For `peaksData()` accessor: optional `character` with requested columns
+#' in the individual `matrix` of the returned `list`. Defaults to
+#' `c("mz", "intensity")` but any values returned by `peaksVariables(object)`
+#' with `object` being the `Spectra` object are supported.
#'
-#' - `filterValues()`: allows filtering of the `Spectra` object based on
-#' similarities of *numeric* values of one or more `spectraVariables(object)`
-#' (parameter `spectraVariables`) to provided values (parameter `values`)
-#' given acceptable differences (parameters tolerance and ppm). If multiple
-#' values/spectra variables are defined, the `match` parameter can be used
-#' to specify whether all conditions (`match = "all"`; the default) or if
-#' any of the conditions must match (`match = "any"`; all spectra for which
-#' values are within any of the provided ranges are retained).
+#' @param f For `intensity()`, `mz()` and `peaksData()`: factor defining how
+#' data should be chunk-wise loaded and processed. Defaults to
+#' [processingChunkFactor()].
#'
-#' - `reduceSpectra()`: for groups of peaks within highly similar m/z values
-#' within each spectrum (given `ppm` and `tolerance`), this function keeps
-#' only the peak with the highest intensity removing all other peaks hence
-#' *reducing* each spectrum to the highest intensity peaks per *peak group*.
-#' Peak groups are defined using the [group()] function from the
-#' *MsCoreUtils* package.
+#' @param i For `asDataFrame()`: A `numeric` indicating which scans to coerce
+#' to a `DataFrame` (default is `seq_along(object)`).
#'
-#' - `reset()`: restores the data to its original state (as much as possible):
-#' removes any processing steps from the lazy processing queue and calls
-#' `reset()` on the backend which, depending on the backend, can also undo
-#' e.g. data filtering operations. Note that a `reset*(` call after
-#' `applyProcessing()` will not have any effect. See examples below for more
-#' information.
+#' @param initial For `tic()`: `logical(1)` whether the initially
+#' reported total ion current should be reported, or whether the
+#' total ion current should be (re)calculated on the actual data
+#' (`initial = FALSE`, same as `ionCount()`).
#' -#' - `selectSpectraVariables()`: reduces the information within the object to -#' the selected spectra variables: all data for variables not specified will -#' be dropped. For mandatory columns (i.e., those listed by -#' [coreSpectraVariables()], such as *msLevel*, *rtime* ...) only -#' the values will be dropped but not the variable itself. Additional (or -#' user defined) spectra variables will be completely removed. -#' Returns the filtered `Spectra`. +#' @param j For `[`: not supported. #' -#' - `split()`: splits the `Spectra` object based on parameter `f` into a `list` -#' of `Spectra` objects. +#' @param name For `$` and `$<-`: the name of the spectra variable to return +#' or set. #' -#' - `joinSpectraData()`: Individual spectra variables can be directly -#' added with the `$<-` or `[[<-` syntax. The `joinSpectraData()` -#' function allows to merge a `DataFrame` to the existing spectra -#' data. This function diverges from the [merge()] method in two -#' main ways: -#' - The `by.x` and `by.y` column names must be of length 1. -#' - If variable names are shared in `x` and `y`, the spectra -#' variables of `x` are not modified. It's only the `y` -#' variables that are appended the suffix defined in -#' `suffix.y`. This is to avoid modifying any core spectra -#' variables that would lead to an invalid object. -#' - Duplicated Spectra keys (i.e. `x[[by.x]]`) are not -#' allowed. Duplicated keys in the `DataFrame` (i.e `y[[by.y]]`) -#' throw a warning and only the last occurrence is kept. These -#' should be explored and ideally be removed using for -#' `QFeatures::reduceDataFrame()`, `PMS::reducePSMs()` or similar -#' functions. -#' For a more general function that allows to append `data.frame`, -#' `DataFrame` and `matrix` see `cbind2()`. +#' @param object A `Spectra` object. #' -#' Several `Spectra` objects can be concatenated into a single object with the -#' `c()` or the `concatenateSpectra()` function. 
Concatenation will fail if the -#' processing queue of any of the `Spectra` objects is not empty or if -#' different backends are used in the `Spectra` objects. The spectra variables -#' of the resulting `Spectra` object is the union of the spectra variables of -#' the individual `Spectra` objects. +#' @param spectraVars `character()` indicating what spectra variables to add to +#' the `DataFrame`. Default is `spectraVariables(object)`, i.e. all +#' available variables. #' +#' @param use.names For `lengths()`: ignored. #' -#' @section Data manipulation and analysis methods: +#' @param value A vector with values to replace the respective spectra +#' variable. Needs to be of the correct data type for the spectra variable. #' -#' Many data manipulation operations, such as those listed in this section, are -#' not applied immediately to the spectra, but added to a -#' *lazy processing/manipulation queue*. Operations stored in this queue are -#' applied on-the-fly to spectra data each time it is accessed. This lazy -#' execution guarantees the same functionality for `Spectra` objects with -#' any backend, i.e. backends supporting to save changes to spectrum data -#' ([MsBackendMemory()], [MsBackendDataFrame()] or [MsBackendHdf5Peaks()]) as -#' well as read-only backends (such as the [MsBackendMzR()]). -#' Note that for the former it is possible to apply the processing queue and -#' write the modified peak data back to the data storage with the -#' `applyProcessing()` function. +#' @param x A `Spectra` object. #' -#' - `addProcessing()`: adds an arbitrary function that should be applied to the -#' peaks matrix of every spectrum in `object`. The function (can be passed -#' with parameter `FUN`) is expected to take a peaks matrix as input and to -#' return a peaks matrix. A peaks matrix is a numeric matrix with two columns, -#' the first containing the m/z values of the peaks and the second the -#' corresponding intensities. 
The function has to have `...` in its -#' definition. Additional arguments can be passed with `...`. With parameter -#' `spectraVariables` it is possible to define additional spectra variables -#' from `object` that should be passed to the function `FUN`. These will be -#' passed by their name (e.g. specifying `spectraVariables = "precursorMz"` -#' will pass the spectra's precursor m/z as a parameter named `precursorMz` -#' to the function. The only exception is the spectra's MS level, these will -#' be passed to the function as a parameter called `spectrumMsLevel` (i.e. -#' with `spectraVariables = "msLevel"` the MS levels of each spectrum will be -#' submitted to the function as a parameter called `spectrumMsLevel`). -#' Examples are provided in the package vignette. +#' @param ... Additional arguments. #' -#' - `applyProcessing()`: for `Spectra` objects that use a **writeable** backend -#' only: apply all steps from the lazy processing queue to the peak data and -#' write it back to the data storage. Parameter `f` allows to specify how -#' `object` should be split for parallel processing. This should either be -#' equal to the `dataStorage`, or `f = rep(1, length(object))` to disable -#' parallel processing alltogether. Other partitionings might result in -#' errors (especially if a `MsBackendHdf5Peaks` backend is used). #' -#' - `bin()`: aggregates individual spectra into discrete (m/z) bins. Binning is -#' performed only on spectra of the specified MS level(s) (parameter -#' `msLevel`, by default all MS levels of `x`). The bins can be defined with -#' parameter `breaks` which by default are equally sized bins, with size -#' being defined by parameter `binSize`, from the minimal to the maximal m/z -#' of all spectra (of MS level `msLevel`) within `x`. The same bins are used -#' for all spectra in `x`. All intensity values for peaks falling into the -#' same bin are aggregated using the function provided with parameter `FUN` -#' (defaults to `FUN = sum`, i.e. 
all intensities are summed up). Note that -#' the binning operation is applied to the peak data on-the-fly upon data -#' access and it is possible to *revert* the operation with the `reset()` -#' function (see description of `reset()` above). -#' -#' - `combinePeaks()`: combines mass peaks within each spectrum with a -#' difference in their m/z values that is smaller than the maximal -#' acceptable difference defined by `ppm` and `tolerance`. Parameters -#' `intensityFun` and `mzFun` allow to define functions to aggregate the -#' intensity and m/z values for each such group of peaks. With -#' `weighted = TRUE` (the default), the m/z value of the combined peak is -#' calculated using an intensity-weighted mean and parameter `mzFun` is -#' ignored. The [MsCoreUtils::group()] function is used for the grouping of -#' mass peaks. Parameter `msLevel.` allows to define selected MS levels for -#' which peaks should be combined. This function returns a `Spectra` with -#' the same number of spectra than the input object, but with possibly -#' combined peaks within each spectrum. -# Additional peak variables (other than `"mz"` and `"intensity"`) are -#' dropped (i.e. their values are replaced with `NA`) for combined peaks -#' unless they are constant across the combined peaks. See also -#' `reduceSpectra()` for a function to select a single *representative* -#' mass peak for each peak group. -#' -#' - `combineSpectra()`: combines sets of spectra into a single spectrum per -#' set. For each spectrum group (set), spectra variables from the first -#' spectrum are used and the peak matrices are combined using the function -#' specified with `FUN`, which defaults to [combinePeaksData()]. Please -#' refer to the [combinePeaksData()] help page for details and options of -#' the actual combination of peaks across the sets of spectra and to the -#' package vignette for examples and alternative ways to aggregate spectra. -#' The sets of spectra can be specified with parameter `f`. 
-#' In addition it is possible to define, with parameter `p` if and how to -#' split the input data for parallel processing. -#' This defaults to `p = x$dataStorage` and hence a per-file parallel -#' processing is applied for `Spectra` with file-based backends (such as the -#' [MsBackendMzR()]). -#' Prior combination of the spectra all processings queued in the lazy -#' evaluation queue are applied. Be aware that calling `combineSpectra()` on a -#' `Spectra` object with certain backends that allow modifications might -#' **overwrite** the original data. This does not happen with a -#' `MsBackendMemory` or `MsBackendDataFrame` backend, but with a -#' `MsBackendHdf5Peaks` backend the m/z and intensity values in the original -#' hdf5 file(s) will be overwritten. -#' The function returns a `Spectra` of length equal to the unique levels -#' of `f`. -#' -#' - `compareSpectra()`: compares each spectrum in `x` with each spectrum in `y` -#' using the function provided with `FUN` (defaults to [ndotproduct()]). If -#' `y` is missing, each spectrum in `x` is compared with each other spectrum -#' in `x`. -#' The matching/mapping of peaks between the compared spectra is done with the -#' `MAPFUN` function. The default [joinPeaks()] matches peaks of both spectra -#' and allows to keep all peaks from the first spectrum (`type = "left"`), -#' from the second (`type = "right"`), from both (`type = "outer"`) and to -#' keep only matching peaks (`type = "inner"`); see [joinPeaks()] for more -#' information and examples). The `MAPFUN` function should have parameters -#' `x`, `y`, `xPrecursorMz` and `yPrecursorMz` as these values are passed to -#' the function. In addition to `joinPeaks()` also [joinPeaksGnps()] is -#' supported for GNPS-like similarity score calculations. Note that -#' `joinPeaksGnps()` should only be used in combination with -#' `FUN = MsCoreUtils::gnps` (see [joinPeaksGnps()] for more information and -#' details). 
Use `MAPFUN = joinPeaksNone` to disable internal peak -#' matching/mapping if a similarity scoring function is used that performs -#' the matching internally. -#' `FUN` is supposed to be a function to compare intensities of (matched) -#' peaks of the two spectra that are compared. The function needs to take two -#' matrices with columns `"mz"` and `"intensity"` as input and is supposed -#' to return a single numeric as result. In addition to the two peak matrices -#' the spectra's precursor m/z values are passed to the function as parameters -#' `xPrecursorMz` (precursor m/z of the `x` peak matrix) and `yPrecursorMz` -#' (precursor m/z of the `y` peak matrix). Additional parameters to functions -#' `FUN` and `MAPFUN` can be passed with `...`. Parameters `ppm` and -#' `tolerance` are passed to both `MAPFUN` and `FUN`. -#' The function returns a `matrix` with the results of `FUN` for each -#' comparison, number of rows equal to `length(x)` and number of columns -#' equal `length(y)` (i.e. element in row 2 and column 3 is the result from -#' the comparison of `x[2]` with `y[3]`). If `SIMPLIFY = TRUE` the `matrix` -#' is *simplified* to a `numeric` if length of `x` or `y` is one. See also -#' the vignette for additional examples, such as using spectral entropy -#' similarity in the scoring. +#' @section Spectra variables: +#' +#' A common set of *core spectra variables* are defined for `Spectra`. These +#' have a pre-defined data type and each `Spectra` will return a value for +#' these if requested. If no value for a spectra variable is defined, a missing +#' value (of the correct data type) is returned. The list of core spectra +#' variables and their respective data type is: +#' +#' - *acquisitionNum* `integer(1)`: the index of acquisition of a spectrum +#' during an MS run. +#' - *centroided* `logical(1)`: whether the spectrum is in profile or centroid +#' mode. +#' - *collisionEnergy* `numeric(1)`: collision energy used to create an MSn +#' spectrum. 
+#' - *dataOrigin* `character(1)`: the *origin* of the spectrum's data, e.g. the
+#' mzML file from which it was read.
+#' - *dataStorage* `character(1)`: the (current) storage location of the
+#' spectrum data. This value depends on the backend used to handle and
+#' provide the data. For an *in-memory* backend like the `MsBackendDataFrame`
+#' this will be `"<memory>"`, for an on-disk backend such as the
+#' `MsBackendHdf5Peaks` it will be the name of the HDF5 file where the
+#' spectrum's peak data is stored.
+#' - *isolationWindowLowerMz* `numeric(1)`: lower m/z for the isolation
+#' window in which the (MSn) spectrum was measured.
+#' - *isolationWindowTargetMz* `numeric(1)`: the target m/z for the isolation
+#' window in which the (MSn) spectrum was measured.
+#' - *isolationWindowUpperMz* `numeric(1)`: upper m/z for the isolation window
+#' in which the (MSn) spectrum was measured.
+#' - *msLevel* `integer(1)`: the MS level of the spectrum.
+#' - *polarity* `integer(1)`: the polarity of the spectrum (`0` and `1`
+#' representing negative and positive polarity, respectively).
+#' - *precScanNum* `integer(1)`: the scan (acquisition) number of the precursor
+#' for an MSn spectrum.
+#' - *precursorCharge* `integer(1)`: the charge of the precursor of an MSn
+#' spectrum.
+#' - *precursorIntensity* `numeric(1)`: the intensity of the precursor of an
+#' MSn spectrum.
+#' - *precursorMz* `numeric(1)`: the m/z of the precursor of an MSn spectrum.
+#' - *rtime* `numeric(1)`: the retention time of a spectrum.
+#' - *scanIndex* `integer(1)`: the index of a spectrum within a (raw) file.
+#' - *smoothed* `logical(1)`: whether the spectrum was smoothed.
+#'
+#' For each of these spectra variables a dedicated accessor function is defined
+#' (such as `msLevel()` or `rtime()`) that allows to extract the values of
+#' that spectra variable for all spectra in a `Spectra` object. 
Also,
+#' replacement functions are defined, but not all backends might support
+#' replacing values for spectra variables. As described above, additional
+#' spectra variables can be defined or added. The `spectraVariables()` function
+#' can be used to list the names of all available spectra variables.
+#'
+#' Values for multiple spectra variables, or *all* spectra variables, can be
+#' extracted with the `spectraData()` function.
+#'
+#'
+#' @section Peaks variables:
+#'
+#' `Spectra` also provide mass peak data with the *m/z* and intensity values
+#' being the *core* peaks variables:
+#'
+#' - *intensity* `numeric`: intensity values for the spectrum's peaks.
+#' - *mz* `numeric`: the m/z values for the spectrum's peaks.
+#'
+#' Values for these can be extracted with the `mz()` and `intensity()`
+#' functions, or the `peaksData()` function. The former functions return a
+#' `NumericList` with the respective values, while the latter returns a `List`
+#' with `numeric` two-column matrices. The list of peaks matrices can also
+#' be extracted using `as(x, "list")` or `as(x, "SimpleList")` with `x` being
+#' a `Spectra` object.
+#'
+#' Some `Spectra`/backends provide also values for additional peaks variables.
+#' The set of available peaks variables can be extracted with the
+#' `peaksVariables()` function.
+#'
+#'
+#' @section Functions to access MS data:
+#'
+#' The set of available functions to extract data from, or set data in, a
+#' `Spectra` object are (in alphabetical order) listed below. Note that there
+#' are also other functions to extract information from a `Spectra` object
+#' documented in [addProcessing()].
 #'
-#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the
-#' monoisotopic peak for groups of isotopologues. Isotopologues are
-#' estimated using the [isotopologues()] function from the *MetaboCoreUtils*
-#' package. 
Note that the default parameters for isotope -#' prediction/detection have been determined using data from the Human -#' Metabolome Database (HMDB) and isotopes for elements other than CHNOPS -#' might not be detected. See parameter `substDefinition` in the -#' documentation of [isotopologues()] for more information. The approach -#' and code to define the parameters for isotope prediction is described -#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). +#' - `$`, `$<-`: gets (or sets) a spectra variable for all spectra in `object`. +#' See examples for details. Note that replacing values of a peaks variable +#' is not supported with a non-empty processing queue, i.e. if any filtering +#' or data manipulations on the peaks data was performed. In these cases +#' [applyProcessing()] needs to be called first to apply all cached data +#' operations. #' -#' - `entropy()`: calculates the entropy of each spectra based on the metrics -#' suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). -#' See also [nentropy()] in the *MsCoreUtils* package for details. +#' - `[[`, `[[<-`: access or set/add a single spectrum variable (column) in the +#' backend. #' -#' - `estimatePrecursorIntensity()`: defines the precursor intensities for MS2 -#' spectra using the intensity of the matching MS1 peak from the -#' closest MS1 spectrum (i.e. the last MS1 spectrum measured before the -#' respective MS2 spectrum). With `method = "interpolation"` it is also -#' possible to calculate the precursor intensity based on an interpolation of -#' intensity values (and retention times) of the matching MS1 peaks from the -#' previous and next MS1 spectrum. See [estimatePrecursorIntensity()] for -#' examples and more details. +#' - `acquisitionNum()`: returns the acquisition number of each +#' spectrum. Returns an `integer` of length equal to the number of +#' spectra (with `NA_integer_` if not available). 
 #'
-#' - `estimatePrecursorMz()`: **for DDA data**: allows to estimate a fragment
-#' spectra's precursor m/z based on the reported precursor m/z and the data
-#' from the previous MS1 spectrum. See [estimatePrecursorMz()] for details.
+#' - `asDataFrame()`: converts the `Spectra` to a `DataFrame` (in long format)
+#' containing all data. Returns a `DataFrame`.
 #'
-#' - `neutralLoss()`: calculates neutral loss spectra for fragment spectra. See
-#' [neutralLoss()] for detailed documentation.
+#' - `centroided()`, `centroided<-`: gets or sets the centroiding
+#' information of the spectra. `centroided()` returns a `logical`
+#' vector of length equal to the number of spectra with `TRUE` if a
+#' spectrum is centroided, `FALSE` if it is in profile mode and `NA`
+#' if it is undefined. See also `isCentroided()` for estimating from
+#' the spectrum data whether the spectrum is centroided. `value`
+#' for `centroided<-` is either a single `logical` or a `logical` of
+#' length equal to the number of spectra in `object`.
 #'
-#' - `processingLog()`: returns a `character` vector with the processing log
-#' messages.
+#' - `collisionEnergy()`, `collisionEnergy<-`: gets or sets the
+#' collision energy for all spectra in `object`. `collisionEnergy()`
+#' returns a `numeric` with length equal to the number of spectra
+#' (`NA_real_` if not present/defined), `collisionEnergy<-` takes a
+#' `numeric` of length equal to the number of spectra in `object`.
 #'
-#' - `reduceSpectra()`: keeps for groups of peaks with similar m/z values in
-#' (given `ppm` and `tolerance`) in each spectrum only the peak with the
-#' highest intensity removing all other peaks hence *reducing* each
-#' spectrum to the highest intensity peaks per *peak group*.
-#' Peak groups are defined using the [group()] function from the
-#' *MsCoreUtils* package. See also the `combinePeaks()` function for an
-#' alternative function to combine peaks within each spectrum. 
+#' - `coreSpectraVariables()`: returns the *core* spectra variables along with +#' their expected data type. #' -#' - `scalePeaks()`: scales intensities of peaks within each spectrum depending -#' on parameter `by`. With `by = sum` (the default) peak intensities are -#' divided by the sum of peak intensities within each spectrum. The sum of -#' intensities is thus 1 for each spectrum after scaling. Parameter -#' `msLevel.` allows to apply the scaling of spectra of a certain MS level. -#' By default (`msLevel. = uniqueMsLevels(x)`) intensities for all -#' spectra will be scaled. +#' - `dataOrigin()`, `dataOrigin<-`: gets or sets the *data origin* for each +#' spectrum. `dataOrigin()` returns a `character` vector (same length than +#' `object`) with the origin of the spectra. `dataOrigin<-` expects a +#' `character` vector (same length than `object`) with the replacement +#' values for the data origin of each spectrum. #' -#' - `spectrapply()`: applies a given function to each individual spectrum or -#' sets of a `Spectra` object. By default, the `Spectra` is split into -#' individual spectra (i.e. `Spectra` of length 1) and the function `FUN` -#' is applied to each of them. An alternative splitting can be defined with -#' parameter `f`. Parameters for `FUN` can be passed using `...`. -#' The returned result and its order depend on the function `FUN` and how -#' `object` is split (hence on `f`, if provided). Parallel processing is -#' supported and can be configured with parameter `BPPARAM`, is however only -#' suggested for computational intense `FUN`. -#' As an alternative to the (eventual parallel) processing of the full -#' `Spectra`, `spectrapply()` supports also a chunk-wise processing. For this, -#' parameter `chunkSize` needs to be specified. `object` is then split into -#' chunks of size `chunkSize` which are then (stepwise) processed by `FUN`. 
-#' This guarantees a lower memory demand (especially for on-disk backends) -#' since only the data for one chunk needs to be loaded into memory in each -#' iteration. Note that by specifying `chunkSize`, parameters `f` and -#' `BPPARAM` will be ignored. -#' See also [chunkapply()] or examples below for details on chunk-wise -#' processing. +#' - `dataStorage()`: returns a `character` vector (same length than `object`) +#' with the data storage location of each spectrum. #' -#' - `smooth()`: smooths individual spectra using a moving window-based approach -#' (window size = `2 * halfWindowSize`). Currently, the -#' Moving-Average- (`method = "MovingAverage"`), -#' Weighted-Moving-Average- (`method = "WeightedMovingAverage")`, -#' weights depending on the distance of the center and calculated -#' `1/2^(-halfWindowSize:halfWindowSize)`) and -#' Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported. -#' For details how to choose the correct `halfWindowSize` please see -#' [`MsCoreUtils::smooth()`]. +#' - `intensity()`: gets the intensity values from the spectra. Returns +#' a [NumericList()] of `numeric` vectors (intensity values for each +#' spectrum). The length of the list is equal to the number of +#' `spectra` in `object`. #' -#' - `pickPeaks()`: picks peaks on individual spectra using a moving -#' window-based approach (window size = `2 * halfWindowSize`). For noisy -#' spectra there are currently two different noise estimators available, -#' the *M*edian *A*bsolute *D*eviation (`method = "MAD"`) and -#' Friedman's Super Smoother (`method = "SuperSmoother"`), -#' as implemented in the [`MsCoreUtils::noise()`]. -#' The method supports also to optionally *refine* the m/z value of -#' the identified centroids by considering data points that belong (most -#' likely) to the same mass peak. Therefore the m/z value is calculated as an -#' intensity weighted average of the m/z values within the peak region. 
-#' The peak region is defined as the m/z values (and their respective -#' intensities) of the `2 * k` closest signals to the centroid or the closest -#' valleys (`descending = TRUE`) in the `2 * k` region. For the latter the `k` -#' has to be chosen general larger. See [`MsCoreUtils::refineCentroids()`] for -#' details. -#' If the ratio of the signal to the highest intensity of the peak is below -#' `threshold` it will be ignored for the weighted average. +#' - `ionCount()`: returns a `numeric` with the sum of intensities for +#' each spectrum. If the spectrum is empty (see `isEmpty()`), +#' `NA_real_` is returned. #' -#' - `replaceIntensitiesBelow()`: replaces intensities below a specified -#' threshold with the provided `value`. Parameter `threshold` can be either -#' a single numeric value or a function which is applied to all non-`NA` -#' intensities of each spectrum to determine a threshold value for each -#' spectrum. The default is `threshold = min` which replaces all values -#' which are <= the minimum intensity in a spectrum with `value` (the -#' default for `value` is `0`). Note that the function specified with -#' `threshold` is expected to have a parameter `na.rm` since `na.rm = TRUE` -#' will be passed to the function. If the spectrum is in profile mode, -#' ranges of successive non-0 peaks <= `threshold` are set to 0. -#' Parameter `msLevel.` allows to apply this to only spectra of certain MS -#' level(s). +#' - `isCentroided()`: a heuristic approach assessing if the spectra in +#' `object` are in profile or centroided mode. The function takes +#' the `qtl`th quantile top peaks, then calculates the difference +#' between adjacent m/z value and returns `TRUE` if the first +#' quartile is greater than `k`. (See `Spectra:::.isCentroided()` for +#' the code.) #' +#' - `isEmpty()`: checks whether a spectrum in `object` is empty +#' (i.e. does not contain any peaks). Returns a `logical` vector of +#' length equal number of spectra. 
#' -#' @return See individual method description for the return value. +#' - `isolationWindowLowerMz()`, `isolationWindowLowerMz<-`: gets or sets the +#' lower m/z boundary of the isolation window. #' -#' @param acquisitionNum for `filterPrecursorScan()`: `integer` with the -#' acquisition number of the spectra to which the object should be -#' subsetted. +#' - `isolationWindowTargetMz()`, `isolationWindowTargetMz<-`: gets or sets the +#' target m/z of the isolation window. #' -#' @param backend For `Spectra()`: [MsBackend-class] to be used as backend. See -#' section on creation of `Spectra` objects for details. For `setBackend()`: -#' instance of [MsBackend-class] that supports `setBackend()` (i.e. for -#' which `supportsSetBackend()` returns `TRUE`). Such backends have a -#' parameter `data` in their `backendInitialize()` function that support -#' passing the full spectra data to the initialize method. See section on -#' creation of `Spectra` objects for details. -#' For `export()`: [MsBackend-class] to be used to export the data. +#' - `isolationWindowUpperMz()`, `isolationWindowUpperMz<-`: gets or sets the +#' upper m/z boundary of the isolation window. #' -#' @param binSize For `bin()`: `numeric(1)` defining the size for the m/z bins. -#' Defaults to `binSize = 1`. +#' - `length()`: gets the number of spectra in the object. #' -#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more -#' information. This is passed directly to the [backendInitialize()] method -#' of the [MsBackend-class]. +#' - `lengths()`: gets the number of peaks (m/z-intensity values) per +#' spectrum. Returns an `integer` vector (length equal to the +#' number of spectra). For empty spectra, `0` is returned. #' -#' @param breaks For `bin()`: `numeric` defining the m/z breakpoints between -#' bins. +#' - `msLevel()`: gets the spectra's MS level. 
Returns an integer vector (names
+#' being spectrum names, length equal to the number of spectra) with the MS
+#' level for each spectrum.
 #'
-#' @param by For `scalePeaks()`: function to calculate a single `numeric` from
-#' intensity values of a spectrum by which all intensities (of
-#' that spectrum) should be divided by. The default `by = sum` will
-#' divide intensities of each spectrum by the sum of intensities of that
-#' spectrum.
+#' - `mz()`: gets the mass-to-charge ratios (m/z) from the
+#' spectra. Returns a [NumericList()] of length equal to the number of
+#' spectra, each element a `numeric` vector with the m/z values of
+#' one spectrum.
 #'
-#' @param by.x A `character(1)` specifying the spectra variable used
-#' for merging. Default is `"spectrumId"`.
+#' - `peaksData()`: gets the *peaks* data for all spectra in `object`. Peaks
+#' data consist of the m/z and intensity values as well as possible additional
+#' annotations (variables) of all peaks of each spectrum. The function
+#' returns a [SimpleList()] of two dimensional arrays (either `matrix` or
+#' `data.frame`), with each array providing the values for the requested
+#' *peak variables* (by default `"mz"` and `"intensity"`). Optional parameter
+#' `columns` is passed to the backend's `peaksData()` function to allow
+#' the selection of specific (or additional) peaks variables (columns) that
+#' should be extracted (if available). Importantly,
+#' it is **not** guaranteed that each backend supports this parameter (while
+#' each backend must support extraction of `"mz"` and `"intensity"` columns).
+#' Parameter `columns` defaults to `c("mz", "intensity")` but any value
+#' returned by `peaksVariables(object)` is supported.
+#' Note also that it is possible to extract the peak data with
+#' `as(x, "list")` and `as(x, "SimpleList")` as a `list` and `SimpleList`,
+#' respectively. Note however that, in contrast to `peaksData()`, `as()`
+#' does not support the parameter `columns`. 
 #'
-#' @param by.y A `character(1)` specifying the column used for
-#' merging. Set to `by.x` if missing.
+#' - `peaksVariables()`: lists the available variables for mass peaks provided
+#' by the backend. Default peak variables are `"mz"` and `"intensity"` (which
+#' all backends need to support and provide), but some backends might provide
+#' additional variables.
+#' These variables correspond to the column names of the peak data array
+#' returned by `peaksData()`.
 #'
-#' @param charge For `deisotopeSpectra()`: expected charge of the ionized
-#' compounds. See [isotopologues()] for details.
+#' - `polarity()`, `polarity<-`: gets or sets the polarity for each
+#' spectrum. `polarity()` returns an `integer` vector (length equal
+#' to the number of spectra), with `0` and `1` representing negative
+#' and positive polarities, respectively. `polarity<-` expects an
+#' `integer` vector of length 1 or equal to the number of spectra.
 #'
-#' @param chunkSize For `spectrapply()`: size of the chunks into which `Spectra`
-#' should be split. This parameter overrides parameters `f` and `BPPARAM`.
+#' - `precursorCharge()`, `precursorIntensity()`, `precursorMz()`,
+#' `precScanNum()`, `precAcquisitionNum()`: gets the charge (`integer`),
+#' intensity (`numeric`), m/z (`numeric`), scan index (`integer`)
+#' and acquisition number (`integer`) of the precursor for MS level >
+#' 1 spectra from the object. Returns a vector of length equal to
+#' the number of spectra in `object`. `NA` are reported for MS1
+#' spectra or if no precursor information is available.
 #'
-#' @param columns For `spectraData()` accessor: optional `character` with
-#' column names (spectra variables) that should be included in the
-#' returned `DataFrame`. By default, all columns are returned.
-#' For `peaksData()` accessor: optional `character` with requested columns
-#' in the individual `matrix` of the returned `list`. 
Defaults to -#' `c("mz", "value")` but any values returned by `peaksVariables(object)` -#' with `object` being the `Spectra` object are supported. +#' - `rtime()`, `rtime<-`: gets or sets the retention times (in seconds) +#' for each spectrum. `rtime()` returns a `numeric` vector (length +#' equal to the number of spectra) with the retention time for each +#' spectrum. `rtime<-` expects a numeric vector with length equal +#' to the number of spectra. #' -#' @param match For `filterRanges()` and `filterValues()`: `character(1) ` -#' defining whether the condition has to match for all provided -#' `ranges`/`values` (`match = "all"`; the default), or for any of them -#' (`match = "any"`) for spectra to be retained. +#' - `scanIndex()`: returns an `integer` vector with the *scan index* +#' for each spectrum. This represents the relative index of the +#' spectrum within each file. Note that this can be different to the +#' `acquisitionNum` of the spectrum which represents the index of the +#' spectrum during acquisition/measurement (as reported in the mzML file). #' -#' @param dataOrigin For `filterDataOrigin()`: `character` to define which -#' spectra to keep. -#' For `filterAcquisitionNum()`: optionally specify if filtering should -#' occurr only for spectra of selected `dataOrigin`. +#' - `smoothed()`,`smoothed<-`: gets or sets whether a spectrum is +#' *smoothed*. `smoothed()` returns a `logical` vector of length equal +#' to the number of spectra. `smoothed<-` takes a `logical` vector +#' of length 1 or equal to the number of spectra in `object`. #' -#' @param dataStorage For `filterDataStorage()`: `character` to define which -#' spectra to keep. -#' For `filterAcquisitionNum()`: optionally specify if filtering should -#' occur only for spectra of selected `dataStorage`. +#' - `spectraData()`: gets general spectrum metadata (annotation, also called +#' header). `spectraData()` returns a `DataFrame`. 
Note that this
+#' method does by default **not** return m/z or intensity values.
 #'
-#' @param descending For `pickPeaks()`: `logical`, if `TRUE` just values between
-#' the nearest valleys around the peak centroids are used.
-#
-#' @param drop For `[`, `split()`: not considered.
+#' - `spectraData<-`: **replaces** the full spectra data of the `Spectra`
+#' object with the one provided with `value`. The `spectraData<-` function
+#' expects a `DataFrame` to be passed as value with the same number of rows
+#' as there are spectra in `object`. Note that replacing values of
+#' peaks variables is not supported with a non-empty processing queue, i.e.
+#' if any filtering or data manipulations on the peaks data were performed.
+#' In these cases [applyProcessing()] needs to be called first to apply all
+#' cached data operations and empty the processing queue.
 #'
-#' @param f For `split()`: factor defining how to split `x`. See [base::split()]
-#' for details. For `setBackend()`: factor defining how to split the data
-#' for parallelized copying of the spectra data to the new backend. For some
-#' backends changing this parameter can lead to errors.
-#' For `combineSpectra()`: `factor` defining the grouping of the spectra
-#' that should be combined. For `spectrapply()`: `factor` how `object`
-#' should be splitted. For `filterPrecursorScan()`: defining which spectra
-#' belong to the same original data file (sample): Defaults to
-#' `f = dataOrigin(x)`.
-#' For `intensity()`, `mz()` and `peaksData()`: factor defining how data
-#' should be chunk-wise loaded an processed. Defaults to
-#' [processingChunkFactor()].
+#' - `spectraNames()`, `spectraNames<-`: gets or sets the spectra names.
 #'
-#' @param FUN For `addProcessing()`: function to be applied to the peak matrix
-#' of each spectrum in `object`. For `compareSpectra()`: function to compare
-#' intensities of peaks between two spectra with each other. 
-#' For `combineSpectra()`: function to combine the (peak matrices) of the -#' spectra. See section *Data manipulations* and examples below for more -#' details. -#' For `bin()`: function to aggregate intensity values of peaks falling -#' into the same bin. Defaults to `FUN = sum` thus summing up intensities. -#' For `spectrapply()` and `chunkapply()`: function to be applied to -#' `Spectra`. +#' - `spectraVariables()`: returns a `character` vector with the +#' available spectra variables (columns, fields or attributes of each +#' spectrum) available in `object`. Note that `spectraVariables()` does not +#' list the *peak variables* (`"mz"`, `"intensity"` and eventual additional +#' annotations for each MS peak). Peak variables are returned by +#' `peaksVariables()`. #' -#' @param halfWindowSize -#' - For `pickPeaks()`: `integer(1)`, used in the -#' identification of the mass peaks: a local maximum has to be the maximum -#' in the window from `(i - halfWindowSize):(i + halfWindowSize)`. -#' - For `smooth()`: `integer(1)`, used in the smoothing algorithm, the -#' window reaches from `(i - halfWindowSize):(i + halfWindowSize)`. -#' - For `filterFourierTransformArtefacts()`: `numeric(1)` defining the m/z -#' window left and right of a peak where to remove fourier transform -#' artefacts. +#' - `tic()`: gets the total ion current/count (sum of signal of a +#' spectrum) for all spectra in `object`. By default, the value +#' reported in the original raw data file is returned. For an empty +#' spectrum, `0` is returned. #' -#' @param i For `[`: `integer`, `logical` or `character` to subset the object. +#' - `uniqueMsLevels()`: get the unique MS levels available in `object`. This +#' function is supposed to be more efficient than `unique(msLevel(object))`. #' -#' @param j For `[`: not supported. 
+#' @md #' -#' @param initial For `tic()`: `logical(1)` whether the initially -#' reported total ion current should be reported, or whether the -#' total ion current should be (re)calculated on the actual data -#' (`initial = FALSE`, same as `ionCount()`). +#' @seealso #' -#' @param intensity For `filterIntensity()`: `numeric` of length 1 or 2 -#' defining either the lower or the lower and upper intensity limit for the -#' filtering, or a `function` that takes the intensities as input and -#' returns a `logical` (same length then peaks in the spectrum) whether the -#' peak should be retained or not. Defaults to `intensity = c(0, Inf)` thus -#' only peaks with `NA` intensity are removed. +#' - [addProcessing()] for functions to analyze `Spectra`. #' -#' @param intensityFun For `combinePeaks()`: function to be used to aggregate -#' intensities for all peaks in each peak group into a single intensity -#' value. +#' - [Spectra] for a general description of the `Spectra` object. #' -#' @param isotopeTolerance For `filterFourierTransformArtefacts()`: the m/z -#' `tolerance` to be used to define whether peaks might be isotopes of -#' the current tested peak. +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail #' -#' @param k For `pickPeaks()`: `integer(1)`, number of values left and right of -#' the peak that should be considered in the weighted mean calculation. +#' @examples #' -#' @param keep For `filterMzValues()` and `filterMzRange()`: `logical(1)` -#' whether the matching peaks should be retained (`keep = TRUE`, the -#' default) or dropped (`keep = FALSE`). +#' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +#' ## backend. +#' sciex_file <- dir(system.file("sciex", package = "msdata"), +#' full.names = TRUE) +#' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +#' sciex #' -#' @param keepIsotopes For `filterFourierTransformArtefacts()`: whether isotope -#' peaks should not be removed as fourier artefacts. 
+#' ## Get the number of spectra in the data set +#' length(sciex) #' -#' @param maxCharge For `filterFourierTransformArtefacts()`: the maximum charge -#' to be considered for isotopes. +#' ## Get the number of mass peaks per spectrum - limit to the first 6 +#' lengths(sciex) |> head() #' -#' @param MAPFUN For `compareSpectra()`: function to map/match peaks between the -#' two compared spectra. See [joinPeaks()] for more information and possible -#' functions. +#' ## Get the MS level for each spectrum - limit to the first 6 spectra +#' msLevel(sciex) |> head() #' -#' @param method -#' - For `pickPeaks()`: `character(1)`, the noise estimators that -#' should be used, currently the the *M*edian *A*bsolute *D*eviation -#' (`method = "MAD"`) and Friedman's Super Smoother -#' (`method = "SuperSmoother"`) are supported. -#' - For `smooth()`: `character(1)`, the smoothing function that should be -#' used, currently, the Moving-Average- (`method = "MovingAverage"`), -#' Weighted-Moving-Average- (`method = "WeightedMovingAverage")`, -#' Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported. +#' ## Alternatively, we could also use $ to access a specific spectra variable. +#' ## This could also be used to add additional spectra variables to the +#' ## object (see further below). +#' sciex$msLevel |> head() #' -#' @param metadata For `Spectra()`: optional `list` with metadata information. +#' ## Get the intensity and m/z values. +#' intensity(sciex) +#' mz(sciex) #' -#' @param msLevel. `integer` defining the MS level(s) of the spectra to which -#' the function should be applied (defaults to all MS levels of `object`. -#' For `filterMsLevel()`: the MS level to which `object` should be -#' subsetted. +#' ## Convert a subset of the Spectra object to a long DataFrame. +#' asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) #' -#' @param mz For `filterIsolationWindow()`: `numeric(1)` with the m/z value to -#' filter the object. 
For `filterPrecursorMz()` and `filterMzRange()`: -#' `numeric(2)` defining the lower and upper m/z boundary. -#' For `filterMzValues()` and `filterPrecursorMzValues()`: `numeric` with -#' the m/z values to match peaks or precursor m/z against. +#' ## Create a Spectra providing a `DataFrame` containing the spectrum data. #' -#' @param mzFun For `combinePeaks()`: function to aggregate m/z values for all -#' peaks within each peak group into a single m/z value. This parameter -#' is ignored if `weighted = TRUE` (the default). +#' spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +#' spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +#' spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) #' -#' @param n for `filterAcquisitionNum()`: `integer` with the acquisition -#' numbers to filter for. +#' s <- Spectra(spd) +#' s #' -#' @param name For `$` and `$<-`: the name of the spectra variable to return -#' or set. +#' ## List all available spectra variables (i.e. spectrum data and metadata). +#' spectraVariables(s) #' -#' @param neutralLoss for `containsNeutralLoss()`: `numeric(1)` defining the -#' value which should be subtracted from the spectrum's precursor m/z. +#' ## For all *core* spectrum variables accessor functions are available. These +#' ## return NA if the variable was not set. +#' centroided(s) +#' dataStorage(s) +#' rtime(s) +#' precursorMz(s) #' -#' @param normalized for `entropy()`: `logical(1)` whether the normalized -#' entropy should be calculated (default). See also [nentropy()] for -#' details. +#' ## The core spectra variables are: +#' coreSpectraVariables() #' -#' @param object For `Spectra()`: either a `DataFrame` or `missing`. See -#' section on creation of `Spectra` objects for details. For all other -#' methods a `Spectra` object. +#' ## Add an additional metadata column. 
+#' s$spectrum_id <- c("sp_1", "sp_2") #' -#' @param p For `combineSpectra()`: `factor` defining how to split the input -#' `Spectra` for parallel processing. Defaults to `x$dataStorage`, i.e., -#' depending on the used backend, per-file parallel processing will be -#' performed. +#' ## List spectra variables, "spectrum_id" is now also listed +#' spectraVariables(s) #' -#' @param polarity for `filterPolarity()`: `integer` specifying the polarity to -#' to subset `object`. -#' -#' @param ppm For `compareSpectra()`, `containsMz()`, `deisotopeSpectra()`, -#' `filterMzValues()` and `reduceSpectra()`: `numeric(1)` -#' defining a relative, m/z-dependent, maximal accepted difference between -#' m/z values for peaks to be matched (or grouped). -#' For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the relative -#' maximal accepted difference of precursor m/z values of spectra for -#' grouping them into *precursor groups*. For `filterPrecursorIsotopes()`: -#' passed directly to the [isotopologues()] function. -#' For `filterValues()`: `numeric` of any length allowing to define -#' a maximal accepted difference between user input `values` and the -#' `spectraVariables` values. If it is not equal to the length of the -#' value provided with parameter `spectraVariables`, `ppm[1]` will be -#' recycled. -#' -#' @param processingQueue For `Spectra()`: optional `list` of -#' [ProcessingStep-class] objects. -#' -#' @param ranges for `filterRanges()`: A `numeric` vector of paired values -#' (upper and lower boundary) that define the ranges to filter the `object`. -#' These paired values need to be in the same order as the -#' `spectraVariables` parameter (see below). -#' -#' @param rt for `filterRt()`: `numeric(2)` defining the retention time range to -#' be used to subset/filter `object`. -#' -#' @param SIMPLIFY For `compareSpectra()` whether the result matrix should be -#' *simplified* to a `numeric` if possible (i.e. if either `x` or `y` is -#' of length 1). 
-#' -#' @param snr For `pickPeaks()`: `double(1)` defining the -#' *S*ignal-to-*N*oise-*R*atio. The intensity of a local maximum has to be -#' higher than `snr * noise` to be considered as peak. -#' -#' @param source For `Spectra()`: instance of [MsBackend-class] that can be used -#' to import spectrum data from the provided files. See section *Creation -#' of objects, conversion and changing the backend* for more details. -#' -#' @param spectraVariables -#' - For `selectSpectraVariables()`: `character` with the -#' names of the spectra variables to which the backend should be -#' subsetted. -#' - For `addProcessing()`: `character` with additional spectra variables -#' that should be passed along to the function defined with `FUN`. See -#' function description for details. -#' - For `filterRanges()` and `filterValues()`: `character` vector -#' specifying the column(s) from `spectraData(object)` on which to filter -#' the data and that correspond to the the names of the spectra variables -#' that should be used for the filtering. -#' -#' @param substDefinition For `deisotopeSpectra()` and -#' `filterPrecursorIsotopes()`: `matrix` or `data.frame` with definitions -#' of isotopic substitutions. Uses by default isotopic substitutions -#' defined from all compounds in the Human Metabolome Database (HMDB). See -#' [isotopologues()] or [isotopicSubstitutionMatrix()] for details. -#' -#' @param suffix.y A `character(1)` specifying the suffix to be used -#' for making the names of columns in the merged spectra variables -#' unique. This suffix will be used to amend `names(y)`, while -#' `spectraVariables(x)` will remain unchanged. -#' -#' @param tolerance For `compareSpectra()`, `containsMz()`, -#' `deisotopeSpectra()`, `filterMzValues()` and `reduceSpectra()`: -#' `numeric(1)` allowing to define a constant maximal accepted difference -#' between m/z values for peaks to be matched (or grouped). 
For -#' `containsMz()` it can also be of length equal `mz` to specify a different -#' tolerance for each m/z value. -#' For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the -#' (constant) maximal accepted difference of precursor m/z values of -#' spectra for grouping them into *precursor groups*. For -#' `filterPrecursorIsotopes()`: passed directly to the [isotopologues()] -#' function. For `filterValues()`: `numeric` of any length allowing to -#' define a maximal accepted difference between user input `values` and the -#' `spectraVariables` values. If it is not equal to the length of the -#' value provided with parameter `spectraVariables`, `tolerance[1]` will be -#' recycled. Default is `tolerance = 0` -#' -#' @param threshold -#' - For `pickPeaks()`: a `double(1)` defining the proportion of the maximal -#' peak intensity. Just values above are used for the weighted mean -#' calculation. -#' - For `replaceIntensitiesBelow()`: a `numeric(1)` defining the threshold -#' or a `function` to calculate the threshold for each spectrum on its -#' intensity values. Defaults to `threshold = min`. -#' - For `filterFourierTransformArtefacts()`: the relative intensity (to a -#' peak) below which peaks are considered fourier artefacts. Defaults to -#' `threshold = 0.2` hence removing peaks that have an intensity below 0.2 -#' times the intensity of the tested peak (within the selected -#' `halfWindowSize`). -#' -#' @param use.names For `lengths()`: ignored. -#' -#' @param value replacement value for `<-` methods. See individual -#' method description or expected data type. -#' -#' @param values for `filterValues()`: A `numeric` vector that define the -#' values to filter the Spectra data. These values need to be in the same -#' order as the `spectraVariables` parameter. -#' -#' @param weighted For `combinePeaks()`: `logical(1)` whether m/z values of -#' peaks within each peak group should be aggregated into a single m/z -#' value using an intensity-weighted mean. 
Defaults to `weighted = TRUE`. -#' -#' @param which for `containsMz()`: either `"any"` or `"all"` defining whether -#' any (the default) or all provided `mz` have to be present in the -#' spectrum. -#' -#' @param x A `Spectra` object. -#' -#' @param y A `Spectra` object. -#' - For `joinSpectraData()`: a `DataFrame`. -#' - For `cbind2()` a `data.frame`, `DataFrame` or `matrix`. -#' -#' @param z For `filterPrecursorCharge()`: `integer()` with the precursor -#' charges to be used as filter. -#' -#' @param zero.rm `logical`. For `bin()`: indicating whether to remove bins -#' with zero intensity. Defaults to `TRUE`, meaning the function will -#' discard bins created with an intensity of 0 to enhance memory efficiency. -#' -#' @param ... Additional arguments. -#' -#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto -#' -#' @md -#' -#' @exportClass Spectra -#' -#' @exportMethod Spectra -#' -#' @examples -#' -#' ## Create a Spectra providing a `DataFrame` containing the spectrum data. -#' -#' spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) -#' spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) -#' spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) -#' -#' data <- Spectra(spd) -#' data -#' -#' ## Get the number of spectra -#' length(data) -#' -#' ## Get the number of peaks per spectrum -#' lengths(data) -#' -#' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk -#' ## backend. -#' sciex_file <- dir(system.file("sciex", package = "msdata"), -#' full.names = TRUE) -#' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) -#' sciex -#' -#' ## The MS data is on disk and will be read into memory on-demand. We can -#' ## however change the backend to a MsBackendMemory backend which will -#' ## keep all of the data in memory. 
-#' sciex_im <- setBackend(sciex, MsBackendMemory()) -#' sciex_im -#' -#' ## The `MsBackendMemory()` supports the `setBackend()` method: -#' supportsSetBackend(MsBackendMemory()) -#' -#' ## Thus, it is possible to change to that backend with `setBackend()`. Most -#' ## read-only backends however don't support that, such as the -#' ## `MsBackendMzR` and `setBackend()` would fail to change to that backend. -#' supportsSetBackend(MsBackendMzR()) -#' -#' ## The on-disk object `sciex` is light-weight, because it does not keep the -#' ## MS peak data in memory. The `sciex_im` object in contrast keeps all the -#' ## data in memory and its size is thus much larger. -#' object.size(sciex) -#' object.size(sciex_im) -#' -#' ## The spectra variable `dataStorage` returns for each spectrum the location -#' ## where the data is stored. For in-memory objects: -#' head(dataStorage(sciex_im)) -#' -#' ## While objects that use an on-disk backend will list the files where the -#' ## data is stored. -#' head(dataStorage(sciex)) -#' -#' ## The spectra variable `dataOrigin` returns for each spectrum the *origin* -#' ## of the data. If the data is read from e.g. mzML files, this will be the -#' ## original mzML file name: -#' head(dataOrigin(sciex)) -#' head(dataOrigin(sciex_im)) -#' -#' -#' ## ---- ACCESSING AND ADDING DATA ---- -#' -#' ## Get the MS level for each spectrum. -#' msLevel(data) -#' -#' ## Alternatively, we could also use $ to access a specific spectra variable. -#' ## This could also be used to add additional spectra variables to the -#' ## object (see further below). -#' data$msLevel -#' -#' ## Get the intensity and m/z values. -#' intensity(data) -#' mz(data) +#' ## Get the values for the new spectra variable +#' s$spectrum_id #' -#' ## Determine whether one of the spectra has a specific m/z value -#' containsMz(data, mz = 120.4) +#' ## Extract specific spectra variables. 
+#' spectraData(s, columns = c("spectrum_id", "msLevel")) #' -#' ## Accessing spectra variables works for all backends: -#' intensity(sciex) -#' intensity(sciex_im) #' -#' ## Get the m/z for the first spectrum. -#' mz(data)[[1]] +#' ## -------- PEAKS VARIABLES AND DATA -------- #' #' ## Get the peak data (m/z and intensity values). -#' pks <- peaksData(data) +#' pks <- peaksData(s) #' pks #' pks[[1]] #' pks[[2]] #' #' ## Note that we could get the same resulb by coercing the `Spectra` to #' ## a `list` or `SimpleList`: -#' as(data, "list") -#' as(data, "SimpleList") -#' -#' ## List all available spectra variables (i.e. spectrum data and metadata). -#' spectraVariables(data) -#' -#' ## For all *core* spectrum variables accessor functions are available. These -#' ## return NA if the variable was not set. -#' centroided(data) -#' dataStorage(data) -#' rtime(data) -#' precursorMz(data) -#' -#' ## The core spectra variables are: -#' coreSpectraVariables() -#' -#' ## Add an additional metadata column. -#' data$spectrum_id <- c("sp_1", "sp_2") -#' -#' ## List spectra variables, "spectrum_id" is now also listed -#' spectraVariables(data) -#' -#' ## Get the values for the new spectra variable -#' data$spectrum_id -#' -#' ## Extract specific spectra variables. -#' spectraData(data, columns = c("spectrum_id", "msLevel")) -#' -#' ## Drop spectra variable data and/or columns. -#' res <- selectSpectraVariables(data, c("mz", "intensity")) -#' -#' ## This removed the additional columns "spectrum_id" and deleted all values -#' ## for all spectra variables, except "mz" and "intensity". -#' spectraData(res) -#' -#' ## Compared to the data before selectSpectraVariables. -#' spectraData(data) +#' as(s, "list") +#' as(s, "SimpleList") #' +#' ## Or use `mz()` and `intensity()` to extract the m/z and intensity values +#' ## separately +#' mz(s) +#' intensity(s) #' -#' ## ---- SUBSETTING, FILTERING AND COMBINING -#' -#' ## Subset to all MS2 spectra. 
-#' data[msLevel(data) == 2] -#' -#' ## Append new `spectraVariables` to the `spectraData` -#' df <- data.frame(cola = 4:5, colb = "b") -#' data_append <- cbind2(data, df) -#' -#' ## Same with the filterMsLevel function -#' filterMsLevel(data, 2) -#' -#' ## Below we combine the `data` and `sciex_im` objects into a single one. -#' data_comb <- c(data, sciex_im) -#' -#' ## The combined Spectra contains a union of all spectra variables: -#' head(data_comb$spectrum_id) -#' head(data_comb$rtime) -#' head(data_comb$dataStorage) -#' head(data_comb$dataOrigin) -#' -#' ## Filter a Spectra for a target precursor m/z with a tolerance of 10ppm -#' spd$precursorMz <- c(323.4, 543.2302) -#' data_filt <- Spectra(spd) -#' filterPrecursorMzRange(data_filt, mz = 543.23 + ppm(c(-543.23, 543.23), 10)) -#' -#' ## Filter a Spectra keeping only peaks matching certain m/z values -#' sps_sub <- filterMzValues(data, mz = c(103, 104), tolerance = 0.3) -#' mz(sps_sub) +#' ## Some `MsBackend` classes provide support for arbitrary peaks variables +#' ## (in addition to the mandatory `"mz"` and `"intensity"` values). Below +#' ## we create a simple data frame with an additional peak variable `"pk_ann"` +#' ## and create a `Spectra` with a `MsBackendMemory` for that data. +#' ## Importantly the number of values (per spectrum) need to be the same +#' ## for all peak variables. #' -#' ## This function can also be used to remove specific peaks from a spectrum -#' ## by setting `keep = FALSE`. -#' sps_sub <- filterMzValues(data, mz = c(103, 104), -#' tolerance = 0.3, keep = FALSE) -#' mz(sps_sub) +#' tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5)) +#' tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1)) +#' tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45)) +#' tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P")) #' -#' ## Note that `filterMzValues()` keeps or removes all peaks with a matching -#' ## m/z given the provided `ppm` and `tolerance` parameters. 
+#' ## Create the Spectra. With parameter `peaksVariables` we can define +#' ## the columns in `tmp` that contain peaks variables. +#' sps <- Spectra(tmp, source = MsBackendMemory(), +#' peaksVariables = c("mz", "intensity", "pk_ann")) +#' peaksVariables(sps) #' -#' ## Filter a Spectra keeping only peaks within a m/z range -#' sps_sub <- filterMzRange(data, mz = c(100, 300)) -#' mz(sps_sub) +#' ## Extract just the m/z and intensity values +#' peaksData(sps)[[1L]] #' -#' ## Remove empty spectra variables -#' sciex_noNA <- dropNaSpectraVariables(sciex) +#' ## Extract the full peaks data +#' peaksData(sps, columns = peaksVariables(sps))[[1L]] #' -#' ## Available spectra variables before and after `dropNaSpectraVariables()` -#' spectraVariables(sciex) -#' spectraVariables(sciex_noNA) +#' ## Access just the pk_ann variable +#' sps$pk_ann #' #' -#' ## Adding new spectra variables -#' sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) -#' spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging -#' var1 = rnorm(10), -#' var2 = sample(letters, 10)) -#' spv +NULL + +#' @importFrom methods setAs +setAs("Spectra", "list", function(from, to) { + .peaksapply(from) +}) + +setAs("Spectra", "SimpleList", function(from, to) { + peaksData(from) +}) + +#' @export #' -#' sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") +#' @rdname spectraData +asDataFrame <- function(object, i = seq_along(object), + spectraVars = spectraVariables(object)) { + stopifnot(inherits(object, "Spectra")) + object <- object[i] + n <- sapply(peaksData(object), nrow) + v <- spectraData(object)[rep(seq_along(object), n), spectraVars] + p <- do.call(rbind, as.list(peaksData(object))) + cbind(p, v) +} + +#' @rdname spectraData #' -#' spectraVariables(sciex2) -#' spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] +#' @export +setMethod("acquisitionNum", "Spectra", function(object) + acquisitionNum(object@backend)) + +#' @rdname spectraData +setMethod("centroided", 
"Spectra", function(object) { + centroided(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("centroided", "Spectra", function(object, value) { + centroided(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("collisionEnergy", "Spectra", function(object) { + collisionEnergy(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("collisionEnergy", "Spectra", function(object, value) { + collisionEnergy(object@backend) <- value + object +}) + +#' @rdname spectraData #' -#' ## Removing fourier transform artefacts seen in Orbitra data. +#' @export +coreSpectraVariables <- function() .SPECTRA_DATA_COLUMNS + +#' @rdname spectraData +setMethod("dataOrigin", "Spectra", function(object) dataOrigin(object@backend)) + +#' @rdname spectraData +setReplaceMethod("dataOrigin", "Spectra", function(object, value) { + dataOrigin(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("dataStorage", "Spectra", + function(object) dataStorage(object@backend)) + +#' @rdname spectraData +setMethod("intensity", "Spectra", function(object, + f = processingChunkFactor(object), + ...) { + if (length(object@processingQueue) || length(f)) + NumericList(.peaksapply(object, FUN = function(z, ...) z[, 2], + f = f, ...), compress = FALSE) + else intensity(object@backend) +}) + +#' @rdname spectraData +setMethod("ionCount", "Spectra", function(object) { + if (length(object)) + unlist(.peaksapply( + object, FUN = function(pks, ...) sum(pks[, 2], na.rm = TRUE)), + use.names = FALSE) + else numeric() +}) + +#' @rdname spectraData +setMethod("isCentroided", "Spectra", function(object, ...) { + if (length(object)) + unlist(.peaksapply(object, FUN = .peaks_is_centroided), + use.names = FALSE) + else logical() +}) + +#' @rdname spectraData +setMethod("isEmpty", "Spectra", function(x) { + if (length(x)) + unlist(.peaksapply(x, FUN = function(pks, ...) 
nrow(pks) == 0), + use.names = FALSE) + else logical() +}) + +#' @rdname spectraData +setMethod("isolationWindowLowerMz", "Spectra", function(object) { + isolationWindowLowerMz(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("isolationWindowLowerMz", "Spectra", function(object, value) { + isolationWindowLowerMz(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("isolationWindowTargetMz", "Spectra", function(object) { + isolationWindowTargetMz(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("isolationWindowTargetMz", "Spectra", function(object, value) { + isolationWindowTargetMz(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("isolationWindowUpperMz", "Spectra", function(object) { + isolationWindowUpperMz(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("isolationWindowUpperMz", "Spectra", function(object, value) { + isolationWindowUpperMz(object@backend) <- value + object +}) + +#' @rdname spectraData +#' +#' @exportMethod length +setMethod("length", "Spectra", function(x) length(x@backend)) + +#' @rdname spectraData +#' +#' @exportMethod lengths +setMethod("lengths", "Spectra", function(x, use.names = FALSE) { + f <- .parallel_processing_factor(x) + if (length(x)) { + if (length(x@processingQueue) || length(f)) + unlist(.peaksapply(x, FUN = function(pks, ...) nrow(pks)), + use.names = use.names) + else lengths(x@backend, use.names = use.names) + } else integer() +}) + +#' @rdname spectraData +setMethod("msLevel", "Spectra", function(object) msLevel(object@backend)) + +#' @rdname spectraData +setMethod("mz", "Spectra", function(object, f = processingChunkFactor(object), + ...) { + if (length(object@processingQueue) || length(f)) + NumericList(.peaksapply(object, FUN = function(z, ...) 
z[, 1], + f = f, ...), compress = FALSE) + else mz(object@backend) +}) + +#' @rdname spectraData +#' +#' @export +setMethod( + "peaksData", "Spectra", + function(object, columns = c("mz", "intensity"), + f = processingChunkFactor(object), ..., BPPARAM = bpparam()) { + if (length(object@processingQueue) || length(f)) + SimpleList(.peaksapply(object, columns = columns, f = f)) + else SimpleList(peaksData(object@backend, columns = columns)) + }) + +#' @rdname spectraData +setMethod("peaksVariables", "Spectra", function(object) + peaksVariables(object@backend)) + +#' @rdname spectraData +setMethod("polarity", "Spectra", function(object) { + polarity(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("polarity", "Spectra", function(object, value) { + polarity(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("precScanNum", "Spectra", function(object) { + precScanNum(object@backend) +}) + +#' @rdname spectraData +setMethod("precursorCharge", "Spectra", function(object) { + precursorCharge(object@backend) +}) + +#' @rdname spectraData +setMethod("precursorIntensity", "Spectra", function(object) { + precursorIntensity(object@backend) +}) + +#' @rdname spectraData +setMethod("precursorMz", "Spectra", function(object) { + precursorMz(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("precursorMz", "Spectra", function(object, ..., value) { + precursorMz(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("rtime", "Spectra", function(object) { + rtime(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("rtime", "Spectra", function(object, value) { + rtime(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("scanIndex", "Spectra", function(object) { + scanIndex(object@backend) +}) + +#' @rdname spectraData +setMethod("smoothed", "Spectra", function(object) { + smoothed(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("smoothed", "Spectra", 
function(object, value) { + smoothed(object@backend) <- value + object +}) + +#' @rdname spectraData +#' +#' @importMethodsFrom ProtGenerics spectraData +#' +#' @exportMethod spectraData +setMethod( + "spectraData", "Spectra", + function(object, columns = spectraVariables(object)) { + if (length(object@processingQueue) && + length(pcns <- intersect(columns, peaksVariables(object)))) { + ## If user requests peaks variables we need to ensure that the + ## processing queue is executed. + scns <- setdiff(columns, pcns) + if (length(scns)) + spd <- spectraData(object@backend, columns = scns) + else + spd <- make_zero_col_DFrame(nrow = length(object)) + pkd <- peaksData(object, columns = pcns) + ## Add individual peaks variables to the `DataFrame`. + for (pcn in pcns) { + vals <- lapply(pkd, `[`, , pcn) + if (pcn %in% c("mz", "intensity")) + vals <- NumericList(vals, compress = FALSE) + spd <- do.call(`[[<-`, list(spd, i = pcn, value = vals)) + } + spd + } else + spectraData(object@backend, columns = columns) + }) + +#' @rdname spectraData +#' +#' @importMethodsFrom ProtGenerics spectraData<- +#' +#' @exportMethod spectraData<- +setReplaceMethod("spectraData", "Spectra", function(object, value) { + if (!inherits(value, "DataFrame")) + stop("'spectraData<-' expects a 'DataFrame' as input.", call. = FALSE) + pvs <- peaksVariables(object) + if (length(object@processingQueue) && + any(colnames(value) %in% pvs)) + stop("Can not replace peaks variables with a non-empty processing ", + "queue. Please use 'object <- applyProcessing(object)' to apply ", + "and clear the processing queue. Note that 'applyProcessing' ", + "requires a *writeable* backend. Use e.g. 
'object <- ", + "setBackend(object, MsBackendMemory())' if needed.") + pvs <- setdiff(pvs, colnames(value)) + if (length(pvs)) { + sd <- spectraData(object, pvs) + for (pv in pvs) { + value <- do.call("$<-", list(value, name = pv, sd[, pv])) + } + object@processingQueue <- list() + } + spectraData(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("spectraNames", "Spectra", function(object) { + spectraNames(object@backend) +}) + +#' @rdname spectraData +setReplaceMethod("spectraNames", "Spectra", function(object, value) { + spectraNames(object@backend) <- value + object +}) + +#' @rdname spectraData +setMethod("spectraVariables", "Spectra", function(object) { + setdiff(spectraVariables(object@backend), peaksVariables(object@backend)) +}) + +#' @rdname spectraData +setMethod("tic", "Spectra", function(object, initial = TRUE) { + if (!length(object)) + return(numeric()) + if (initial) + tic(object@backend, initial = initial) + else ionCount(object) +}) + +#' @rdname spectraData +setMethod("uniqueMsLevels", "Spectra", function(object, ...) { + uniqueMsLevels(object@backend, ...) +}) + +#' @rdname spectraData +#' +#' @importMethodsFrom S4Vectors $ +#' +#' @export +setMethod("$", "Spectra", function(x, name) { + if (!(name %in% c(spectraVariables(x@backend), peaksVariables(x@backend)))) + stop("No spectra variable '", name, "' available") + if (name == "mz") + mz(x) + else if (name == "intensity") + intensity(x) + else { + if (length(x@processingQueue) && name %in% peaksVariables(x)) + .peaksapply(x, FUN = function(z, ...) z[, name], + columns = c("mz", "intensity", name)) + else + do.call("$", list(x@backend, name)) + } +}) + +#' @rdname spectraData +#' +#' @export +setReplaceMethod("$", "Spectra", function(x, name, value) { + if (length(x@processingQueue) && + any(name %in% peaksVariables(x))) + stop("Can not replace peaks variables with a non-empty processing ", + "queue. 
Please use 'object <- applyProcessing(object)' to apply ", + "and clear the processing queue. Note that 'applyProcessing' ", + "requires a *writeable* backend. Use e.g. 'object <- ", + "setBackend(object, MsBackendMemory())' if needed.") + x@backend <- do.call("$<-", list(x@backend, name, value)) + x +}) + +#' @rdname spectraData +#' +#' @export +setMethod("[[", "Spectra", function(x, i, j, ...) { + if (!is.character(i)) + stop("'i' is supposed to be a character defining the spectra ", + "variable to access.") + if (!missing(j)) + stop("'j' is not supported.") + if (!(i %in% c(spectraVariables(x), "mz", "intensity"))) + stop("No spectra variable '", i, "' available") + if (i == "mz") + mz(x) + else if (i == "intensity") + intensity(x) + else + do.call("[[", list(x@backend, i)) +}) + +#' @rdname spectraData +#' +#' @export +setReplaceMethod("[[", "Spectra", function(x, i, j, ..., value) { + if (!is.character(i)) + stop("'i' is supposed to be a character defining the spectra ", + "variable to replace or create.") + if (!missing(j)) + stop("'j' is not supported.") + x@backend <- do.call("[[<-", list(x@backend, i = i, value = value)) + x +}) + + +################################################################################ +## +## Merging, splitting and aggregating Spectra: length of Spectra is changed +## +################################################################################ + +#' @title Merging, aggregating and splitting Spectra +#' +#' @name combineSpectra +#' +#' @aliases combineSpectra +#' @aliases split +#' @aliases joinSpectraData +#' +#' @description +#' +#' Various functions are available to combine, aggregate or split data from one +#' or more `Spectra` objects. These are: +#' +#' - `c()` and `concatenateSpectra()`: combines several `Spectra` objects into +#' a single object. The resulting `Spectra` contains all data from all +#' individual `Spectra`, i.e. the union of all their spectra variables. 
+#' Concatenation will fail if the processing queue of any of the `Spectra` +#' objects is not empty or if different backends are used for the `Spectra` +#' objects. In such cases it is suggested to first change the backends of +#' all `Spectra` to the same type of backend (using the [setBackend()] +#' function) and to eventually (if needed) apply the processing queue using +#' the [applyProcessing()] function. +#' +#' - `combineSpectra()`: combines sets of spectra (defined with parameter `f`) +#' into a single spectrum per set aggregating their MS data (i.e. their +#' *peaks data* matrices with the *m/z* and intensity values of their +#' mass peaks). The spectra variable values of the first spectrum per set +#' are reported for the combined spectrum. The peak matrices of the spectra +#' per set are combined using the function specified with parameter `FUN` +#' which uses by default the [combinePeaksData()] function. See the +#' documentation of [combinePeaksData()] for details on the aggregation of +#' the peak data and the package vignette for examples. +#' The sets of spectra can be specified with parameter `f` which is expected +#' to be a `factor` or `vector` of length equal to the length of the +#' `Spectra` specifying to which set a spectrum belongs to. The function +#' returns a `Spectra` of length equal to the unique levels of `f`. The +#' optional parameter `p` allows to define how the `Spectra` should be +#' split for potential parallel processing. The default is +#' `p = x$dataStorage` and hence a per storage file parallel processing is +#' applied for `Spectra` with on disk data representations (such as the +#' [MsBackendMzR()]). This also prevents that spectra from different data +#' files/samples are combined (eventually use e.g. `p = x$dataOrigin` or any +#' other spectra variables defining the originating samples for a spectrum). 
+#' Before combining the peaks data, all eventual present processing steps are +#' applied (by calling [applyProcessing()] on the `Spectra`). This function +#' will replace the original *m/z* and intensity values of a `Spectra` hence +#' it can not be called on a `Spectra` with a *read-only* backend. In such +#' cases, the backend should be changed to a *writeable* backend before +#' using the [setBackend()] function (to e.g. a [MsBackendMemory()] backend). +#' +#' - `joinSpectraData()`: Individual spectra variables can be directly +#' added with the `$<-` or `[[<-` syntax. The `joinSpectraData()` +#' function allows to merge a `DataFrame` to the existing spectra +#' data of a `Spectra`. This function diverges from the [merge()] method in +#' two main ways: +#' - The `by.x` and `by.y` column names must be of length 1. +#' - If variable names are shared in `x` and `y`, the spectra +#' variables of `x` are not modified. It's only the `y` +#' variables that are appended with the suffix defined in +#' `suffix.y`. This is to avoid modifying any core spectra +#' variables that would lead to an invalid object. +#' - Duplicated Spectra keys (i.e. `x[[by.x]]`) are not +#' allowed. Duplicated keys in the `DataFrame` (i.e `y[[by.y]]`) +#' throw a warning and only the last occurrence is kept. These +#' should be explored and ideally be removed using for +#' `QFeatures::reduceDataFrame()`, `PMS::reducePSMs()` or similar +#' functions. +#' +#' - `split()`: splits the `Spectra` object based on parameter `f` into a `list` +#' of `Spectra` objects. +#' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. +#' +#' @param by.x A `character(1)` specifying the spectra variable used +#' for merging. Default is `"spectrumId"`. +#' +#' @param by.y A `character(1)` specifying the column used for +#' merging. Set to `by.x` if missing. 
+#' +#' @param drop For `split()`: not considered. +#' +#' @param f For `split()`: factor defining how to split `x`. See [base::split()] +#' for details. +#' For `combineSpectra()`: `factor` defining the grouping of the spectra +#' that should be combined. Defaults to `x$dataStorage`. +#' +#' @param FUN For `combineSpectra()`: function to combine the (peak matrices) +#' of the spectra. Defaults to [combinePeaksData()]. +#' +#' @param p For `combineSpectra()`: `factor` defining how to split the input +#' `Spectra` for parallel processing. Defaults to `x$dataStorage`, i.e., +#' depending on the used backend, per-file parallel processing will be +#' performed. +#' +#' @param suffix.y A `character(1)` specifying the suffix to be used +#' for making the names of columns in the merged spectra variables +#' unique. This suffix will be used to amend `names(y)`, while +#' `spectraVariables(x)` will remain unchanged. +#' +#' @param x A `Spectra` object. +#' +#' @param y A `DataFrame` with the spectra variables to join/add. +#' +#' @param ... Additional arguments. +#' +#' @seealso +#' +#' - [combinePeaks()] for functions to aggregate mass peaks data. +#' +#' - [Spectra] for a general description of the `Spectra` object. +#' +#' @importFrom MsCoreUtils vapply1c +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' +#' @examples +#' +#' ## Create a Spectra providing a `DataFrame` containing a MS data. +#' +#' spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +#' spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +#' spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) +#' +#' s <- Spectra(spd) +#' s +#' +#' ## Create a second Spectra from mzML files and use the `MsBackendMzR` +#' ## on-disk backend. 
+#' sciex_file <- dir(system.file("sciex", package = "msdata"), +#' full.names = TRUE) +#' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +#' sciex +#' +#' ## Subset to the first 100 spectra to reduce running time of the examples +#' sciex <- sciex[1:100] +#' +#' +#' ## -------- COMBINE SPECTRA -------- +#' +#' ## Combining the `Spectra` object `s` with the MS data from `sciex`. +#' ## Calling directly `c(s, sciex)` would result in an error because +#' ## both objects use a different backend. We thus have to first change +#' ## the backends to the same backend. We change the backend of the `sciex` +#' ## `Spectra` to a `MsBackendMemory`, the backend used by `s`. +#' +#' sciex <- setBackend(sciex, MsBackendMemory()) +#' +#' ## Combine the two `Spectra` +#' all <- c(s, sciex) +#' all +#' +#' ## The new `Spectra` object contains the union of spectra variables from +#' ## both: +#' spectraVariables(all) +#' +#' ## The spectra variables that were not present in `s`: +#' setdiff(spectraVariables(all), spectraVariables(s)) +#' +#' ## The values for these were filled with missing values for spectra from +#' ## `s`: +#' all$peaksCount |> head() +#' +#' +#' ## -------- AGGREGATE SPECTRA -------- +#' +#' ## Sets of spectra can be combined into a single, representative spectrum +#' ## per set using `combineSpectra()`. This aggregates the peaks data (i.e. +#' ## the spectra's m/z and intensity values) while using the values for all +#' ## spectra variables from the first spectrum per set. Below we define the +#' ## sets as all spectra measured in the *same second*, i.e. rounding their +#' ## retention time to the next closer integer value. 
+#' f <- round(rtime(sciex))
+#' head(f)
+#'
+#' cmp <- combineSpectra(sciex, f = f)
+#'
+#' ## The length of `cmp` is now equal to the length of unique levels in `f`:
+#' length(cmp)
+#'
+#' ## The spectra variable value from the first spectrum per set is used in
+#' ## the representative/combined spectrum:
+#' cmp$rtime
+#'
+#' ## The peaks data was aggregated: the number of mass peaks of the first six
+#' ## spectra from the original `Spectra`:
+#' lengths(sciex) |> head()
+#'
+#' ## and for the first aggregated spectra:
+#' lengths(cmp) |> head()
+#'
+#' ## The default peaks data aggregation method joins all mass peaks. See
+#' ## documentation of the `combinePeaksData()` function for more options.
+#'
+#'
+#' ## -------- SPLITTING DATA --------
+#'
+#' ## A `Spectra` can be split into a `list` of `Spectra` objects using the
+#' ## `split()` function defining the sets into which the `Spectra` should
+#' ## be split with parameter `f`.
+#' sciex_split <- split(sciex, f)
+#'
+#' length(sciex_split)
+#' sciex_split |> head()
+#'
+#'
+#' ## -------- ADDING SPECTRA DATA --------
+#'
+#' ## Adding new spectra variables
+#' sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1])
+#' spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging
+#'                  var1 = rnorm(10),
+#'                  var2 = sample(letters, 10))
+#' spv
+#'
+#' sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId")
+#'
+#' spectraVariables(sciex2)
+#' spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")]
+NULL
+
+#' @rdname combineSpectra
+#'
+#' @exportMethod c
+setMethod("c", "Spectra", function(x, ...) {
+    .concatenate_spectra(unname(list(unname(x), ...)))
+})
+
+#' @rdname combineSpectra
+setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) {
+    bcknds <- split(x@backend, f, ...)
+    lapply(bcknds, function(b) {
+        slot(x, "backend", check = FALSE) <- b
+        x
+    })
+})
+
+
+################################################################################
+##
+## Aggregating peaks data
+##
+################################################################################
+
+#' @title Aggregating and combining mass peaks data
+#'
+#' @name combinePeaks
+#'
+#' @description
+#'
+#' In addition to aggregating content of spectra variables (described in
+#' [combineSpectra()]) it is also possible to aggregate and combine mass peaks
+#' data from individual spectra within a `Spectra`. The `combinePeaks()`
+#' function combines mass peaks **within each spectrum** with a difference in
+#' their m/z values that is smaller than the maximal acceptable difference
+#' defined by `ppm` and `tolerance`. Parameters `intensityFun` and `mzFun`
+#' allow to define functions to aggregate the intensity and m/z values for
+#' each such group of peaks. With `weighted = TRUE` (the default), the m/z
+#' value of the combined peak is calculated using an intensity-weighted mean
+#' and parameter `mzFun` is ignored. The [MsCoreUtils::group()] function is
+#' used for the grouping of mass peaks. Parameter `msLevel.` allows to define
+#' selected MS levels for which peaks should be combined. This function
+#' returns a `Spectra` with the same number of spectra as the input object,
+#' but with possibly combined peaks within each spectrum.
+#' Additional peak variables (other than `"mz"` and `"intensity"`) are
+#' dropped (i.e. their values are replaced with `NA`) for combined peaks
+#' unless they are constant across the combined peaks. See also
+#' [reduceSpectra()] for a function to select a single *representative*
+#' mass peak for each peak group.
+#'
+#' @param intensityFun Function to aggregate intensities for all peaks in
+#'     each peak group into a single intensity value.
+#'
+#' @param msLevel. `integer` defining the MS level(s) of the spectra to which
+#'     the function should be applied (defaults to all MS levels of `object`).
+#'
+#' @param mzFun Function to aggregate m/z values for all mass peaks within
+#'     each peak group into a single m/z value. This parameter is ignored if
+#'     `weighted = TRUE` (the default).
+#'
+#' @param object A `Spectra` object.
+#'
+#' @param ppm `numeric(1)` defining a relative, m/z-dependent, maximal
+#'     accepted difference between m/z values for peaks to be grouped. Default
+#'     is `ppm = 20`.
+#'
+#' @param tolerance `numeric(1)` allowing to define a constant maximal
+#'     accepted difference between m/z values for peaks to be grouped. Default
+#'     is `tolerance = 0`.
+#'
+#' @param weighted `logical(1)` whether m/z values of peaks within each peak
+#'     group should be aggregated into a single m/z value using an
+#'     intensity-weighted mean. Defaults to `weighted = TRUE`.
+#'
+#' @param ... ignored.
+#'
+#' @md
+#'
+#' @seealso
+#'
+#' - [combineSpectra()] for functions to combine or aggregate `Spectra`'s
+#'   spectra data.
+#'
+#' - [combinePeaksData()] for the function to combine the mass peaks data.
+#'
+#' - [reduceSpectra()] and similar functions to filter mass peaks data.
+#'
+#' - [Spectra] for a general description of the `Spectra` object.
+#'
+#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto
+#'
+#' @examples
+#'
+#' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk
+#' ## backend.
+#' sciex_file <- dir(system.file("sciex", package = "msdata"),
+#'                   full.names = TRUE)
+#' sciex <- Spectra(sciex_file, backend = MsBackendMzR())
+#'
+#' ## Combine mass peaks per spectrum with a difference in their m/z value
+#' ## that is smaller than 20 ppm.
The intensity values of such peaks are +#' ## combined by summing their values, while for the m/z values the median +#' ## is reported +#' sciex_comb <- combinePeaks(sciex, ppm = 20, +#' intensityFun = sum, mzFun = median) +#' +#' ## Comparing the number of mass peaks before and after aggregation +#' lengths(sciex) |> head() +#' lengths(sciex_comb) |> head() +#' +#' ## Plotting the first spectrum before and after aggregation +#' par(mfrow = c(1, 2)) +#' plotSpectra(sciex[2L]) +#' plotSpectra(sciex_comb[2L]) +#' +#' ## Using `reduceSpectra()` to keep for each group of mass peaks with a +#' ## difference in their m/z values < 20ppm the one with the highest intensity. +#' sciex_red <- reduceSpectra(sciex, ppm = 20) +#' +#' ## Comparing the number of mass peaks before and after the operation +#' lengths(sciex) |> head() +#' lengths(sciex_red) |> head() +NULL + +#' @rdname hidden_aliases +setMethod("combinePeaks", "list", function(object, ...) { + .Deprecated("combinePeaksData", old = "combinePeaks", + msg = paste0("'combinePeaks' for lists of peak matrices is ", + "deprecated; please use 'combinePeaksData' ", + "instead.")) + combinePeaksData(object, ...) +}) + +#' @rdname combinePeaks +#' +#' @exportMethod combinePeaks +setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, + intensityFun = base::mean, + mzFun = base::mean, + weighted = TRUE, + msLevel. = uniqueMsLevels(object), + ...) { + object <- addProcessing( + object, .peaks_combine, ppm = ppm, tolerance = tolerance, + intensityFun = intensityFun, mzFun = mzFun, weighted = weighted, + msLevel = force(msLevel.), spectraVariables = "msLevel") + object@processing <- .logging( + object@processing, "Combining peaks within each spectrum with ppm = ", + ppm, " and tolerance = ", tolerance, ".") + object +}) + + +################################################################################ +## +## Filtering, subsetting Spectra: subsetting Spectra and its data content. 
+## +################################################################################ + +#' @title Filter and subset Spectra objects +#' +#' @name filterMsLevel +#' +#' @aliases [,Spectra-method +#' @aliases filterAcquisitionNum +#' @aliases filterDataOrigin +#' @aliases filterDataStorage +#' @aliases filterEmptySpectra +#' @aliases filterIsolationWindow +#' @aliases filterMsLevel +#' @aliases filterPolarity +#' @aliases filterPrecursorCharge +#' @aliases filterPrecursorIsotopes +#' @aliases filterPrecursorMzRange +#' @aliases filterPrecursorMzValues +#' @aliases filterPrecursorScan +#' @aliases filterRanges +#' @aliases filterRt +#' @aliases filterValues +#' @aliases dropNaSpectraVariables +#' @aliases selectSpectraVariables +#' @aliases filterIntensity +#' @aliases filterMzRange +#' @aliases filterMzValues +#' @aliases reduceSpectra +#' +#' @description +#' +#' A variety of functions to filter or subset `Spectra` objects are available. +#' These can be generally separated into two main classes: I) *classical* +#' subset operations that immediately reduce the number of spectra in the +#' object and II) filters that reduce the **content** of the object without +#' changing its length (i.e. the number of spectra). The latter can be further +#' subdivided into functions that affect the content of the `spectraData` (i.e. +#' the general spectrum metadata) and those that reduce the content of the +#' object's `peaksData` (i.e. the m/z and intensity values of a spectrum's +#' mass peaks). +#' +#' A description of functions from these 3 different categories are given below +#' in sections *Subset `Spectra`*, *Filter content of `spectraData()`* and +#' *Filter content of `peaksData()`*, respectively. +#' +#' +#' @section Subset `Spectra`: +#' +#' These functions affect the number of spectra in a `Spectra` object creating +#' a subset of the original object without affecting its content. +#' +#' - `[`: subsets the spectra keeping only selected elements (`i`). 
The method +#' **always** returns a `Spectra` object. +#' +#' - `cbind2()`: Appends multiple spectra variables from a `data.frame`, +#' `DataFrame` or `matrix` to the `Spectra` object at once. It does so +#' *blindly* (e.g. do not check rownames compatibility) and is therefore at +#' the risk of the user. For a more controlled way of adding spectra +#' variables, the `joinSpectraData()` should be used. It will return a +#' `Spectra` object with the appended spectra variables. `cbind2()` does +#' check however that the number of rows of the `data.frame` or `DataFrame` +#' matches the number of spectra in the `Spectra` object. +#' +#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the +#' monoisotopic peak for groups of isotopologues. Isotopologues are +#' estimated using the [isotopologues()] function from the +#' *MetaboCoreUtils* package. Note that +#' the default parameters for isotope prediction/detection have been +#' determined using data from the Human Metabolome Database (HMDB) and +#' isotopes for elements other than CHNOPS might not be detected. See +#' parameter `substDefinition` in the documentation of [isotopologues()] for +#' more information. The approach and code to define the parameters for +#' isotope prediction is described +#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). +#' +#' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the +#' object's `spectraData` that contain only missing values (`NA`). Note that +#' while columns with only `NA`s are removed, a `spectraData()` call after +#' `dropNaSpectraVariables()` might still show columns containing `NA` values +#' for *core* spectra variables. +#' +#' - `filterAcquisitionNum()`: filters the object keeping only spectra matching +#' the provided acquisition numbers (argument `n`). 
If `dataOrigin` or +#' `dataStorage` is also provided, `object` is subsetted to the spectra with +#' an acquisition number equal to `n` **in spectra with matching dataOrigin +#' or dataStorage values** retaining all other spectra. +#' Returns the filtered `Spectra`. +#' +#' - `filterDataOrigin()`: filters the object retaining spectra matching the +#' provided `dataOrigin`. Parameter `dataOrigin` has to be of type +#' `character` and needs to match exactly the data origin value of the +#' spectra to subset. +#' Returns the filtered `Spectra` object (with spectra ordered according to +#' the provided `dataOrigin` parameter). +#' +#' - `filterDataStorage()`: filters the object retaining spectra stored in the +#' specified `dataStorage`. Parameter `dataStorage` has to be of type +#' `character` and needs to match exactly the data storage value of the +#' spectra to subset. +#' Returns the filtered `Spectra` object (with spectra ordered according to +#' the provided `dataStorage` parameter). +#' +#' - `filterEmptySpectra()`: removes empty spectra (i.e. spectra without peaks). +#' Returns the filtered `Spectra` object (with spectra in their +#' original order). +#' +#' - `filterIsolationWindow()`: retains spectra that contain `mz` in their +#' isolation window m/z range (i.e. with an `isolationWindowLowerMz` <= `mz` +#' and `isolationWindowUpperMz` >= `mz`. Returns the filtered `Spectra` +#' object (with spectra in their original order). +#' +#' - `filterMsLevel()`: filters object by MS level keeping only spectra matching +#' the MS level specified with argument `msLevel`. Returns the filtered +#' `Spectra` (with spectra in their original order). +#' +#' - `filterPolarity()`: filters the object keeping only spectra matching the +#' provided polarity. Returns the filtered `Spectra` (with spectra in their +#' original order). +#' +#' - `filterPrecursorCharge()`: retains spectra with the defined precursor +#' charge(s). 
+#'
+#' - `filterPrecursorIsotopes()`: groups MS2 spectra based on their precursor
+#'   m/z and precursor intensity into predicted isotope groups and keeps for
+#'   each only the spectrum representing the monoisotopic precursor. MS1
+#'   spectra are returned as is. See documentation for `deisotopeSpectra()`
+#'   below for details on isotope prediction and parameter description.
+#'
+#' - `filterPrecursorMaxIntensity()`: filters the `Spectra` keeping for groups
+#'   of (MS2) spectra with similar precursor m/z values (given parameters
+#'   `ppm` and `tolerance`) the one with the highest precursor intensity. The
+#'   function filters only MS2 spectra and returns all MS1 spectra. If
+#'   precursor intensities are `NA` for all spectra within a spectra group,
+#'   the first spectrum of that group is returned.
+#'   Note: some manufacturers don't provide precursor intensities. These can
+#'   however also be estimated with [estimatePrecursorIntensity()].
+#'
+#' - `filterPrecursorMzRange()` (previously `filterPrecursorMz()` which is now
+#'   deprecated): retains spectra with a precursor m/z within the
+#'   provided m/z range. See examples for details on selecting spectra with
+#'   a precursor m/z for a target m/z accepting a small difference in *ppm*.
+#'
+#' - `filterPrecursorMzValues()`: retains spectra with precursor m/z matching
+#'   any of the provided m/z values (given `ppm` and `tolerance`). Spectra with
+#'   missing precursor m/z value (e.g. MS1 spectra) are dropped.
+#'
+#' - `filterPrecursorScan()`: retains parent (e.g. MS1) and children scans
+#'   (e.g. MS2) of acquisition number `acquisitionNum`. Returns the filtered
+#'   `Spectra` (with spectra in their original order). Parameter `f` allows to
+#'   define which spectra belong to the same sample or original data file
+#'   (defaults to `f = dataOrigin(object)`).
+#' +#' - `filterRanges()`: allows filtering of the `Spectra` object based on user +#' defined *numeric* ranges (parameter `ranges`) for one or more available +#' spectra variables in object (spectra variable names can be specified with +#' parameter `spectraVariables`). Spectra for which the value of a spectra +#' variable is within it's defined range are retained. If multiple +#' ranges/spectra variables are defined, the `match` parameter can be used +#' to specify whether all conditions (`match = "all"`; the default) or if +#' any of the conditions must match (`match = "any"`; all spectra for which +#' values are within any of the provided ranges are retained). +#' +#' - `filterRt()`: retains spectra of MS level `msLevel` with retention +#' times (in seconds) within (`>=`) `rt[1]` and (`<=`) +#' `rt[2]`. Returns the filtered `Spectra` (with spectra in their +#' original order). +#' +#' - `filterValues()`: allows filtering of the `Spectra` object based on +#' similarities of *numeric* values of one or more `spectraVariables(object)` +#' (parameter `spectraVariables`) to provided values (parameter `values`) +#' given acceptable differences (parameters tolerance and ppm). If multiple +#' values/spectra variables are defined, the `match` parameter can be used +#' to specify whether all conditions (`match = "all"`; the default) or if +#' any of the conditions must match (`match = "any"`; all spectra for which +#' values are within any of the provided ranges are retained). +#' +#' +#' @section Filter content of `spectraData()`: +#' +#' The functions described in this section filter the content from a +#' `Spectra`'s spectra data, i.e. affect values of, or complete, spectra +#' variables. None of these functions reduces the object's number of spectra. +#' +#' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the +#' object's `spectraData` that contain only missing values (`NA`). 
Note that +#' while columns with only `NA`s are removed, a `spectraData()` call after +#' `dropNaSpectraVariables()` might still show columns containing `NA` values +#' for *core* spectra variables. The total number of spectra is not changed +#' by this function. +#' +#' - `selectSpectraVariables()`: reduces the information within the object to +#' the selected spectra variables: all data for variables not specified will +#' be dropped. For mandatory columns (i.e., those listed by +#' [coreSpectraVariables()], such as *msLevel*, *rtime* ...) only +#' the values will be dropped but not the variable itself. Additional (or +#' user defined) spectra variables will be completely removed. +#' Returns the filtered `Spectra`. +#' +#' +#' - `joinSpectraData()`: Individual spectra variables can be directly +#' added with the `$<-` or `[[<-` syntax. The `joinSpectraData()` +#' function allows to merge a `DataFrame` to the existing spectra +#' data. This function diverges from the [merge()] method in two +#' main ways: +#' - The `by.x` and `by.y` column names must be of length 1. +#' - If variable names are shared in `x` and `y`, the spectra +#' variables of `x` are not modified. It's only the `y` +#' variables that are appended the suffix defined in +#' `suffix.y`. This is to avoid modifying any core spectra +#' variables that would lead to an invalid object. +#' - Duplicated Spectra keys (i.e. `x[[by.x]]`) are not +#' allowed. Duplicated keys in the `DataFrame` (i.e `y[[by.y]]`) +#' throw a warning and only the last occurrence is kept. These +#' should be explored and ideally be removed using for +#' `QFeatures::reduceDataFrame()`, `PMS::reducePSMs()` or similar +#' functions. +#' For a more general function that allows to append `data.frame`, +#' `DataFrame` and `matrix` see `cbind2()`. +#' +#' @section Filter content of `peaksData()`: +#' +#' The functions described in this section filter the content of the +#' `Spectra`'s peaks data, i.e. 
either the number or the values (*m/z* or
+#' intensity values) of the mass peaks. Also, the actual operation is only
+#' executed once peaks data is accessed (through `peaksData()`,
+#' `mz()` or `intensity()`) or `applyProcessing()` is called.
+#' These operations don't affect the number of spectra in the `Spectra` object.
+#'
+#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the
+#'   monoisotopic peak for groups of isotopologues. Isotopologues are
+#'   estimated using the [isotopologues()] function from the
+#'   *MetaboCoreUtils* package. Note that
+#'   the default parameters for isotope prediction/detection have been
+#'   determined using data from the Human Metabolome Database (HMDB) and
+#'   isotopes for elements other than CHNOPS might not be detected. See
+#'   parameter `substDefinition` in the documentation of [isotopologues()] for
+#'   more information. The approach and code to define the parameters for
+#'   isotope prediction is described
+#'   [here](https://github.com/EuracBiomedicalResearch/isotopologues).
+#'
+#' - `filterFourierTransformArtefacts()`: removes (Orbitrap) fast fourier
+#'   artefact peaks from spectra (see examples below). The function iterates
+#'   through all intensity ordered peaks in a spectrum and removes all peaks
+#'   with an m/z within +/- `halfWindowSize` of the current peak if their
+#'   intensity is lower than `threshold` times the current peak's intensity.
+#'   Additional parameters `keepIsotopes`, `maxCharge` and `isotopeTolerance`
+#'   allow to avoid removing potential `[13]C` isotope peaks (`maxCharge`
+#'   being the maximum charge that should be considered and `isotopeTolerance`
+#'   the absolute acceptable tolerance for matching their m/z).
+#'   See [filterFourierTransformArtefacts()] for details and background and
+#'   `deisotopeSpectra()` for an alternative.
+#' +#' - `filterIntensity()`: filters mass peaks in each spectrum keeping only +#' those with intensities that are within the provided range or match the +#' criteria of the provided function. For the former, parameter `intensity` +#' has to be a `numeric` defining the intensity range, for the latter a +#' `function` that takes the intensity values of the spectrum and returns +#' a `logical` whether the peak should be retained or not (see examples +#' below for details) - additional parameters to the function can be passed +#' with `...`. +#' To remove only peaks with intensities below a certain threshold, say +#' 100, use `intensity = c(100, Inf)`. Note: also a single value can be +#' passed with the `intensity` parameter in which case an upper limit of +#' `Inf` is used. +#' Note that this function removes also peaks with missing intensities +#' (i.e. an intensity of `NA`). Parameter `msLevel.` allows to restrict the +#' filtering to spectra of the specified MS level(s). +#' +#' - `filterMzRange()`: filters mass peaks in the object keeping or removing +#' those in each spectrum that are within the provided m/z range. Whether +#' peaks are retained or removed can be configured with parameter `keep` +#' (default `keep = TRUE`). +#' +#' - `filterMzValues()`: filters mass peaks in the object keeping all +#' peaks in each spectrum that match the provided m/z value(s) (for +#' `keep = TRUE`, the default) or removing all of them (for `keep = FALSE`). +#' The m/z matching considers also the absolute `tolerance` and m/z-relative +#' `ppm` values. `tolerance` and `ppm` have to be of length 1. +#' +#' - `filterPeaksRanges()`: filters mass peaks of a `Spectra` object using any +#' set of range-based filters on numeric spectra or peaks variables. See +#' [filterPeaksRanges()] for more information. 
+#'
+#' - `filterPrecursorPeaks()`: removes peaks from each spectrum in `object`
+#'   with an m/z equal or larger than the m/z of the precursor, depending on
+#'   the value of parameter `mz`: for `mz = "=="` (the default) peaks with
+#'   matching m/z (considering an absolute and relative acceptable difference
+#'   depending on `tolerance` and `ppm`, respectively) are removed. For
+#'   `mz = ">="` all peaks with an m/z larger or equal to the precursor m/z
+#'   (minus `tolerance` and the `ppm` of the precursor m/z) are removed.
+#'   Parameter `msLevel.` allows to restrict the filter to certain MS levels
+#'   (by default the filter is applied to all MS levels). Note that no peaks
+#'   are removed if the precursor m/z is `NA` (e.g. typically for MS1
+#'   spectra).
+#'
+#' - `reduceSpectra()`: keeps for groups of peaks with similar m/z values
+#'   (given `ppm` and `tolerance`) in each spectrum only the mass peak with
+#'   the highest intensity removing all other peaks hence *reducing* each
+#'   spectrum to the highest intensity peaks per *peak group*.
+#'   Peak groups are defined using the [group()] function from the
+#'   *MsCoreUtils* package. See also the [combinePeaks()] function for an
+#'   alternative function to combine peaks within each spectrum.
 #'
-#' ## Loading an Orbitrap spectrum with artefacts.
-#' data(fft_spectrum)
-#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5))
-#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6))
+#' @param acquisitionNum for `filterPrecursorScan()`: `integer` with the
+#'     acquisition number of the spectra to which the object should be
+#'     subsetted.
 #'
-#' fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum)
-#' fft_spectrum
-#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6))
+#' @param charge For `deisotopeSpectra()`: expected charge of the ionized
+#'     compounds. See [isotopologues()] for details.
#' -#' ## Using a few examples peaks in your data you can optimize the parameters -#' fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, -#' halfWindowSize = 0.2, -#' threshold = 0.005, -#' keepIsotopes = TRUE, -#' maxCharge = 5, -#' isotopeTolerance = 0.005 -#' ) +#' @param dataOrigin For `filterDataOrigin()`: `character` to define which +#' spectra to keep. +#' For `filterAcquisitionNum()`: optionally specify if filtering should +#' occurr only for spectra of selected `dataOrigin`. #' -#' fft_spectrum_filtered -#' length(mz(fft_spectrum_filtered)[[1]]) -#' plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) +#' @param dataStorage For `filterDataStorage()`: `character` to define which +#' spectra to keep. +#' For `filterAcquisitionNum()`: optionally specify if filtering should +#' occur only for spectra of selected `dataStorage`. #' -#' ## Using filterRanges to filter spectra object based on variables available -#' ## in `spectraData`. -#' ## First, determine the variable(s) on which to base the filtering: -#' sv <- c("rtime", "precursorMz", "peaksCount") -#' ## Note that ANY variables can be chosen here, and as many as wanted. +#' @param drop For `[`: not considered. #' -#' ## Define the ranges (pairs of values with lower and upper boundary) to be -#' ## used for the individual spectra variables. The first two values will be -#' ## used for the first spectra variable (e.g., rtime here), the next two for -#' ## the second (e.g. precursorMz here) and so on: -#' ranges <- c(30, 350, 200,500, 350, 600) +#' @param f For `filterPrecursorScan()`: defining which spectra +#' belong to the same original data file (sample): Defaults to +#' `f = dataOrigin(x)`. 
#' -#' ## Input the parameters within the filterRanges function: -#' filt_spectra <- filterRanges(sciex, spectraVariables = sv, -#' ranges = ranges) +#' @param halfWindowSize For `filterFourierTransformArtefacts()`: `numeric(1)` +#' defining the m/z window left and right of a peak where to remove +#' fourier transform artefacts. #' -#' ## Using `filterRanges()` to filter spectra object with multiple ranges for -#' ## the same `spectraVariable` (e.g, here rtime) -#' sv <- c("rtime", "rtime") -#' ranges <- c(30, 100, 200, 300) -#' filt_spectra <- filterRanges(sciex, spectraVariables = sv, -#' ranges = ranges, match = "any") +#' @param i For `[`: `integer`, `logical` or `character` to subset the +#' object. #' -#' ## Using filterValues in a similar way to a filter spectra object based on -#' ## variables available in `spectraData`. However, this time not based on -#' ## ranges but similarities to user input single values with given -#' ## tolerance/ppm -#' ## First determine the variable(s) on which to base the filtering: -#' sv <- c("rtime", "precursorMz") -#' ## Note that ANY variables can be chosen here, and as many as wanted. +#' @param intensity For `filterIntensity()`: `numeric` of length 1 or 2 +#' defining either the lower or the lower and upper intensity limit for the +#' filtering, or a `function` that takes the intensities as input and +#' returns a `logical` (same length then peaks in the spectrum) whether the +#' peak should be retained or not. Defaults to `intensity = c(0, Inf)` thus +#' only peaks with `NA` intensity are removed. #' -#' ## Define the values that will be used to filter the spectra based on their -#' ## similarities to their respective spectraVariables. -#' ## The first values in the parameters values, tolerance and ppm will be -#' ## used for the first spectra variable (e.g. rtime here), the next for the -#' ## second (e.g. 
precursorMz here) and so on: -#' values <- c(350, 400) -#' tolerance <- c(100, 0) -#' ppm <- c(0,50) +#' @param isotopeTolerance For `filterFourierTransformArtefacts()`: the m/z +#' `tolerance` to be used to define whether peaks might be isotopes of +#' the current tested peak. #' -#' ## Input the parameters within the `filterValues()` function: -#' filt_spectra <- filterValues(sciex, spectraVariables = sv, -#' values = values, tolerance = tolerance, ppm = ppm) +#' @param j For `[`: not supported. #' -#' ## ---- DATA MANIPULATIONS AND OTHER OPERATIONS ---- +#' @param keep For `filterMzValues()` and `filterMzRange()`: `logical(1)` +#' whether the matching peaks should be retained (`keep = TRUE`, the +#' default) or dropped (`keep = FALSE`). #' -#' ## Set the data to be centroided -#' centroided(data) <- TRUE +#' @param keepIsotopes For `filterFourierTransformArtefacts()`: whether isotope +#' peaks should not be removed as fourier artefacts. #' -#' ## Replace peak intensities below 40 with 3. -#' res <- replaceIntensitiesBelow(data, threshold = 40, value = 3) -#' res +#' @param match For `filterRanges()` and `filterValues()`: `character(1) ` +#' defining whether the condition has to match for all provided +#' `ranges`/`values` (`match = "all"`; the default), or for any of them +#' (`match = "any"`) for spectra to be retained. #' -#' ## Get the intensities of the first and second spectrum. -#' intensity(res)[[1]] -#' intensity(res)[[2]] +#' @param maxCharge For `filterFourierTransformArtefacts()`: the maximum charge +#' to be considered for isotopes. #' -#' ## Remove all peaks with an intensity below 40. -#' res <- filterIntensity(res, intensity = c(40, Inf)) +#' @param msLevel. `integer` defining the MS level(s) of the spectra to which +#' the function should be applied (defaults to all MS levels of `object`. +#' For `filterMsLevel()`: the MS level to which `object` should be +#' subsetted. #' -#' ## Get the intensities of the first and second spectrum. 
-#' intensity(res)[[1]] -#' intensity(res)[[2]] +#' @param mz For `filterIsolationWindow()`: `numeric(1)` with the m/z value to +#' filter the object. For `filterPrecursorMz()` and `filterMzRange()`: +#' `numeric(2)` defining the lower and upper m/z boundary. +#' For `filterMzValues()` and `filterPrecursorMzValues()`: `numeric` with +#' the m/z values to match peaks or precursor m/z against. +#' For `filterPrecursorPeaks()`: `character(1)` defining whether mass peaks +#' with an m/z matching the spectrum's precursor m/z (`mz = "=="`, +#' the default) or mass peaks with a m/z that is equal or larger +#' (`mz = ">="`) should be removed. #' -#' ## Lengths of spectra is now different -#' lengths(mz(res)) -#' lengths(mz(data)) +#' @param n for `filterAcquisitionNum()`: `integer` with the acquisition +#' numbers to filter for. #' -#' ## In addition it is possible to pass a function to `filterIntensity()`: in -#' ## the example below we want to keep only peaks that have an intensity which -#' ## is larger than one third of the maximal peak intensity in that spectrum. -#' keep_peaks <- function(x, prop = 3) { -#' x > max(x, na.rm = TRUE) / prop -#' } -#' res2 <- filterIntensity(data, intensity = keep_peaks) -#' intensity(res2)[[1L]] -#' intensity(data)[[1L]] +#' @param object `Spectra` object. #' -#' ## We can also change the proportion by simply passing the `prop` parameter -#' ## to the function. To keep only peaks that have an intensity which is -#' ## larger than half of the maximum intensity: -#' res2 <- filterIntensity(data, intensity = keep_peaks, prop = 2) -#' intensity(res2)[[1L]] -#' intensity(data)[[1L]] +#' @param polarity for `filterPolarity()`: `integer` specifying the polarity to +#' to subset `object`. 
#' -#' ## Since data manipulation operations are by default not directly applied to -#' ## the data but only added to the internal lazy evaluation queue, it is also -#' ## possible to remove these data manipulations with the `reset()` function: -#' res_rest <- reset(res) -#' res_rest -#' lengths(mz(res_rest)) -#' lengths(mz(res)) -#' lengths(mz(data)) -#' -#' ## `reset()` after a `applyProcessing()` can not restore the data, because -#' ## the data in the backend was changed. Similarly, `reset()` after any -#' ## filter operations can not restore data for a `Spectra` with a -#' ## `MsBackendMemory` or `MsBackendDataFrame`. -#' res_2 <- applyProcessing(res) -#' res_rest <- reset(res_2) -#' lengths(mz(res)) -#' lengths(mz(res_rest)) +#' @param ppm For `filterMzValues()` and `reduceSpectra()`: `numeric(1)` +#' defining a relative, m/z-dependent, maximal accepted difference between +#' m/z values for peaks to be matched (or grouped). +#' For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the relative +#' maximal accepted difference of precursor m/z values of spectra for +#' grouping them into *precursor groups*. For `filterPrecursorIsotopes()`: +#' passed directly to the [isotopologues()] function. +#' For `filterValues()`: `numeric` of any length allowing to define +#' a maximal accepted difference between user input `values` and the +#' `spectraVariables` values. If it is not equal to the length of the +#' value provided with parameter `spectraVariables`, `ppm[1]` will be +#' recycled. #' +#' @param ranges for `filterRanges()`: A `numeric` vector of paired values +#' (upper and lower boundary) that define the ranges to filter the `object`. +#' These paired values need to be in the same order as the +#' `spectraVariables` parameter (see below). #' -#' ## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using -#' ## the normalized dotproduct method. 
-#' res <- compareSpectra(sciex_im[2:3], sciex_im[10:20])
-#' ## first row contains comparisons of spectrum 2 with spectra 10 to 20 and
-#' ## the second row comparisons of spectrum 3 with spectra 10 to 20
-#' res
+#' @param rt for `filterRt()`: `numeric(2)` defining the retention time range to
+#' be used to subset/filter `object`.
#'
-#' ## To use a simple Pearson correlation instead we can define a function
-#' ## that takes the two peak matrices and calculates the correlation for
-#' ## their second columns (containing the intensity values).
-#' correlateSpectra <- function(x, y, use = "pairwise.complete.obs", ...) {
-#' cor(x[, 2], y[, 2], use = use)
-#' }
-#' res <- compareSpectra(sciex_im[2:3], sciex_im[10:20],
-#' FUN = correlateSpectra)
-#' res
+#' @param spectraVariables For `selectSpectraVariables()`: `character` with the
+#' names of the spectra variables to which the backend should be
+#' subsetted. For `filterRanges()` and `filterValues()`: `character`
+#' vector specifying the column(s) from `spectraData(object)` on which
+#' to filter the data and that correspond to the names of the
+#' spectra variables that should be used for the filtering.
#'
-#' ## Use compareSpectra to determine the number of common (matching) peaks
-#' ## with a ppm of 10:
-#' ## type = "inner" uses a *inner join* to match peaks, i.e. keeps only
-#' ## peaks that can be mapped betwen both spectra. The provided FUN returns
-#' ## simply the number of matching peaks.
-#' compareSpectra(sciex_im[2:3], sciex_im[10:20], ppm = 10, type = "inner",
-#' FUN = function(x, y, ...) nrow(x))
+#' @param substDefinition For `deisotopeSpectra()` and
+#' `filterPrecursorIsotopes()`: `matrix` or `data.frame` with definitions
+#' of isotopic substitutions. Uses by default isotopic substitutions
+#' defined from all compounds in the Human Metabolome Database (HMDB). See
+#' [isotopologues()] or [isotopicSubstitutionMatrix()] in the
+#' *MetaboCoreUtils* for details.
#' -#' ## Apply an arbitrary function to each spectrum in a Spectra. -#' ## In the example below we calculate the mean intensity for each spectrum -#' ## in a subset of the sciex_im data. Note that we can access all variables -#' ## of each individual spectrum either with the `$` operator or the -#' ## corresponding method. -#' res <- spectrapply(sciex_im[1:20], FUN = function(x) mean(x$intensity[[1]])) -#' head(res) +#' @param threshold For `filterFourierTransformArtefacts()`: the relative +#' intensity (to a peak) below which peaks are considered fourier +#' artefacts. Defaults to `threshold = 0.2` hence removing peaks that +#' have an intensity below 0.2 times the intensity of the tested peak +#' (within the selected `halfWindowSize`). #' -#' ## It is however important to note that dedicated methods to access the -#' ## data (such as `intensity`) are much more efficient than using `lapply()`: -#' res <- lapply(intensity(sciex_im[1:20]), mean) -#' head(res) +#' @param tolerance For `filterMzValues()` and `reduceSpectra()`: +#' `numeric(1)` allowing to define a constant maximal accepted difference +#' between m/z values for peaks to be matched (or grouped). For +#' `containsMz()` it can also be of length equal `mz` to specify a different +#' tolerance for each m/z value. +#' For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the +#' (constant) maximal accepted difference of precursor m/z values of +#' spectra for grouping them into *precursor groups*. For +#' `filterPrecursorIsotopes()`: passed directly to the [isotopologues()] +#' function. For `filterValues()`: `numeric` of any length allowing to +#' define a maximal accepted difference between user input `values` and the +#' `spectraVariables` values. If it is not equal to the length of the +#' value provided with parameter `spectraVariables`, `tolerance[1]` will be +#' recycled. Default is `tolerance = 0`. 
#' -#' ## As an alternative, applying a function `FUN` to a `Spectra` can be -#' ## performed *chunk-wise*. The advantage of this is, that only the data for -#' ## one chunk at a time needs to be loaded into memory reducing the memory -#' ## demand. This type of processing can be performed by specifying the size -#' ## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` -#' ## parameter -#' spectrapply(sciex_im[1:20], lengths, chunkSize = 5L) +#' @param values for `filterValues()`: A `numeric` vector that define the +#' values to filter the Spectra data. These values need to be in the same +#' order as the `spectraVariables` parameter. #' -#' ## ---- DATA EXPORT ---- +#' @param weighted For `combinePeaks()`: `logical(1)` whether m/z values of +#' peaks within each peak group should be aggregated into a single m/z +#' value using an intensity-weighted mean. Defaults to `weighted = TRUE`. #' -#' ## Some `MsBackend` classes provide an `export()` method to export the data -#' ## to the file format supported by the backend. -#' ## The `MsBackendMzR` for example allows to export MS data to mzML or -#' ## mzXML file(s), the `MsBackendMgf` (defined in the MsBackendMgf R package) -#' ## would allow to export the data in mgf file format. -#' ## Below we export the MS data in `data`. We call the `export()` method on -#' ## this object, specify the backend that should be used to export the data -#' ## (and which also defines the output format) and provide a file name. -#' fl <- tempfile() -#' export(data, MsBackendMzR(), file = fl) +#' @param which for `containsMz()`: either `"any"` or `"all"` defining whether +#' any (the default) or all provided `mz` have to be present in the +#' spectrum. #' -#' ## This exported our data in mzML format. Below we read the first 6 lines -#' ## from that file. -#' readLines(fl, n = 6) +#' @param x A `Spectra` object. #' -#' ## If only a single file name is provided, all spectra are exported to that -#' ## file. 
To export data with the `MsBackendMzR` backend to different files, a -#' ## file name for each individual spectrum has to be provided. -#' ## Below we export each spectrum to its own file. -#' fls <- c(tempfile(), tempfile()) -#' export(data, MsBackendMzR(), file = fls) +#' @param y A `Spectra` object. +#' - For `joinSpectraData()`: a `DataFrame`. +#' - For `cbind2()` a `data.frame`, `DataFrame` or `matrix`. #' -#' ## Reading the data from the first file -#' res <- Spectra(backendInitialize(MsBackendMzR(), fls[1])) +#' @param x `Spectra` object. #' -#' mz(res) -#' mz(data) +#' @param z For `filterPrecursorCharge()`: `integer()` with the precursor +#' charges to be used as filter. #' -#' ## ---- PEAKS VARIABLES AND DATA ---- +#' @param ... Additional arguments. #' -#' ## Some `MsBackend` classes provide support for arbitrary peaks variables -#' ## (in addition to the mandatory `"mz"` and `"intensity"` values. Below -#' ## we create a simple data frame with an additional peak variable `"pk_ann"` -#' ## and create a `Spectra` with a `MsBackendMemory` for that data. -#' ## Importantly the number of values (per spectrum) need to be the same -#' ## for all peak variables. +#' @seealso +#' +#' - [combineSpectra()] for functions to combine or aggregate `Spectra`. +#' +#' - [combinePeaks()] for functions to combine or aggregate a `Spectra`'s +#' `peaksData()` +#' +#' @md +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf +#' +#' @examples +#' +#' ## Load a `Spectra` object with LC-MS/MS data. 
+#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML",
+#' package = "msdata")
+#' sps_dda <- Spectra(fl)
+#' sps_dda
+#'
+#'
+#' ## -------- SUBSET SPECTRA --------
+#'
+#' ## Subset to the first 3 spectra
+#' tmp <- sps_dda[1:3]
+#' tmp
+#' length(tmp)
+#'
+#' ## Subset to all MS2 spectra; this could be done with [, or, more
+#' ## efficiently, with the `filterMsLevel` function:
+#' sps_dda[msLevel(sps_dda) == 2L]
+#' filterMsLevel(sps_dda, 2L)
+#'
+#' ## Filter the object keeping only MS2 spectra with a precursor m/z value
+#' ## within a specified range:
+#' filterPrecursorMzRange(sps_dda, c(80, 90))
+#'
+#' ## Filter the object to MS2 spectra with a precursor m/z matching a
+#' ## pre-defined value (given ppm and tolerance)
+#' filterPrecursorMzValues(sps_dda, 85, ppm = 5, tolerance = 0.1)
+#'
+#' ## The `filterRanges()` function allows to filter a `Spectra` based on
+#' ## numerical ranges of any of its (numerical) spectra variables.
+#' ## First, determine the variable(s) on which to base the filtering:
+#' sv <- c("rtime", "precursorMz", "peaksCount")
+#' ## Note that ANY variables can be chosen here, and as many as wanted.
+#'
+#' ## Define the ranges (pairs of values with lower and upper boundary) to be
+#' ## used for the individual spectra variables. The first two values will be
+#' ## used for the first spectra variable (e.g., `"rtime"` here), the next two
+#' ## for the second (e.g.
`"precursorMz"` here) and so on: +#' ranges <- c(30, 350, 200, 500, 350, 600) +#' +#' ## Input the parameters within the filterRanges function: +#' filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, +#' ranges = ranges) +#' filt_spectra +#' +#' ## `filterRanges()` can also be used to filter a `Spectra` object with +#' ## multiple ranges for the same `spectraVariable` (e.g, here `"rtime"`) +#' sv <- c("rtime", "rtime") +#' ranges <- c(30, 100, 200, 300) +#' filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, +#' ranges = ranges, match = "any") +#' filt_spectra +#' +#' ## While `filterRanges()` filtered on numeric ranges, `filterValues()` +#' ## allows to filter an object matching spectra variable values to user +#' ## provided values (allowing to configure allowed differences using the +#' ## `ppm` and `tolerance` parameters). +#' ## First determine the variable(s) on which to base the filtering: +#' sv <- c("rtime", "precursorMz") +#' ## Note that ANY variables can be chosen here, and as many as wanted. +#' +#' ## Define the values that will be used to filter the spectra based on their +#' ## similarities to their respective `spectraVariables`. +#' ## The first values in the parameters values, tolerance and ppm will be +#' ## used for the first spectra variable (e.g. `"rtime"` here), the next for +#' ## the second (e.g. `"precursorMz"` here) and so on: +#' values <- c(350, 80) +#' tolerance <- c(100, 0.1) +#' ppm <- c(0, 50) +#' +#' ## Input the parameters within the `filterValues()` function: +#' filt_spectra <- filterValues(sps_dda, spectraVariables = sv, +#' values = values, tolerance = tolerance, ppm = ppm) +#' filt_spectra +#' +#' +#' ## -------- FILTER SPECTRA DATA -------- +#' +#' ## Remove spectra variables without content (i.e. 
with only missing values)
+#' sps_noNA <- dropNaSpectraVariables(sps_dda)
+#'
+#' ## Append new `spectraVariables` to the `spectraData`
+#' df <- data.frame(cola = 4:5, colb = "b")
+#' data_append <- cbind2(sps_dda, df)
+#'
+#' ## Keep only MS2 spectra with the `filterMsLevel()` function
+#' filterMsLevel(sps_dda, 2)
+#'
+#' ## This reduced the size of the object slightly
+#' print(object.size(sps_dda), unit = "MB")
+#' print(object.size(sps_noNA), unit = "MB")
+#'
+#' ## With the `selectSpectraVariables()` function it is in addition possible
+#' ## to subset the data of a `Spectra` to the selected columns/variables,
+#' ## keeping only their data:
+#' tmp <- selectSpectraVariables(sps_dda, c("msLevel", "mz", "intensity",
+#' "scanIndex"))
+#' print(object.size(tmp), units = "MB")
+#'
+#' ## Except the selected variables, all data is now removed. Accessing
+#' ## core spectra variables still works, but returns only NA
+#' rtime(tmp) |> head()
#'
-#' tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5))
-#' tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1))
-#' tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45))
-#' tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P"))
#'
-#' ## Create the Spectra. With parameter `peaksVariables` we can define
-#' ## the columns in `tmp` that contain peaks variables.
-#' sps <- Spectra(tmp, source = MsBackendMemory(),
-#' peaksVariables = c("mz", "intensity", "pk_ann"))
-#' peaksVariables(sps)
+#' ## -------- FILTER PEAKS DATA --------
#'
-#' ## Extract just the m/z and intensity values
-#' peaksData(sps)[[1L]]
+#' ## `filterMzValues()` filters the mass peaks data of a `Spectra` retaining
+#' ## only those mass peaks with an m/z value matching the provided value(s).
+#' sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), tolerance = 0.3) #' -#' ## Extract the full peaks data -#' peaksData(sps, columns = peaksVariables(sps))[[1L]] +#' ## The filtered `Spectra` has the same length +#' length(sps_dda) +#' length(sps_sub) #' -#' ## Access just the pk_ann variable -#' sps$pk_ann -NULL - -#' The Spectra class +#' ## But the number of mass peaks changed +#' lengths(sps_dda) |> head() +#' lengths(sps_sub) |> head() #' -#' The [Spectra-class] encapsulates data and meta-data for mass -#' spectrometry experiments. +#' ## This function can also be used to remove specific peaks from a spectrum +#' ## by setting `keep = FALSE`. +#' sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), +#' tolerance = 0.3, keep = FALSE) +#' lengths(sps_sub) |> head() #' +#' ## With the `filterMzRange()` function it is possible to keep (or remove) +#' ## mass peaks with m/z values within a specified numeric range. +#' sps_sub <- filterMzRange(sps_dda, mz = c(100, 150)) +#' lengths(sps_sub) |> head() #' -#' @slot backend A derivate of [MsBackend-class] holding/controlling the spectra -#' data. -#' @slot processingQueue `list` of `ProcessingStep` objects. -#' @slot processingQueueVariables `character` of spectraVariables that should -#' be passed to the processing step function. -#' @slot processing A `character` storing logging information. -#' @slot metadata A `list` storing experiment metadata. -#' @slot version A `characher(1)` containing the class version. +#' ## See also the `filterPeaksRanges()` function for a more flexible framework +#' ## to filter mass peaks #' -#' @name Spectra-class -#' @docType class -#' @author Sebastian Gibb \email{mail@@sebastiangibb.de} #' -#' @importClassesFrom S4Vectors DataFrame +#' ## Removing fourier transform artefacts seen in Orbitra data. #' -#' @importMethodsFrom S4Vectors lapply +#' ## Loading an Orbitrap spectrum with artefacts. 
+#' data(fft_spectrum) +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) #' -#' @importFrom S4Vectors DataFrame +#' fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) +#' fft_spectrum +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) #' -#' @noRd -setClass( - "Spectra", - slots = c( - backend = "MsBackend", - processingQueue = "list", - processingQueueVariables = "character", - ## logging - processing = "character", - ## metadata - metadata = "list", - processingChunkSize = "numeric", - version = "character" - ), - prototype = prototype(version = "0.3", - processingChunkSize = Inf) -) - -setValidity("Spectra", function(object) { - msg <- .valid_processing_queue(object@processingQueue) - if (length(msg)) msg - else TRUE -}) - -#' @rdname hidden_aliases +#' ## Using a few examples peaks in your data you can optimize the parameters +#' fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, +#' halfWindowSize = 0.2, +#' threshold = 0.005, +#' keepIsotopes = TRUE, +#' maxCharge = 5, +#' isotopeTolerance = 0.005 +#' ) #' -#' @importMethodsFrom methods show +#' fft_spectrum_filtered +#' length(mz(fft_spectrum_filtered)[[1]]) +#' plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) #' -#' @importFrom utils capture.output #' -#' @exportMethod show -setMethod("show", "Spectra", - function(object) { - cat("MSn data (", class(object)[1L], ") with ", - length(object@backend), " spectra in a ", class(object@backend), - " backend:\n", sep = "") - if (length(object@backend)) { - txt <- capture.output(show(object@backend)) - cat(txt[-1], sep = "\n") - } - if (length(object@processingQueue)) - cat("Lazy evaluation queue:", length(object@processingQueue), - "processing step(s)\n") - lp <- length(object@processing) - if (lp) { - lps <- object@processing - if (lp > 3) { - lps <- lps[1:3] - lps <- c(lps, paste0("...", lp - 3, " more 
processings. ", - "Use 'processingLog' to list all.")) - } - cat("Processing:\n", paste(lps, collapse="\n "), "\n") - } - }) - -#' @rdname Spectra -setMethod("Spectra", "missing", function(object, processingQueue = list(), - metadata = list(), ..., - backend = MsBackendMemory(), - BPPARAM = bpparam()) { - new("Spectra", metadata = metadata, processingQueue = processingQueue, - backend = backend) -}) - -#' @rdname Spectra -setMethod("Spectra", "MsBackend", function(object, processingQueue = list(), - metadata = list(), ..., - BPPARAM = bpparam()) { - new("Spectra", metadata = metadata, processingQueue = processingQueue, - backend = object) -}) - -#' @rdname Spectra +#' ## *Reducing* a `Spectra` keeping for groups of mass peaks (characterized +#' ## by similarity of their m/z values) only one representative peak. This +#' ## function helps cleaning fragment spectra. +#' ## Filter the data set to MS2 spectra +#' ms2 <- filterMsLevel(sps_dda, 2L) #' -#' @importFrom methods callNextMethod -setMethod("Spectra", "character", function(object, processingQueue = list(), - metadata = list(), - source = MsBackendMzR(), - backend = source, - ..., BPPARAM = bpparam()) { - if (!length(object)) - Spectra(backend, metadata = metadata, - processingQueue = processingQueue) - else - callNextMethod(object = object, processingQueue = processingQueue, - metadata = metadata, source = source, backend = backend, - ..., BPPARAM = BPPARAM) -}) +#' ## For groups of fragment peaks with a difference in m/z < 0.1, keep only +#' ## the largest one. 
+#' ms2_red <- reduceSpectra(ms2, ppm = 0, tolerance = 0.1) +#' lengths(ms2) |> tail() +#' lengths(ms2_red) |> tail() +NULL -#' @rdname Spectra -setMethod("Spectra", "ANY", function(object, processingQueue = list(), - metadata = list(), - source = MsBackendMemory(), - backend = source, - ..., BPPARAM = bpparam()) { - sp <- new("Spectra", metadata = metadata, processingQueue = processingQueue, - backend = backendInitialize( - source, object, ..., - BPPARAM = backendBpparam(source, BPPARAM))) - if (class(source)[1L] != class(backend)[1L]) - setBackend(sp, backend, ..., BPPARAM = backendBpparam(backend, BPPARAM)) - else sp +#' @rdname filterMsLevel +setMethod("dropNaSpectraVariables", "Spectra", function(object) { + object@backend <- dropNaSpectraVariables(object@backend) + object }) -#' @rdname Spectra -#' -#' @importMethodsFrom ProtGenerics setBackend -#' -#' @exportMethod setBackend +#' @rdname filterMsLevel setMethod( - "setBackend", c("Spectra", "MsBackend"), - function(object, backend, f = processingChunkFactor(object), ..., - BPPARAM = bpparam()) { - backend_class <- class(object@backend)[1L] - BPPARAM <- backendBpparam(object@backend, BPPARAM) - BPPARAM <- backendBpparam(backend, BPPARAM) - if (!supportsSetBackend(backend)) - stop(class(backend), " does not support 'setBackend'") - if (!length(object)) { - bknds <- backendInitialize( - backend, data = spectraData(object@backend), ...) - } else { - if (!is.factor(f)) - f <- force(factor(f, levels = unique(f))) - if (length(f) && (length(levels(f)) > 1)) { - if (length(f) != length(object)) - stop("length of 'f' has to match the length of 'object'") - bknds <- bplapply( - split(object@backend, f = f), - function(z, ...) { - backendInitialize(backend, - data = spectraData(z), ..., - BPPARAM = SerialParam()) - }, ..., BPPARAM = BPPARAM) - bknds <- backendMerge(bknds) - ## That below ensures the backend is returned in its original - ## order - unsplit does unfortunately not work. 
- if (is.unsorted(f)) - bknds <- bknds[order(unlist(split(seq_along(bknds), f), - use.names = FALSE))] - } else { - bknds <- backendInitialize( - backend, data = spectraData(object@backend), ...) - } - } - object@backend <- bknds - object@processing <- .logging(object@processing, - "Switch backend from ", - backend_class, " to ", - class(object@backend)) + "selectSpectraVariables", "Spectra", + function(object, spectraVariables = union(spectraVariables(object), + peaksVariables(object))) { + spectraVariables <- union(spectraVariables, "dataStorage") + object@backend <- selectSpectraVariables( + object@backend, spectraVariables = spectraVariables) object }) -#' @rdname Spectra -#' -#' @importFrom MsCoreUtils vapply1c +#' @rdname filterMsLevel #' -#' @exportMethod c -setMethod("c", "Spectra", function(x, ...) { - .concatenate_spectra(unname(list(unname(x), ...))) -}) - -#' @rdname Spectra -setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) { - bcknds <- split(x@backend, f, ...) - lapply(bcknds, function(b) { - slot(x, "backend", check = FALSE) <- b - x - }) +#' @export +setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { + if (!missing(j)) + stop("Subsetting 'Spectra' by columns is not (yet) supported") + if (missing(i)) + return(x) + slot(x, "backend", check = FALSE) <- extractByIndex( + x@backend, i2index(i, length(x))) + x }) +setClassUnion("dataframeOrDataFrame", c("data.frame", "DataFrame")) #' @rdname Spectra #' #' @export -setMethod("export", "Spectra", - function(object, backend, ...) { - if (missing(backend)) - stop("Parameter 'backend' is required.") - export(backend, object, ...) 
- }) - -#### --------------------------------------------------------------------------- -## -## ACCESSOR METHODS -## -#### --------------------------------------------------------------------------- - -#' @rdname Spectra -setMethod("acquisitionNum", "Spectra", function(object) - acquisitionNum(object@backend)) - -#' @rdname Spectra -setMethod( - "peaksData", "Spectra", - function(object, columns = c("mz", "intensity"), - f = processingChunkFactor(object), ..., BPPARAM = bpparam()) { - if (length(object@processingQueue) || length(f)) - SimpleList(.peaksapply(object, columns = columns, f = f)) - else SimpleList(peaksData(object@backend, columns = columns)) - }) - -#' @rdname Spectra -setMethod("peaksVariables", "Spectra", function(object) - peaksVariables(object@backend)) +setMethod("cbind2", signature(x = "Spectra", + y = "dataframeOrDataFrame"), function(x, y, ...) { + x@backend <- cbind2(x@backend, y, ...) + x + }) -#' @importFrom methods setAs -setAs("Spectra", "list", function(from, to) { - .peaksapply(from) +#' @rdname filterMsLevel +setMethod("filterAcquisitionNum", "Spectra", function(object, n = integer(), + dataStorage = character(), + dataOrigin = character()) { + if (length(dataStorage) && !is.character(dataStorage)) + stop("'dataStorage' is expected to be of type character") + if (length(dataOrigin) && !is.character(dataOrigin)) + stop("'dataOrigin' is expected to be of type character") + object@backend <- filterAcquisitionNum(object@backend, n, + dataStorage, dataOrigin) + object@processing <- .logging(object@processing, + "Filter: select by: ", length(n), + " acquisition number(s) in ", + max(length(dataStorage), length(dataOrigin)), + " file(s)") + object }) -setAs("Spectra", "SimpleList", function(from, to) { - peaksData(from) +#' @rdname filterMsLevel +setMethod("filterEmptySpectra", "Spectra", function(object) { + object@backend <- extractByIndex(object@backend, + which(as.logical(lengths(object)))) + object@processing <- 
.logging(object@processing, + "Filter: removed empty spectra.") + object }) -#' @rdname Spectra -setMethod("centroided", "Spectra", function(object) { - centroided(object@backend) +#' @rdname filterMsLevel +setMethod("filterDataOrigin", "Spectra", function(object, + dataOrigin = character()) { + if (length(dataOrigin) && !is.character(dataOrigin)) + stop("'dataOrigin' is expected to be of type character") + object@backend <- filterDataOrigin(object@backend, dataOrigin = dataOrigin) + object@processing <- .logging(object@processing, + "Filter: select data origin(s) ", + paste0(dataOrigin, collapse = ", ")) + object }) -#' @rdname Spectra -setReplaceMethod("centroided", "Spectra", function(object, value) { - centroided(object@backend) <- value +#' @rdname filterMsLevel +setMethod("filterDataStorage", "Spectra", function(object, + dataStorage = character()) { + if (length(dataStorage) && !is.character(dataStorage)) + stop("'dataStorage' is expected to be of type character") + object@backend <- filterDataStorage(object@backend, dataStorage) + object@processing <- .logging(object@processing, + "Filter: select data storage(s) ", + paste0(dataStorage, collapse = ", ")) object }) -#' @rdname Spectra -setMethod("collisionEnergy", "Spectra", function(object) { - collisionEnergy(object@backend) -}) +#' @rdname filterMsLevel +#' +#' @exportMethod filterFourierTransformArtefacts +setMethod("filterFourierTransformArtefacts", "Spectra", + function(object, halfWindowSize = 0.05, threshold = 0.2, + keepIsotopes = TRUE, maxCharge = 5, + isotopeTolerance = 0.005) { + object <- addProcessing(object, .peaks_remove_fft_artifact, + halfWindowSize = halfWindowSize, + threshold = threshold, + keepIsotopes = keepIsotopes, + maxCharge = maxCharge, + isotopeTolerance = isotopeTolerance) + object@processing <- .logging( + object@processing, "Remove fast fourier artefacts.") + object + }) -#' @rdname Spectra -setReplaceMethod("collisionEnergy", "Spectra", function(object, value) { - 
collisionEnergy(object@backend) <- value +#' @rdname filterMsLevel +#' +#' @importMethodsFrom ProtGenerics filterIntensity +#' +#' @exportMethod filterIntensity +setMethod("filterIntensity", "Spectra", + function(object, intensity = c(0, Inf), + msLevel. = uniqueMsLevels(object), ...) { + if (!.check_ms_level(object, msLevel.)) + return(object) + if (is.numeric(intensity)) { + if (length(intensity) == 1) + intensity <- c(intensity, Inf) + if (length(intensity) != 2) + stop("'intensity' should be of length specifying a ", + "lower intensity limit or of length two defining ", + "a lower and upper limit.") + object <- addProcessing(object, .peaks_filter_intensity, + intensity = intensity, + msLevel = msLevel., + spectraVariables = "msLevel") + object@processing <- .logging( + object@processing, "Remove peaks with intensities ", + "outside [", intensity[1], ", ", intensity[2], + "] in spectra of MS level(s) ", + paste0(msLevel., collapse = ", "), ".") + } else { + if (is.function(intensity)) { + object <- addProcessing( + object, .peaks_filter_intensity_function, + intfun = intensity, msLevel = msLevel., + args = list(...), spectraVariables = "msLevel") + object@processing <- .logging( + object@processing, "Remove peaks based on their ", + "intensities and a user-provided function ", + "in spectra of MS level(s) ", + paste0(msLevel., collapse = ", "), ".") + } + else stop("'intensity' has to be numeric or a function") + } + object + }) + + +#' @rdname filterMsLevel +setMethod("filterIsolationWindow", "Spectra", function(object, mz = numeric()) { + object@backend <- filterIsolationWindow(object@backend, mz = mz) + object@processing <- .logging(object@processing, + "Filter: select spectra containing m/z ", + mz, " in their isolation window") object }) -#' @rdname Spectra -setMethod("dataOrigin", "Spectra", function(object) dataOrigin(object@backend)) - -#' @rdname Spectra -setReplaceMethod("dataOrigin", "Spectra", function(object, value) { - dataOrigin(object@backend) <- 
value +#' @rdname filterMsLevel +setMethod("filterMsLevel", "Spectra", function(object, msLevel. = integer()) { + object@backend <- filterMsLevel(object@backend, msLevel = msLevel.) + object@processing <- .logging(object@processing, + "Filter: select MS level(s) ", + paste0(unique(msLevel.), collapse = " ")) object }) -#' @rdname Spectra -setMethod("dataStorage", "Spectra", - function(object) dataStorage(object@backend)) +#' @rdname filterMsLevel +#' +#' @importMethodsFrom ProtGenerics filterMzRange +#' +#' @export +setMethod("filterMzRange", "Spectra", + function(object, mz = numeric(), msLevel. = uniqueMsLevels(object), + keep = TRUE) { + if (!.check_ms_level(object, msLevel.)) + return(object) + if (!length(mz)) mz <- c(-Inf, Inf) + else mz <- range(mz) + object <- addProcessing(object, .peaks_filter_mz_range, mz = mz, + msLevel = msLevel., keep = keep, + spectraVariables = "msLevel") + if (keep) keep_or_remove <- "select" + else keep_or_remove <- "remove" + object@processing <- .logging( + object@processing, "Filter: ", keep_or_remove, + " peaks with an m/z within [", mz[1L], ", ", mz[2L], "]") + object + }) -#' @rdname Spectra -setMethod("dropNaSpectraVariables", "Spectra", function(object) { - object@backend <- dropNaSpectraVariables(object@backend) +#' @rdname filterMsLevel +#' +#' @importMethodsFrom ProtGenerics filterMzValues +#' +#' @export +setMethod("filterMzValues", "Spectra", + function(object, mz = numeric(), tolerance = 0, ppm = 20, + msLevel. 
= uniqueMsLevels(object), keep = TRUE) { + if (!.check_ms_level(object, msLevel.)) + return(object) + l <- length(mz) + if (length(tolerance) != 1) + stop("'tolerance' should be of length 1") + if (length(ppm) != 1) + stop("'ppm' should be of length 1") + if (is.unsorted(mz)) { + idx <- order(mz) + mz <- mz[idx] + if (length(tolerance) == l) + tolerance <- tolerance[idx] + if (length(ppm) == l) + ppm <- ppm[idx] + } + object <- addProcessing(object, .peaks_filter_mz_value, + mz = mz, tolerance = tolerance, + ppm = ppm, msLevel = msLevel., + keep = keep, spectraVariables = "msLevel") + if (length(mz) <= 3) + what <- paste0(format(mz, digits = 4), collapse = ", ") + else what <- "" + if (keep) + keep_or_remove <- "select" + else keep_or_remove <- "remove" + object@processing <- .logging( + object@processing, "Filter: ", keep_or_remove, + " peaks matching provided m/z values ", what) + object + }) + +#' @rdname filterMsLevel +setMethod("filterPolarity", "Spectra", function(object, polarity = integer()) { + object@backend <- filterPolarity(object@backend, polarity = polarity) + object@processing <- .logging(object@processing, + "Filter: select spectra with polarity ", + paste0(polarity, collapse = " ")) object }) -#' @rdname Spectra -setMethod("intensity", "Spectra", function(object, - f = processingChunkFactor(object), - ...) { - if (length(object@processingQueue) || length(f)) - NumericList(.peaksapply(object, FUN = function(z, ...) z[, 2], - f = f, ...), compress = FALSE) - else intensity(object@backend) -}) +#' @rdname filterMsLevel +#' +#' @export +setMethod("filterPrecursorMz", "Spectra", + function(object, mz = numeric()) { + .Deprecated( + msg = paste0("'filterPrecursorMz' is deprecated. 
Please use", + " 'filterPrecursorMzRange' instead.")) + object@backend <- filterPrecursorMzRange(object@backend, mz) + object@processing <- .logging( + object@processing, + "Filter: select spectra with a precursor m/z within [", + paste0(mz, collapse = ", "), "]") + object + }) -#' @rdname Spectra -setMethod("ionCount", "Spectra", function(object) { - if (length(object)) - unlist(.peaksapply( - object, FUN = function(pks, ...) sum(pks[, 2], na.rm = TRUE)), - use.names = FALSE) - else numeric() -}) +#' @rdname filterMsLevel +setMethod("filterPrecursorMzRange", "Spectra", + function(object, mz = numeric()) { + object@backend <- filterPrecursorMzRange(object@backend, mz) + object@processing <- .logging( + object@processing, + "Filter: select spectra with a precursor m/z within [", + paste0(mz, collapse = ", "), "]") + object + }) -#' @rdname Spectra -setMethod("isCentroided", "Spectra", function(object, ...) { - if (length(object)) - unlist(.peaksapply(object, FUN = .peaks_is_centroided), - use.names = FALSE) - else logical() -}) +#' @rdname filterMsLevel +setMethod("filterPrecursorMzValues", "Spectra", + function(object, mz = numeric(), ppm = 20, tolerance = 0) { + object@backend <- filterPrecursorMzValues( + object@backend, sort(mz), ppm = ppm, tolerance = tolerance) + object@processing <- .logging( + object@processing, + "Filter: select spectra with precursor m/z matching ", + paste0(mz, collapse = ", "), "") + object + }) -#' @rdname Spectra -setMethod("isEmpty", "Spectra", function(x) { - if (length(x)) - unlist(.peaksapply(x, FUN = function(pks, ...) 
nrow(pks) == 0), - use.names = FALSE) - else logical() -}) +#' @rdname filterMsLevel +setMethod("filterPrecursorCharge", "Spectra", + function(object, z = integer()) { + z <- unique(z) + object@backend <- filterPrecursorCharge(object@backend, z) + object@processing <- .logging( + object@processing, + "Filter: select spectra with a precursor charge ", + paste0(z, collapse = ", ")) + object + }) -#' @rdname Spectra -setMethod("isolationWindowLowerMz", "Spectra", function(object) { - isolationWindowLowerMz(object@backend) -}) +#' @rdname filterMsLevel +setMethod("filterPrecursorScan", "Spectra", + function(object, acquisitionNum = integer(), f = dataOrigin(object)) { + if (!all(f %in% unique(dataOrigin(object)))) + stop("'f' must be in dataOrigin().") + object@backend <- filterPrecursorScan(object@backend, + acquisitionNum, + f = dataOrigin(object)) + object@backend <- filterDataOrigin(object@backend, dataOrigin = f) + object@processing <- .logging( + object@processing, + "Filter: select parent/children scans for ", + paste0(acquisitionNum, collapse = " ")) + object + }) -#' @rdname Spectra -setReplaceMethod("isolationWindowLowerMz", "Spectra", function(object, value) { - isolationWindowLowerMz(object@backend) <- value - object -}) +#' @rdname filterMsLevel +setMethod("filterRt", "Spectra", + function(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) { + if (!is.numeric(msLevel.)) + stop("Please provide a numeric MS level.") + if (length(rt) != 2L || !is.numeric(rt) || rt[1] >= rt[2]) + stop("Please provide a lower and upper numeric retention", + " time range.") + if (length(rt)) + rt <- range(rt) + else rt <- c(-Inf, Inf) + object@backend <- filterRt(object@backend, rt, msLevel.) 
+ object@processing <- .logging( + object@processing, + "Filter: select retention time [", rt[1], "..", rt[2], + "] on MS level(s) ", paste0(msLevel., collapse = " ")) + object + }) -#' @rdname Spectra -setMethod("isolationWindowTargetMz", "Spectra", function(object) { - isolationWindowTargetMz(object@backend) -}) +#' @rdname filterMsLevel +setMethod("filterRanges", "Spectra", + function(object, spectraVariables = character(), ranges = numeric(), + match = c("all", "any")){ + object@backend <- filterRanges(object@backend, spectraVariables, + ranges, match) + object@processing <- .logging(object@processing, + "Filter: select spectra with a ", + spectraVariables, " within: [", + ranges[seq(ranges)%% 2 != 0], ", ", + ranges[seq(ranges)%% 2 == 0], "]" + ) + object + }) -#' @rdname Spectra -setReplaceMethod("isolationWindowTargetMz", "Spectra", function(object, value) { - isolationWindowTargetMz(object@backend) <- value - object -}) +#' @rdname filterMsLevel +setMethod("filterValues", "Spectra", + function(object, spectraVariables = character(), values = numeric(), + ppm = 0, tolerance = 0, match = c("all", "any")){ + object@backend <- filterValues(object@backend, spectraVariables, + values, ppm, tolerance, match) + object@processing <- .logging(object@processing, + "Filter: select spectra with a ", + spectraVariables, " similar to: ", + values) + object + }) -#' @rdname Spectra -setMethod("isolationWindowUpperMz", "Spectra", function(object) { - isolationWindowUpperMz(object@backend) -}) -#' @rdname Spectra -setReplaceMethod("isolationWindowUpperMz", "Spectra", function(object, value) { - isolationWindowUpperMz(object@backend) <- value - object -}) +################################################################################ +## +## Data manipulation and analysis operations (lazy processing) +## +################################################################################ -#' @rdname Spectra +#' @title Data manipulation and analysis methods +#' +#' @name 
addProcessing +#' +#' @aliases addProcessing +#' @aliases applyProcessing +#' @aliases bin +#' @aliases containsMz +#' @aliases containsNeutralLoss +#' @aliases entropy +#' @aliases pickPeaks +#' @aliases replaceIntensitiesBelow +#' @aliases reset +#' @aliases smooth +#' @aliases spectrapply +#' +#' @description +#' +#' Various data analysis functions are available for `Spectra` objects. These +#' can be categorized into functions that either return a `Spectra` object +#' (with the manipulated data) or functions that directly return the +#' result from the calculation. For the former category, the data manipulations +#' are cached in the result object's *processing queue* and only executed +#' on-the-fly when the respective data gets extracted from the `Spectra` (see +#' section *The processing queue* for more information). +#' +#' For the second category, the calculations are directly executed and the +#' result, usually one value per spectrum, returned. Generally, to reduce +#' memory demand, a chunk-wise processing of the data is performed. +#' +#' +#' @section Data analysis methods returning a `Spectra`: +#' +#' The methods listed here return a `Spectra` object as a result. +#' +#' - `addProcessing()`: adds an arbitrary function that should be applied to the +#' peaks matrix of every spectrum in `object`. The function (can be passed +#' with parameter `FUN`) is expected to take a peaks matrix as input and to +#' return a peaks matrix. A peaks matrix is a numeric matrix with two columns, +#' the first containing the m/z values of the peaks and the second the +#' corresponding intensities. The function has to have `...` in its +#' definition. Additional arguments can be passed with `...`. With parameter +#' `spectraVariables` it is possible to define additional spectra variables +#' from `object` that should be passed to the function `FUN`. These will be +#' passed by their name (e.g. 
specifying `spectraVariables = "precursorMz"` +#' will pass the spectra's precursor m/z as a parameter named `precursorMz` +#' to the function. The only exception is the spectra's MS level, these will +#' be passed to the function as a parameter called `spectrumMsLevel` (i.e. +#' with `spectraVariables = "msLevel"` the MS levels of each spectrum will be +#' submitted to the function as a parameter called `spectrumMsLevel`). +#' Examples are provided in the package vignette. +#' +#' - `bin()`: aggregates individual spectra into discrete (m/z) bins. Binning is +#' performed only on spectra of the specified MS level(s) (parameter +#' `msLevel`, by default all MS levels of `x`). The bins can be defined with +#' parameter `breaks` which by default are equally sized bins, with size +#' being defined by parameter `binSize`, from the minimal to the maximal m/z +#' of all spectra (of MS level `msLevel`) within `x`. The same bins are used +#' for all spectra in `x`. All intensity values for peaks falling into the +#' same bin are aggregated using the function provided with parameter `FUN` +#' (defaults to `FUN = sum`, i.e. all intensities are summed up). Note that +#' the binning operation is applied to the peak data on-the-fly upon data +#' access and it is possible to *revert* the operation with the `reset()` +#' function (see description of `reset()` below). +#' +#' - `countIdentifications`: counts the number of identifications each scan has +#' led to. See [countIdentifications()] for more details. +#' +#' - `pickPeaks()`: picks peaks on individual spectra using a moving +#' window-based approach (window size = `2 * halfWindowSize`). For noisy +#' spectra there are currently two different noise estimators available, +#' the *M*edian *A*bsolute *D*eviation (`method = "MAD"`) and +#' Friedman's Super Smoother (`method = "SuperSmoother"`), +#' as implemented in the [`MsCoreUtils::noise()`]. 
+#' The method supports also to optionally *refine* the m/z value of +#' the identified centroids by considering data points that belong (most +#' likely) to the same mass peak. Therefore the m/z value is calculated as an +#' intensity weighted average of the m/z values within the peak region. +#' The peak region is defined as the m/z values (and their respective +#' intensities) of the `2 * k` closest signals to the centroid or the closest +#' valleys (`descending = TRUE`) in the `2 * k` region. For the latter the `k` +#' has to be chosen general larger. See [`MsCoreUtils::refineCentroids()`] for +#' details. +#' If the ratio of the signal to the highest intensity of the peak is below +#' `threshold` it will be ignored for the weighted average. +#' +#' - `replaceIntensitiesBelow()`: replaces intensities below a specified +#' threshold with the provided `value`. Parameter `threshold` can be either +#' a single numeric value or a function which is applied to all non-`NA` +#' intensities of each spectrum to determine a threshold value for each +#' spectrum. The default is `threshold = min` which replaces all values +#' which are <= the minimum intensity in a spectrum with `value` (the +#' default for `value` is `0`). Note that the function specified with +#' `threshold` is expected to have a parameter `na.rm` since `na.rm = TRUE` +#' will be passed to the function. If the spectrum is in profile mode, +#' ranges of successive non-0 peaks <= `threshold` are set to 0. +#' Parameter `msLevel.` allows to apply this to only spectra of certain MS +#' level(s). +#' +#' - `scalePeaks()`: scales intensities of peaks within each spectrum depending +#' on parameter `by`. With `by = sum` (the default) peak intensities are +#' divided by the sum of peak intensities within each spectrum. The sum of +#' intensities is thus 1 for each spectrum after scaling. Parameter +#' `msLevel.` allows to apply the scaling of spectra of a certain MS level. +#' By default (`msLevel. 
= uniqueMsLevels(x)`) intensities for all +#' spectra will be scaled. +#' +#' - `smooth()`: smooths individual spectra using a moving window-based approach +#' (window size = `2 * halfWindowSize`). Currently, the +#' Moving-Average- (`method = "MovingAverage"`), +#' Weighted-Moving-Average- (`method = "WeightedMovingAverage")`, +#' weights depending on the distance of the center and calculated +#' `1/2^(-halfWindowSize:halfWindowSize)`) and +#' Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported. +#' For details how to choose the correct `halfWindowSize` please see +#' [`MsCoreUtils::smooth()`]. +#' +#' +#' @section Data analysis methods returning the result from the calculation: +#' +#' The functions listed in this section return immediately the result from the +#' calculation. To reduce memory demand (and allow parallel processing) the +#' calculations a chunk-wise processing is generally performed. +#' +#' - `chunkapply()`: apply an arbitrary function to chunks of spectra. See +#' [chunkapply()] for details and examples. +#' +#' - `containsMz()`: checks for each of the spectra whether they contain mass +#' peaks with an m/z equal to `mz` (given acceptable difference as defined by +#' parameters `tolerance` and `ppm` - see [common()] for details). Parameter +#' `which` allows to define whether any (`which = "any"`, the default) or +#' all (`which = "all"`) of the `mz` have to match. The function returns +#' `NA` if `mz` is of length 0 or is `NA`. +#' +#' - `containsNeutralLoss()`: checks for each spectrum in `object` if it has a +#' peak with an m/z value equal to its precursor m/z - `neutralLoss` (given +#' acceptable difference as defined by parameters `tolerance` and `ppm`). +#' Returns `NA` for MS1 spectra (or spectra without a precursor m/z). +#' +#' - `entropy()`: calculates the entropy of each spectra based on the metrics +#' suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). 
+#' See also [nentropy()] in the *MsCoreUtils* package for details. +#' +#' - `estimatePrecursorIntensity()`: defines the precursor intensities for MS2 +#' spectra using the intensity of the matching MS1 peak from the +#' closest MS1 spectrum (i.e. the last MS1 spectrum measured before the +#' respective MS2 spectrum). With `method = "interpolation"` it is also +#' possible to calculate the precursor intensity based on an interpolation of +#' intensity values (and retention times) of the matching MS1 peaks from the +#' previous and next MS1 spectrum. See [estimatePrecursorIntensity()] for +#' examples and more details. +#' +#' - `estimatePrecursorMz()`: **for DDA data**: allows to estimate a fragment +#' spectra's precursor m/z based on the reported precursor m/z and the data +#' from the previous MS1 spectrum. See [estimatePrecursorMz()] for details. +#' +#' - `neutralLoss()`: calculates neutral loss spectra for fragment spectra. See +#' [neutralLoss()] for detailed documentation. +#' +#' - `spectrapply()`: applies a given function to each individual spectrum or +#' sets of a `Spectra` object. By default, the `Spectra` is split into +#' individual spectra (i.e. `Spectra` of length 1) and the function `FUN` +#' is applied to each of them. An alternative splitting can be defined with +#' parameter `f`. Parameters for `FUN` can be passed using `...`. +#' The returned result and its order depend on the function `FUN` and how +#' `object` is split (hence on `f`, if provided). Parallel processing is +#' supported and can be configured with parameter `BPPARAM`, is however only +#' suggested for computational intense `FUN`. +#' As an alternative to the (eventual parallel) processing of the full +#' `Spectra`, `spectrapply()` supports also a chunk-wise processing. For this, +#' parameter `chunkSize` needs to be specified. `object` is then split into +#' chunks of size `chunkSize` which are then (stepwise) processed by `FUN`. 
+#' This guarantees a lower memory demand (especially for on-disk backends) +#' since only the data for one chunk needs to be loaded into memory in each +#' iteration. Note that by specifying `chunkSize`, parameters `f` and +#' `BPPARAM` will be ignored. +#' See also `chunkapply()` above or examples below for details on chunk-wise +#' processing. +#' +#' +#' @section The processing queue: +#' +#' Operations that modify mass peak data, i.e. the m/z and intensity values of +#' a `Spectra` are generally not applied immediately to the data but are +#' *cached* within the object's *processing queue*. These operations are then +#' applied to the data only upon request, for example when m/z and/or +#' intensity values are extracted. This lazy execution guarantees that the +#' same functionality can be applied to any `Spectra` object, regardless of +#' the type of backend that is used. Thus, data manipulation operations can +#' also be applied to data that is *read only*. As a side effect, this enables +#' also to *undo* operations using the `reset()` function. +#' +#' Functions related to the processing queue are: +#' +#' - `applyProcessing()`: for `Spectra` objects that use a **writeable** backend +#' only: apply all steps from the lazy processing queue to the peak data and +#' write it back to the data storage. Parameter `f` allows to specify how +#' `object` should be split for parallel processing. This should either be +#' equal to the `dataStorage`, or `f = rep(1, length(object))` to disable +#' parallel processing altogether. Other partitionings might result in +#' errors (especially if a `MsBackendHdf5Peaks` backend is used). +#' +#' - `processingLog()`: returns a `character` vector with the processing log +#' messages. +#' +#' - `reset()`: restores the data to its original state (as much as possible): +#' removes any processing steps from the lazy processing queue and calls +#' `reset()` on the backend which, depending on the backend, can also undo +#' e.g. 
data filtering operations. Note that a `reset()` call after +#' `applyProcessing()` will not have any effect. See examples below for more +#' information. +#' +#' @param binSize For `bin()`: `numeric(1)` defining the size for the m/z bins. +#' Defaults to `binSize = 1`. #' -#' @exportMethod containsMz -setMethod("containsMz", "Spectra", function(object, mz = numeric(), - tolerance = 0, - ppm = 20, which = c("any", "all"), - BPPARAM = bpparam()) { - cond_fun <- match.fun(match.arg(which)) - if (all(is.na(mz))) - return(rep(NA, length(object))) - mz <- unique(sort(mz)) - BPPARAM <- backendBpparam(object@backend, BPPARAM) - ## TODO: fix to use .peaksapply instead. - if (is(BPPARAM, "SerialParam")) - .has_mz(object, mz, tolerance = tolerance, ppm = ppm, - condFun = cond_fun, parallel = BPPARAM) - else { - sp <- SerialParam() - f <- as.factor(dataStorage(object)) - res <- .lapply(object, FUN = .has_mz, mz = mz, tolerance = tolerance, - condFun = cond_fun, parallel = sp, f = f, - BPPARAM = BPPARAM) - unsplit(res, f = f) - } -}) - -#' @rdname Spectra +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. See also [processingChunkSize()] for +#' additional information on parallel processing. #' -#' @exportMethod containsNeutralLoss -setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, - tolerance = 0, ppm = 20, - BPPARAM = bpparam()) { - BPPARAM <- backendBpparam(object@backend, BPPARAM) - ## TODO: FIX me to use chunk size. 
- if (is(BPPARAM, "SerialParam")) { - .has_mz_each(object, precursorMz(object) - neutralLoss, - tolerance = tolerance, ppm = ppm, parallel = BPPARAM) - } else { - sp <- SerialParam() - f <- as.factor(dataStorage(object)) - res <- .lapply(object, FUN = function(obj, n, tol, ppm, par) { - .has_mz_each(obj, precursorMz(obj) - n, tolerance = tol, - ppm = ppm, parallel = sp) - }, n = neutralLoss, tol = tolerance, ppm = ppm, par = sp, f = f, - BPPARAM = BPPARAM) - unsplit(res, f = f) - } -}) - -#' @rdname Spectra +#' @param breaks For `bin()`: `numeric` defining the m/z breakpoints between +#' bins. #' -#' @importMethodsFrom ProtGenerics spectrapply +#' @param by For `scalePeaks()`: function to calculate a single `numeric` from +#' intensity values of a spectrum by which all intensities (of +#' that spectrum) should be divided by. The default `by = sum` will +#' divide intensities of each spectrum by the sum of intensities of that +#' spectrum. #' -#' @exportMethod spectrapply -setMethod("spectrapply", "Spectra", function(object, FUN, ..., - chunkSize = integer(), - f = factor(), - BPPARAM = SerialParam()) { - if (missing(FUN)) - FUN <- identity - if (length(chunkSize)) - return(chunkapply(object, FUN, ..., chunkSize = chunkSize)) - if (!length(f)) - f <- as.factor(seq_along(object)) - .lapply(object, FUN = FUN, f = f, ..., - BPPARAM = backendBpparam(object@backend, BPPARAM)) -}) - -#' @rdname Spectra +#' @param chunkSize For `spectrapply()`: size of the chunks into which the +#' `Spectra` should be split. This parameter overrides parameters +#' `f` and `BPPARAM`. #' -#' @exportMethod length -setMethod("length", "Spectra", function(x) length(x@backend)) - -#' @rdname Spectra -setMethod("msLevel", "Spectra", function(object) msLevel(object@backend)) - -#' @rdname Spectra -setMethod("mz", "Spectra", function(object, f = processingChunkFactor(object), - ...) { - if (length(object@processingQueue) || length(f)) - NumericList(.peaksapply(object, FUN = function(z, ...) 
z[, 1], - f = f, ...), compress = FALSE) - else mz(object@backend) -}) - -#' @rdname Spectra +#' @param descending For `pickPeaks()`: `logical`, if `TRUE` just values +#' between the nearest valleys around the peak centroids are used. +# +#' @param f For `spectrapply()` and `applyProcessing()`: `factor` defining +#' how `object` should be split for eventual parallel processing. +#' Defaults to `factor()` for `spectrapply()` hence the object is not +#' split while it defaults to `f = processingChunkSize(object)` for +#' `applyProcessing()` splitting thus the object by default into chunks +#' depending on [processingChunkSize()]. #' -#' @exportMethod lengths -setMethod("lengths", "Spectra", function(x, use.names = FALSE) { - f <- .parallel_processing_factor(x) - if (length(x)) { - if (length(x@processingQueue) || length(f)) - unlist(.peaksapply(x, FUN = function(pks, ...) nrow(pks)), - use.names = use.names) - else lengths(x@backend, use.names = use.names) - } else integer() -}) - -#' @rdname Spectra -setMethod("polarity", "Spectra", function(object) { - polarity(object@backend) -}) - -#' @rdname Spectra -setReplaceMethod("polarity", "Spectra", function(object, value) { - polarity(object@backend) <- value - object -}) - -#' @rdname Spectra -setMethod("precScanNum", "Spectra", function(object) { - precScanNum(object@backend) -}) - -#' @rdname Spectra -setMethod("precursorCharge", "Spectra", function(object) { - precursorCharge(object@backend) -}) - -#' @rdname Spectra -setMethod("precursorIntensity", "Spectra", function(object) { - precursorIntensity(object@backend) -}) - -#' @rdname Spectra -setMethod("precursorMz", "Spectra", function(object) { - precursorMz(object@backend) -}) - -#' @rdname Spectra -setMethod("rtime", "Spectra", function(object) { - rtime(object@backend) -}) - -#' @rdname Spectra -setReplaceMethod("rtime", "Spectra", function(object, value) { - rtime(object@backend) <- value - object -}) - -#' @rdname Spectra -setMethod("scanIndex", "Spectra", 
function(object) { - scanIndex(object@backend) -}) - -#' @rdname Spectra -setMethod( - "selectSpectraVariables", "Spectra", - function(object, spectraVariables = union(spectraVariables(object), - peaksVariables(object))) { - spectraVariables <- union(spectraVariables, "dataStorage") - object@backend <- selectSpectraVariables( - object@backend, spectraVariables = spectraVariables) - object - }) - -#' @rdname Spectra -setMethod("smoothed", "Spectra", function(object) { - smoothed(object@backend) -}) - -#' @rdname Spectra -setReplaceMethod("smoothed", "Spectra", function(object, value) { - smoothed(object@backend) <- value - object -}) - -#' @rdname Spectra +#' @param FUN For `addProcessing()`: function to be applied to the peak matrix +#' of each spectrum in `object`. +#' For `bin()`: function to aggregate intensity values of peaks falling +#' into the same bin. Defaults to `FUN = sum` thus summing up intensities. +#' For `spectrapply()` and `chunkapply()`: function to be applied to +#' each individual or each chunk of `Spectra`. #' -#' @importMethodsFrom ProtGenerics spectraData +#' @param halfWindowSize For `pickPeaks()`: `integer(1)`, used in the +#' identification of the mass peaks: a local maximum has to be the +#' maximum in the window from `(i - halfWindowSize):(i + halfWindowSize)`. +#' For `smooth()`: `integer(1)`, used in the smoothing algorithm, the +#' window reaches from `(i - halfWindowSize):(i + halfWindowSize)`. #' -#' @exportMethod spectraData -setMethod( - "spectraData", "Spectra", - function(object, columns = spectraVariables(object)) { - if (length(object@processingQueue) && - length(pcns <- intersect(columns, peaksVariables(object)))) { - ## If user requests peaks variables we need to ensure that the - ## processing queue is executed. 
- scns <- setdiff(columns, pcns) - if (length(scns)) - spd <- spectraData(object@backend, columns = scns) - else - spd <- make_zero_col_DFrame(nrow = length(object)) - pkd <- peaksData(object, columns = pcns) - ## Add individual peaks variables to the `DataFrame`. - for (pcn in pcns) { - vals <- lapply(pkd, `[`, , pcn) - if (pcn %in% c("mz", "intensity")) - vals <- NumericList(vals, compress = FALSE) - spd <- do.call(`[[<-`, list(spd, i = pcn, value = vals)) - } - spd - } else - spectraData(object@backend, columns = columns) - }) - -#' @rdname Spectra +#' @param k For `pickPeaks()`: `integer(1)`, number of values left and right of +#' the peak that should be considered in the weighted mean calculation. #' -#' @importMethodsFrom ProtGenerics spectraData<- +#' @param method For `pickPeaks()`: `character(1)`, the noise estimators that +#' should be used, currently the *M*edian *A*bsolute *D*eviation +#' (`method = "MAD"`) and Friedman's Super Smoother +#' (`method = "SuperSmoother"`) are supported. +#' For `smooth()`: `character(1)`, the smoothing function that should be +#' used, currently, the Moving-Average- (`method = "MovingAverage"`), +#' Weighted-Moving-Average- (`method = "WeightedMovingAverage")`, +#' Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported. #' -#' @exportMethod spectraData<- -setReplaceMethod("spectraData", "Spectra", function(object, value) { - if (!inherits(value, "DataFrame")) - stop("'spectraData<-' expects a 'DataFrame' as input.", call. = FALSE) - pvs <- peaksVariables(object) - if (length(object@processingQueue) && - any(colnames(value) %in% pvs)) - stop("Can not replace peaks variables with a non-empty processing ", - "queue. Please use 'object <- applyProcessing(object)' to apply ", - "and clear the processing queue. Note that 'applyProcessing' ", - "requires a *writeable* backend. Use e.g. 
'object <- ", - "setBackend(object, MsBackendMemory())' if needed.") - pvs <- setdiff(pvs, colnames(value)) - if (length(pvs)) { - sd <- spectraData(object, pvs) - for (pv in pvs) { - value <- do.call("$<-", list(value, name = pv, sd[, pv])) - } - object@processingQueue <- list() - } - spectraData(object@backend) <- value - object -}) - -#' @rdname Spectra -setMethod("spectraNames", "Spectra", function(object) { - spectraNames(object@backend) -}) - -#' @rdname Spectra -setReplaceMethod("spectraNames", "Spectra", function(object, value) { - spectraNames(object@backend) <- value - object -}) - -#' @rdname Spectra -setMethod("spectraVariables", "Spectra", function(object) { - setdiff(spectraVariables(object@backend), peaksVariables(object@backend)) -}) - -#' @rdname Spectra -setMethod("tic", "Spectra", function(object, initial = TRUE) { - if (!length(object)) - return(numeric()) - if (initial) - tic(object@backend, initial = initial) - else ionCount(object) -}) - -#' @rdname Spectra +#' @param msLevel. `integer` defining the MS level(s) of the spectra to which +#' the function should be applied (defaults to all MS levels of `object`). +#' +#' @param mz For `containsMz()`: `numeric` with the m/z value(s) of the mass +#' peaks to check. +#' +#' @param neutralLoss for `containsNeutralLoss()`: `numeric(1)` defining the +#' value which should be subtracted from the spectrum's precursor m/z. +#' +#' @param normalized for `entropy()`: `logical(1)` whether the normalized +#' entropy should be calculated (default). See also [nentropy()] for +#' details. #' -#' @importMethodsFrom S4Vectors $ +#' @param object A `Spectra` object. 
#' -#' @export -setMethod("$", "Spectra", function(x, name) { - if (!(name %in% c(spectraVariables(x@backend), peaksVariables(x@backend)))) - stop("No spectra variable '", name, "' available") - if (name == "mz") - mz(x) - else if (name == "intensity") - intensity(x) - else { - if (length(x@processingQueue) && name %in% peaksVariables(x)) - .peaksapply(x, FUN = function(z, ...) z[, name], - columns = c("mz", "intensity", name)) - else - do.call("$", list(x@backend, name)) - } -}) - -#' @rdname Spectra +#' @param ppm For `containsMz()` and `neutralLoss()`: `numeric(1)` defining a +#' relative, m/z-dependent, maximal accepted difference between m/z values +#' for peaks to be matched. #' -#' @export -setReplaceMethod("$", "Spectra", function(x, name, value) { - if (length(x@processingQueue) && - any(name %in% peaksVariables(x))) - stop("Can not replace peaks variables with a non-empty processing ", - "queue. Please use 'object <- applyProcessing(object)' to apply ", - "and clear the processing queue. Note that 'applyProcessing' ", - "requires a *writeable* backend. Use e.g. 'object <- ", - "setBackend(object, MsBackendMemory())' if needed.") - x@backend <- do.call("$<-", list(x@backend, name, value)) - x -}) - -#' @rdname Spectra +#' @param snr For `pickPeaks()`: `double(1)` defining the +#' *S*ignal-to-*N*oise-*R*atio. The intensity of a local maximum has to be +#' higher than `snr * noise` to be considered as peak. #' -#' @export -setMethod("[[", "Spectra", function(x, i, j, ...) 
{ - if (!is.character(i)) - stop("'i' is supposed to be a character defining the spectra ", - "variable to access.") - if (!missing(j)) - stop("'j' is not supported.") - if (!(i %in% c(spectraVariables(x), "mz", "intensity"))) - stop("No spectra variable '", i, "' available") - if (i == "mz") - mz(x) - else if (i == "intensity") - intensity(x) - else - do.call("[[", list(x@backend, i)) -}) - -#' @rdname Spectra +#' @param spectraVariables For `addProcessing()`: `character` with additional +#' spectra variables that should be passed along to the function defined +#' with `FUN`. See function description for details. #' -#' @export -setReplaceMethod("[[", "Spectra", function(x, i, j, ..., value) { - if (!is.character(i)) - stop("'i' is supposed to be a character defining the spectra ", - "variable to replace or create.") - if (!missing(j)) - stop("'j' is not supported.") - x@backend <- do.call("[[<-", list(x@backend, i = i, value = value)) - x -}) - -#### --------------------------------------------------------------------------- -## -## FILTERING AND SUBSETTING -## -#### --------------------------------------------------------------------------- - -#' @rdname Spectra -setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { - if (!missing(j)) - stop("Subsetting 'Spectra' by columns is not (yet) supported") - if (missing(i)) - return(x) - slot(x, "backend", check = FALSE) <- x@backend[i = i] - x -}) - -setClassUnion("dataframeOrDataFrame", c("data.frame", "DataFrame")) -#' @rdname Spectra +#' @param threshold For `pickPeaks()`: a `numeric(1)` defining the proportion +#' of the maximal peak intensity. Only values above the threshold are +#' used for the weighted mean calculation. +#' For `replaceIntensitiesBelow()`: a `numeric(1)` defining the threshold +#' or a `function` to calculate the threshold for each spectrum on its +#' intensity values. Defaults to `threshold = min`. 
#' -#' @export -setMethod("cbind2", signature(x = "Spectra", - y = "dataframeOrDataFrame"), function(x, y, ...) { - x@backend <- cbind2(x@backend, y, ...) - x - }) - -#' @rdname Spectra -setMethod("filterAcquisitionNum", "Spectra", function(object, n = integer(), - dataStorage = character(), - dataOrigin = character()) { - if (length(dataStorage) && !is.character(dataStorage)) - stop("'dataStorage' is expected to be of type character") - if (length(dataOrigin) && !is.character(dataOrigin)) - stop("'dataOrigin' is expected to be of type character") - object@backend <- filterAcquisitionNum(object@backend, n, - dataStorage, dataOrigin) - object@processing <- .logging(object@processing, - "Filter: select by: ", length(n), - " acquisition number(s) in ", - max(length(dataStorage), length(dataOrigin)), - " file(s)") - object -}) - -#' @rdname Spectra -setMethod("filterEmptySpectra", "Spectra", function(object) { - object@backend <- object@backend[as.logical(lengths(object))] - object@processing <- .logging(object@processing, - "Filter: removed empty spectra.") - object -}) - -#' @rdname Spectra -setMethod("filterDataOrigin", "Spectra", function(object, - dataOrigin = character()) { - if (length(dataOrigin) && !is.character(dataOrigin)) - stop("'dataOrigin' is expected to be of type character") - object@backend <- filterDataOrigin(object@backend, dataOrigin = dataOrigin) - object@processing <- .logging(object@processing, - "Filter: select data origin(s) ", - paste0(dataOrigin, collapse = ", ")) - object -}) - -#' @rdname Spectra -setMethod("filterDataStorage", "Spectra", function(object, - dataStorage = character()) { - if (length(dataStorage) && !is.character(dataStorage)) - stop("'dataStorage' is expected to be of type character") - object@backend <- filterDataStorage(object@backend, dataStorage) - object@processing <- .logging(object@processing, - "Filter: select data storage(s) ", - paste0(dataStorage, collapse = ", ")) - object -}) - -#' @rdname Spectra +#' @param 
tolerance For `containsMz()` and `neutralLoss()`: +#' `numeric(1)` allowing to define a constant maximal accepted difference +#' between m/z values for peaks to be matched. #' -#' @exportMethod filterFourierTransformArtefacts -setMethod("filterFourierTransformArtefacts", "Spectra", - function(object, halfWindowSize = 0.05, threshold = 0.2, - keepIsotopes = TRUE, maxCharge = 5, - isotopeTolerance = 0.005) { - object <- addProcessing(object, .peaks_remove_fft_artifact, - halfWindowSize = halfWindowSize, - threshold = threshold, - keepIsotopes = keepIsotopes, - maxCharge = maxCharge, - isotopeTolerance = isotopeTolerance) - object@processing <- .logging( - object@processing, "Remove fast fourier artefacts.") - object - }) - -#' @rdname Spectra +#' @param value For `replaceIntensitiesBelow()`: `numeric(1)` defining the +#' value with which intensities should be replaced with. #' -#' @importMethodsFrom ProtGenerics filterIntensity +#' @param which For `containsMz()`: either `"any"` or `"all"` defining whether +#' any (the default) or all provided `mz` have to be present in the +#' spectrum. #' -#' @exportMethod filterIntensity -setMethod("filterIntensity", "Spectra", - function(object, intensity = c(0, Inf), - msLevel. = uniqueMsLevels(object), ...) 
{ - if (!.check_ms_level(object, msLevel.)) - return(object) - if (is.numeric(intensity)) { - if (length(intensity) == 1) - intensity <- c(intensity, Inf) - if (length(intensity) != 2) - stop("'intensity' should be of length specifying a ", - "lower intensity limit or of length two defining ", - "a lower and upper limit.") - object <- addProcessing(object, .peaks_filter_intensity, - intensity = intensity, - msLevel = msLevel., - spectraVariables = "msLevel") - object@processing <- .logging( - object@processing, "Remove peaks with intensities ", - "outside [", intensity[1], ", ", intensity[2], - "] in spectra of MS level(s) ", - paste0(msLevel., collapse = ", "), ".") - } else { - if (is.function(intensity)) { - object <- addProcessing( - object, .peaks_filter_intensity_function, - intfun = intensity, msLevel = msLevel., - args = list(...), spectraVariables = "msLevel") - object@processing <- .logging( - object@processing, "Remove peaks based on their ", - "intensities and a user-provided function ", - "in spectra of MS level(s) ", - paste0(msLevel., collapse = ", "), ".") - } - else stop("'intensity' has to be numeric or a function") - } - object - }) - - -#' @rdname Spectra -setMethod("filterIsolationWindow", "Spectra", function(object, mz = numeric()) { - object@backend <- filterIsolationWindow(object@backend, mz = mz) - object@processing <- .logging(object@processing, - "Filter: select spectra containing m/z ", - mz, " in their isolation window") - object -}) - -#' @rdname Spectra -setMethod("filterMsLevel", "Spectra", function(object, msLevel. = integer()) { - object@backend <- filterMsLevel(object@backend, msLevel = msLevel.) - object@processing <- .logging(object@processing, - "Filter: select MS level(s) ", - paste0(unique(msLevel.), collapse = " ")) - object -}) - -#' @rdname Spectra +#' @param x A `Spectra`. +#' +#' @param zero.rm For `bin()`: `logical(1)` indicating whether to remove bins +#' with zero intensity. 
Defaults to `TRUE`, meaning the function will +#' discard bins created with an intensity of 0 to enhance memory +#' efficiency. +#' +#' @param ... Additional arguments passed to internal and downstream functions. +#' +#' @return +#' +#' See the documentation of the individual functions for a description of the +#' return value. +#' +#' @md +#' +#' @seealso +#' +#' - [compareSpectra()] for calculation of spectra similarity scores. +#' +#' - [processingChunkSize()] for information on parallel and chunk-wise data +#' processing. +#' +#' - [Spectra] for a general description of the `Spectra` object. +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf, Mar Garcia-Aloy +#' +#' @examples +#' +#' ## Load a `Spectra` object with LC-MS/MS data. +#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", +#' package = "msdata") +#' sps_dda <- Spectra(fl) +#' sps_dda +#' +#' +#' ## -------- FUNCTIONS RETURNING A SPECTRA -------- +#' +#' ## Replace peak intensities below 40 with a value of 1 +#' sps_mod <- replaceIntensitiesBelow(sps_dda, threshold = 20, value = 1) +#' sps_mod +#' +#' ## Get the intensities of the first spectrum before and after the +#' ## operation +#' intensity(sps_dda[1]) +#' intensity(sps_mod[1]) +#' +#' ## Remove all peaks with an intensity below 5. +#' sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf)) +#' +#' intensity(sps_mod) +#' +#' ## In addition it is possible to pass a function to `filterIntensity()`: in +#' ## the example below we want to keep only peaks that have an intensity which +#' ## is larger than one third of the maximal peak intensity in that spectrum. +#' keep_peaks <- function(x, prop = 3) { +#' x > max(x, na.rm = TRUE) / prop +#' } +#' sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks) +#' intensity(sps_mod) +#' +#' ## We can also change the proportion by simply passing the `prop` parameter +#' ## to the function. 
To keep only peaks that have an intensity which is +#' ## larger than half of the maximum intensity: +#' sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks, prop = 2) +#' intensity(sps_mod) +#' +#' ## With the `scalePeaks()` function we can alternatively scale the +#' ## intensities of mass peaks per spectrum to relative intensities. This +#' ## is specifically useful for fragment (MS2) spectra. We below thus +#' ## scale the intensities per spectrum by the total sum of intensities +#' ## (such that the sum of all intensities per spectrum is 1). +#' ## Below we scale the intensities of all MS2 spectra in our data set. +#' sps_mod <- scalePeaks(sps_dda, msLevel = 2L) +#' +#' ## MS1 spectra were not affected +#' sps_mod |> +#' filterMsLevel(1L) |> +#' intensity() +#' +#' ## Intensities of MS2 spectra were scaled +#' sps_mod |> +#' filterMsLevel(2L) |> +#' intensity() +#' +#' ## Since data manipulation operations are by default not directly applied to +#' ## the data but only cached in the internal processing queue, it is also +#' ## possible to remove these data manipulations with the `reset()` function: +#' tmp <- reset(sps_mod) +#' tmp +#' lengths(sps_dda) |> head() +#' lengths(sps_mod) |> head() +#' lengths(tmp) |> head() +#' +#' ## Data manipulation operations cached in the processing queue can also be +#' ## applied to the mass peaks data with the `applyProcessing()` function, if +#' ## the `Spectra` uses a backend that supports that (i.e. allows replacing +#' ## the mass peaks data). 
Below we first change the backend to a
+#' ## `MsBackendMemory()` and then use the `applyProcessing()` to modify the
+#' ## mass peaks data
+#' sps_dda <- setBackend(sps_dda, MsBackendMemory())
+#' sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf))
+#' sps_mod <- applyProcessing(sps_mod)
+#' sps_mod
+#'
+#' ## While we can't *undo* this filtering operation now using the `reset()`
+#' ## function, accessing the data would now be faster, because the operation
+#' ## no longer needs to be applied to the original data before returning to
+#' ## the user.
+#'
+#'
+#' ## -------- FUNCTIONS RETURNING THE RESULT --------
+#'
+#' ## With the `spectrapply()` function it is possible to apply an
+#' ## arbitrary function to each spectrum in a Spectra.
+#' ## In the example below we calculate the mean intensity for each spectrum
+#' ## in a subset of the sciex_im data. Note that we can access all variables
+#' ## of each individual spectrum either with the `$` operator or the
+#' ## corresponding method.
+#' res <- spectrapply(sps_dda[1:20], FUN = function(x) mean(x$intensity[[1]]))
+#' head(res)
 #'
-#' @importMethodsFrom ProtGenerics filterMzRange
+#' ## As an alternative, applying a function `FUN` to a `Spectra` can be
+#' ## performed *chunk-wise*. The advantage of this is that only the data for
+#' ## one chunk at a time needs to be loaded into memory reducing the memory
+#' ## demand. This type of processing can be performed by specifying the size
+#' ## of the chunks (i.e. number of spectra per chunk) with the `chunkSize`
+#' ## parameter
+#' spectrapply(sps_dda[1:20], lengths, chunkSize = 5L)
 #'
-#' @export
-setMethod("filterMzRange", "Spectra",
-          function(object, mz = numeric(), msLevel. 
= uniqueMsLevels(object), - keep = TRUE) { - if (!.check_ms_level(object, msLevel.)) - return(object) - if (!length(mz)) mz <- c(-Inf, Inf) - else mz <- range(mz) - object <- addProcessing(object, .peaks_filter_mz_range, mz = mz, - msLevel = msLevel., keep = keep, - spectraVariables = "msLevel") - if (keep) keep_or_remove <- "select" - else keep_or_remove <- "remove" - object@processing <- .logging( - object@processing, "Filter: ", keep_or_remove, - " peaks with an m/z within [", mz[1L], ", ", mz[2L], "]") - object - }) - -#' @rdname Spectra +#' ## Precursor intensity estimation. Some manufacturers don't report the +#' ## precursor intensity for MS2 spectra: +#' sps_dda |> +#' filterMsLevel(2L) |> +#' precursorIntensity() #' -#' @importMethodsFrom ProtGenerics filterMzValues +#' ## This intensity can however be estimated from the previously measured +#' ## MS1 scan with the `estimatePrecursorIntensity()` function: +#' pi <- estimatePrecursorIntensity(sps_dda) #' -#' @export -setMethod("filterMzValues", "Spectra", - function(object, mz = numeric(), tolerance = 0, ppm = 20, - msLevel. 
= uniqueMsLevels(object), keep = TRUE) { - if (!.check_ms_level(object, msLevel.)) - return(object) - l <- length(mz) - if (length(tolerance) != 1) - stop("'tolerance' should be of length 1") - if (length(ppm) != 1) - stop("'ppm' should be of length 1") - if (is.unsorted(mz)) { - idx <- order(mz) - mz <- mz[idx] - if (length(tolerance) == l) - tolerance <- tolerance[idx] - if (length(ppm) == l) - ppm <- ppm[idx] - } - object <- addProcessing(object, .peaks_filter_mz_value, - mz = mz, tolerance = tolerance, - ppm = ppm, msLevel = msLevel., - keep = keep, spectraVariables = "msLevel") - if (length(mz) <= 3) - what <- paste0(format(mz, digits = 4), collapse = ", ") - else what <- "" - if (keep) - keep_or_remove <- "select" - else keep_or_remove <- "remove" - object@processing <- .logging( - object@processing, "Filter: ", keep_or_remove, - " peaks matching provided m/z values ", what) - object - }) - -#' @rdname Spectra -setMethod("filterPolarity", "Spectra", function(object, polarity = integer()) { - object@backend <- filterPolarity(object@backend, polarity = polarity) - object@processing <- .logging(object@processing, - "Filter: select spectra with polarity ", - paste0(polarity, collapse = " ")) - object -}) - -#' @rdname Spectra +#' ## This function returned the result as a `numeric` vector with one +#' ## value per spectrum: +#' pi #' -#' @export -setMethod("filterPrecursorMz", "Spectra", - function(object, mz = numeric()) { - .Deprecated( - msg = paste0("'filterPrecursorMz' is deprecated. 
Please use", - " 'filterPrecursorMzRange' instead.")) - object@backend <- filterPrecursorMzRange(object@backend, mz) - object@processing <- .logging( - object@processing, - "Filter: select spectra with a precursor m/z within [", - paste0(mz, collapse = ", "), "]") - object - }) - -#' @rdname Spectra -setMethod("filterPrecursorMzRange", "Spectra", - function(object, mz = numeric()) { - object@backend <- filterPrecursorMzRange(object@backend, mz) - object@processing <- .logging( - object@processing, - "Filter: select spectra with a precursor m/z within [", - paste0(mz, collapse = ", "), "]") - object - }) - -#' @rdname Spectra -setMethod("filterPrecursorMzValues", "Spectra", - function(object, mz = numeric(), ppm = 20, tolerance = 0) { - object@backend <- filterPrecursorMzValues( - object@backend, sort(mz), ppm = ppm, tolerance = tolerance) - object@processing <- .logging( - object@processing, - "Filter: select spectra with precursor m/z matching ", - paste0(mz, collapse = ", "), "") - object - }) - -#' @rdname Spectra -setMethod("filterPrecursorCharge", "Spectra", - function(object, z = integer()) { - z <- unique(z) - object@backend <- filterPrecursorCharge(object@backend, z) - object@processing <- .logging( - object@processing, - "Filter: select spectra with a precursor charge ", - paste0(z, collapse = ", ")) - object - }) - -#' @rdname Spectra -setMethod("filterPrecursorScan", "Spectra", - function(object, acquisitionNum = integer(), f = dataOrigin(object)) { - if (!all(f %in% unique(dataOrigin(object)))) - stop("'f' must be in dataOrigin().") - object@backend <- filterPrecursorScan(object@backend, - acquisitionNum, - f = dataOrigin(object)) - object@backend <- filterDataOrigin(object@backend, dataOrigin = f) - object@processing <- .logging( - object@processing, - "Filter: select parent/children scans for ", - paste0(acquisitionNum, collapse = " ")) - object - }) - -#' @rdname Spectra -setMethod("filterRt", "Spectra", - function(object, rt = numeric(), msLevel. 
= uniqueMsLevels(object)) { - if (!is.numeric(msLevel.)) - stop("Please provide a numeric MS level.") - if (length(rt) != 2L || !is.numeric(rt) || rt[1] >= rt[2]) - stop("Please provide a lower and upper numeric retention", - " time range.") - if (length(rt)) - rt <- range(rt) - else rt <- c(-Inf, Inf) - object@backend <- filterRt(object@backend, rt, msLevel.) - object@processing <- .logging( - object@processing, - "Filter: select retention time [", rt[1], "..", rt[2], - "] on MS level(s) ", paste0(msLevel., collapse = " ")) - object - }) +#' ## We can replace the precursor intensity values of the originating +#' ## object: +#' sps_dda$precursorIntensity <- pi +#' sps_dda |> +#' filterMsLevel(2L) |> +#' precursorIntensity() +#' +NULL -#' @rdname Spectra -setMethod("reset", "Spectra", function(object, ...) { - object@backend <- reset(object@backend) - object@processingQueue <- list() +#' @exportMethod addProcessing +#' +#' @importFrom ProtGenerics ProcessingStep +#' +#' @importMethodsFrom ProtGenerics addProcessing +#' +#' @importClassesFrom ProtGenerics ProcessingStep +#' +#' @importFrom methods .hasSlot +#' +#' @importFrom BiocGenerics updateObject +#' +#' @rdname addProcessing +setMethod("addProcessing", "Spectra", function(object, FUN, ..., + spectraVariables = character()) { + if (missing(FUN)) + return(object) + object@processingQueue <- c(object@processingQueue, + list(ProcessingStep(FUN, ARGS = list(...)))) if (!.hasSlot(object, "processingQueueVariables")) - object <- updateObject(object, check = FALSE) - object@processingQueueVariables <- character() - object@processing <- .logging(object@processing, "Reset object.") + object <- updateObject(object) + object@processingQueueVariables <- union(object@processingQueueVariables, + spectraVariables) + validObject(object) object }) -#' @rdname Spectra -setMethod("filterRanges", "Spectra", - function(object, spectraVariables = character(), ranges = numeric(), - match = c("all", "any")){ - object@backend <- 
filterRanges(object@backend, spectraVariables, - ranges, match) - object@processing <- .logging(object@processing, - "Filter: select spectra with a ", - spectraVariables, " within: [", - ranges[seq(ranges)%% 2 != 0], ", ", - ranges[seq(ranges)%% 2 == 0], "]" - ) - object - }) - -#' @rdname Spectra -setMethod("filterValues", "Spectra", - function(object, spectraVariables = character(), values = numeric(), - ppm = 0, tolerance = 0, match = c("all", "any")){ - object@backend <- filterValues(object@backend, spectraVariables, - values, ppm, tolerance, match) - object@processing <- .logging(object@processing, - "Filter: select spectra with a ", - spectraVariables, " similar to: ", - values) - object - }) - -#### --------------------------------------------------------------------------- -## -## DATA MANIPULATION METHODS -## -#### --------------------------------------------------------------------------- - -#' @rdname Spectra +#' @rdname addProcessing #' #' @importMethodsFrom ProtGenerics bin #' @@ -2588,60 +3338,82 @@ setMethod("bin", "Spectra", function(x, binSize = 1L, breaks = NULL, breaks <- seq(floor(mzr[1]), ceiling(mzr[2]), by = binSize) breaks <- MsCoreUtils:::.fix_breaks(breaks, mzr) } - mids <- (breaks[-length(breaks)] + breaks[-1L]) / 2 - x <- addProcessing(x, .peaks_bin, breaks = breaks, mids = mids, - agg_fun = FUN, msLevel = msLevel., zero.rm = zero.rm, - spectraVariables = "msLevel") - x@processing <- .logging(x@processing, - "Spectra of MS level(s) ", - paste0(msLevel., collapse = ", "), - " binned.") - x + mids <- (breaks[-length(breaks)] + breaks[-1L]) / 2 + x <- addProcessing(x, .peaks_bin, breaks = breaks, mids = mids, + agg_fun = FUN, msLevel = msLevel., zero.rm = zero.rm, + spectraVariables = "msLevel") + x@processing <- .logging(x@processing, + "Spectra of MS level(s) ", + paste0(msLevel., collapse = ", "), + " binned.") + x +}) + +#' @rdname addProcessing +#' +#' @exportMethod containsMz +setMethod("containsMz", "Spectra", function(object, mz = 
numeric(), + tolerance = 0, + ppm = 20, which = c("any", "all"), + BPPARAM = bpparam()) { + if (length(object)) { + cond_fun <- match.fun(match.arg(which)) + if (all(is.na(mz))) + return(rep(NA, length(object))) + mz <- unique(sort(mz)) + BPPARAM <- backendBpparam(object@backend, BPPARAM) + unlist(.peaksapply( + object, FUN = .peaks_contain_mz, mz = mz, tolerance = tolerance, + ppm = ppm, condFun = cond_fun, BPPARAM = BPPARAM), + use.names = FALSE + ) + } else logical() +}) + +#' @rdname addProcessing +#' +#' @exportMethod containsNeutralLoss +setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, + tolerance = 0, ppm = 20, + BPPARAM = bpparam()) { + BPPARAM <- backendBpparam(object@backend, BPPARAM) + ## TODO: FIX me to use chunk size. + if (is(BPPARAM, "SerialParam")) { + .has_mz_each(object, precursorMz(object) - neutralLoss, + tolerance = tolerance, ppm = ppm, parallel = BPPARAM) + } else { + sp <- SerialParam() + f <- as.factor(dataStorage(object)) + res <- .lapply(object, FUN = function(obj, n, tol, ppm, par) { + .has_mz_each(obj, precursorMz(obj) - n, tolerance = tol, + ppm = ppm, parallel = sp) + }, n = neutralLoss, tol = tolerance, ppm = ppm, par = sp, f = f, + BPPARAM = BPPARAM) + unsplit(res, f = f) + } }) -#' @rdname Spectra -#' -#' @exportMethod compareSpectra -#' -#' @importFrom MsCoreUtils ndotproduct +#' @rdname addProcessing #' -#' @importMethodsFrom ProtGenerics compareSpectra +#' @importFrom MsCoreUtils entropy nentropy #' -#' @exportMethod compareSpectra -setMethod("compareSpectra", signature(x = "Spectra", y = "Spectra"), - function(x, y, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, - FUN = ndotproduct, ..., SIMPLIFY = TRUE) { - mat <- .compare_spectra_chunk(x, y, MAPFUN = MAPFUN, - tolerance = tolerance, - ppm = ppm, FUN = FUN, ...) 
- if (SIMPLIFY && (length(x) == 1 || length(y) == 1)) - mat <- as.vector(mat) - mat - }) -#' @rdname Spectra -setMethod("compareSpectra", signature(x = "Spectra", y = "missing"), - function(x, y = NULL, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, - FUN = ndotproduct, ..., SIMPLIFY = TRUE) { - if (length(x) == 1) - return(compareSpectra(x, x, MAPFUN = MAPFUN, - tolerance = tolerance, - ppm = ppm, FUN = FUN, ..., - SIMPLIFY = SIMPLIFY)) - mat <- .compare_spectra_self(x, MAPFUN = MAPFUN, FUN = FUN, - tolerance = tolerance, ppm = ppm, - ...) - if (SIMPLIFY && length(x) == 1) - mat <- as.vector(mat) - mat - }) - -## estimateMzResolution - -## estimateNoise - -## normalize +#' @export +setMethod("entropy", "Spectra", function(object, normalized = TRUE) { + if (length(object)) { + if (normalized) entropy_fun <- nentropy + else entropy_fun <- entropy + unlist(.peaksapply( + object, FUN = function(pks, ...) entropy_fun(pks[, "intensity"])), + use.names = FALSE + ) + } else numeric() +}) +#' @rdname addProcessing +setMethod("entropy", "ANY", function(object, ...) { + MsCoreUtils::entropy(object) +}) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod pickPeaks setMethod("pickPeaks", "Spectra", @@ -2683,11 +3455,7 @@ setMethod("pickPeaks", "Spectra", object }) -## quantify - -## removeReporters - -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod replaceIntensitiesBelow setMethod("replaceIntensitiesBelow", "Spectra", @@ -2714,8 +3482,18 @@ setMethod("replaceIntensitiesBelow", "Spectra", object }) +#' @rdname addProcessing +setMethod("reset", "Spectra", function(object, ...) 
{
+    object@backend <- reset(object@backend)
+    object@processingQueue <- list()
+    if (!.hasSlot(object, "processingQueueVariables"))
+        object <- updateObject(object, check = FALSE)
+    object@processingQueueVariables <- character()
+    object@processing <- .logging(object@processing, "Reset object.")
+    object
+})
 
-#' @rdname Spectra
+#' @rdname addProcessing
 #'
 #' @importFrom ProtGenerics smooth
 #' @importFrom MsCoreUtils coefMA coefWMA coefSG
@@ -2746,93 +3524,264 @@ setMethod("smooth", "Spectra",
               x
           })
 
-#' @exportMethod addProcessing
+#' @rdname addProcessing
 #'
-#' @importFrom ProtGenerics ProcessingStep
+#' @importMethodsFrom ProtGenerics spectrapply
 #'
-#' @importMethodsFrom ProtGenerics addProcessing
+#' @exportMethod spectrapply
+setMethod("spectrapply", "Spectra", function(object, FUN, ...,
+                                             chunkSize = integer(),
+                                             f = factor(),
+                                             BPPARAM = SerialParam()) {
+    if (missing(FUN))
+        FUN <- identity
+    if (length(chunkSize))
+        return(chunkapply(object, FUN, ..., chunkSize = chunkSize))
+    if (!length(f))
+        f <- as.factor(seq_along(object))
+    .lapply(object, FUN = FUN, f = f, ...,
+            BPPARAM = backendBpparam(object@backend, BPPARAM))
+})
+
+#' @title Estimate Precursor Intensities
 #'
-#' @importClassesFrom ProtGenerics ProcessingStep
+#' @aliases estimatePrecursorIntensity
 #'
-#' @importFrom methods .hasSlot
+#' @description
 #'
-#' @importFrom BiocGenerics updateObject
+#' Some MS instrument manufacturers don't provide precursor intensities for
+#' fragment spectra. These can however be estimated, given that also MS1
+#' spectra are available. The `estimatePrecursorIntensity()` function defines the
+#' precursor intensities for MS2 spectra using the intensity of the matching
+#' MS1 peak from the closest MS1 spectrum (i.e. the last MS1 spectrum measured
+#' before the respective MS2 spectrum). 
With `method = "interpolation"` it is
+#' also possible to calculate the precursor intensity based on an interpolation
+#' of intensity values (and retention times) of the matching MS1 peaks from the
+#' previous and next MS1 spectrum. See below for an example.
 #'
-#' @rdname Spectra
-setMethod("addProcessing", "Spectra", function(object, FUN, ...,
-                                               spectraVariables = character()) {
-    if (missing(FUN))
-        return(object)
-    object@processingQueue <- c(object@processingQueue,
-                                list(ProcessingStep(FUN, ARGS = list(...))))
-    if (!.hasSlot(object, "processingQueueVariables"))
-        object <- updateObject(object)
-    object@processingQueueVariables <- union(object@processingQueueVariables,
-                                             spectraVariables)
-    validObject(object)
-    object
-})
-
-#' @rdname Spectra
+#' @param object `Spectra` with MS1 and MS2 spectra.
 #'
-#' @export
-coreSpectraVariables <- function() .SPECTRA_DATA_COLUMNS
-
-#' @rdname Spectra
-setMethod("uniqueMsLevels", "Spectra", function(object, ...) {
-    uniqueMsLevels(object@backend, ...)
-})
+#' @param ppm `numeric(1)` with the maximal allowed relative difference of m/z
+#'     values between the precursor m/z of a spectrum and the m/z of the
+#'     respective ion on the MS1 scan.
+#'
+#' @param tolerance `numeric(1)` with the maximal allowed difference of m/z
+#'     values between the precursor m/z of a spectrum and the m/z of the
+#'     respective ion on the MS1 scan.
+#'
+#' @param method `character(1)` defining whether the precursor intensity
+#'     should be estimated on the previous MS1 spectrum (`method = "previous"`,
+#'     the default) or based on an interpolation on the previous and next
+#'     MS1 spectrum (`method = "interpolation"`).
+#'
+#' @param msLevel. `integer(1)` the MS level for which precursor intensities
+#'     should be estimated. Defaults to `2L`.
+#'
+#' @param f `factor` (or vector to be coerced to `factor`) defining which
+#'     spectra belong to the same original data file (sample).
+#'     Defaults to `f = dataOrigin(object)`. 
+#'
+#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more
+#'     information. This is passed directly to the [backendInitialize()] method
+#'     of the [MsBackend-class].
+#'
+#' @author Johannes Rainer with feedback and suggestions from Corey Broeckling
+#'
+#' @importMethodsFrom ProtGenerics estimatePrecursorIntensity
+#'
+#' @exportMethod estimatePrecursorIntensity
+#'
+#' @rdname estimatePrecursorIntensity
+#'
+#' @examples
+#'
+#' ## Calculating the precursor intensity for MS2 spectra:
+#' ##
+#' ## Some MS instrument manufacturers don't report the precursor intensities
+#' ## for MS2 spectra. The `estimatePrecursorIntensity` function can be used
+#' ## in these cases to calculate the precursor intensity on MS1 data. Below
+#' ## we load an mzML file from a vendor providing precursor intensities and
+#' ## compare the estimated and reported precursor intensities.
+#' tmt <- Spectra(msdata::proteomics(full.names = TRUE)[5],
+#'     backend = MsBackendMzR())
+#' pmi <- estimatePrecursorIntensity(tmt)
+#' plot(pmi, precursorIntensity(tmt))
+#'
+#' ## We can also replace the original precursor intensity values with the
+#' ## newly calculated ones
+#' tmt$precursorIntensity <- pmi
setMethod(
+    "estimatePrecursorIntensity", "Spectra",
+    function(object, ppm = 20, tolerance = 0,
+             method = c("previous", "interpolation"),
+             msLevel. = 2L, f = dataOrigin(object), BPPARAM = bpparam()) {
+        if (is.factor(f))
+            f <- as.character(f)
+        f <- factor(f, levels = unique(f))
+        BPPARAM <- backendBpparam(object@backend, BPPARAM)
+        unlist(bplapply(split(object, f),
+                        FUN = .estimate_precursor_intensity, ppm = ppm,
+                        tolerance = tolerance, method = method,
+                        msLevel = msLevel., BPPARAM = BPPARAM),
+               use.names = FALSE)
+    })

-#' @rdname Spectra
-setMethod("backendBpparam", "Spectra", function(object, BPPARAM = bpparam()) {
-    backendBpparam(object@backend, BPPARAM)
-})
-#' @rdname hidden_aliases
-setMethod("combinePeaks", "list", function(object, ...) 
{ - .Deprecated("combinePeaksData", old = "combinePeaks", - msg = paste0("'combinePeaks' for lists of peak matrices is ", - "deprecated; please use 'combinePeaksData' ", - "instead.")) - combinePeaksData(object, ...) -}) +################################################################################ +## +## Spectra similarity calculations +## +################################################################################ -#' @rdname Spectra +#' @title Spectra similarity calculations #' -#' @exportMethod combinePeaks -setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, - intensityFun = base::mean, - mzFun = base::mean, - weighted = TRUE, - msLevel. = uniqueMsLevels(object), - ...) { - object <- addProcessing( - object, .peaks_combine, ppm = ppm, tolerance = tolerance, - intensityFun = intensityFun, mzFun = mzFun, weighted = weighted, - msLevel = force(msLevel.), spectraVariables = "msLevel") - object@processing <- .logging( - object@processing, "Combining peaks within each spectrum with ppm = ", - ppm, " and tolerance = ", tolerance, ".") - object -}) - - -#' @rdname Spectra +#' @name compareSpectra #' -#' @importFrom MsCoreUtils entropy nentropy +#' @aliases compareSpectra #' -#' @export -setMethod("entropy", "Spectra", function(object, normalized = TRUE) { - if (length(object)) { - if (normalized) entropy_fun <- nentropy - else entropy_fun <- entropy - unlist(.peaksapply( - object, FUN = function(pks, ...) entropy_fun(pks[, "intensity"])), - use.names = FALSE - ) - } else numeric() -}) -#' @rdname Spectra -setMethod("entropy", "ANY", function(object, ...) { - MsCoreUtils::entropy(object) +#' @description +#' +#' `compareSpectra()` compares each spectrum in `x` with each spectrum in `y` +#' using the function provided with `FUN` (defaults to [ndotproduct()]). If +#' `y` is missing, each spectrum in `x` is compared with each other spectrum +#' in `x`. 
+#' The matching/mapping of peaks between the compared spectra is done with the
+#' `MAPFUN` function. The default [joinPeaks()] matches peaks of both spectra
+#' and allows to keep all peaks from the first spectrum (`type = "left"`),
+#' from the second (`type = "right"`), from both (`type = "outer"`) and to
+#' keep only matching peaks (`type = "inner"`); see [joinPeaks()] for more
+#' information and examples. The `MAPFUN` function should have parameters
+#' `x`, `y`, `xPrecursorMz` and `yPrecursorMz` as these values are passed to
+#' the function.
+#'
+#' In addition to `joinPeaks()` also [joinPeaksGnps()] is supported for
+#' GNPS-like similarity score calculations. Note that `joinPeaksGnps()` should
+#' only be used in combination with `FUN = MsCoreUtils::gnps`
+#' (see [joinPeaksGnps()] for more information and details). Use
+#' `MAPFUN = joinPeaksNone` to disable internal peak matching/mapping if a
+#' similarity scoring function is used that performs the matching internally.
+#'
+#' `FUN` is supposed to be a function to compare intensities of (matched)
+#' peaks of the two spectra that are compared. The function needs to take two
+#' matrices with columns `"mz"` and `"intensity"` as input and is supposed
+#' to return a single numeric as result. In addition to the two peak matrices
+#' the spectra's precursor m/z values are passed to the function as parameters
+#' `xPrecursorMz` (precursor m/z of the `x` peak matrix) and `yPrecursorMz`
+#' (precursor m/z of the `y` peak matrix). Additional parameters to functions
+#' `FUN` and `MAPFUN` can be passed with `...`. Parameters `ppm` and
+#' `tolerance` are passed to both `MAPFUN` and `FUN`.
+#' The function returns a `matrix` with the results of `FUN` for each
+#' comparison, number of rows equal to `length(x)` and number of columns
+#' equal to `length(y)` (i.e. element in row 2 and column 3 is the result from
+#' the comparison of `x[2]` with `y[3]`). 
If `SIMPLIFY = TRUE` the `matrix` +#' is *simplified* to a `numeric` if length of `x` or `y` is one. See also +#' the vignette for additional examples, such as using spectral entropy +#' similarity in the scoring. +#' +#' @param FUN function to compare intensities of peaks between two spectra. +#' Defaults to [ndotproduct()]. +#' +#' @param MAPFUN For `compareSpectra()`: function to map/match peaks between +#' the two compared spectra. See [joinPeaks()] for more information and +#' possible functions. Defaults to [joinPeaks()]. +#' +#' @param ppm `numeric(1)` defining a relative, m/z-dependent, maximal +#' accepted difference between m/z values for peaks to be matched. This +#' parameter is directly passed to `MAPFUN`. +#' +#' @param tolerance `numeric(1)` allowing to define a constant maximal +#' accepted difference between m/z values for peaks to be matched. This +#' parameter is directly passed to `MAPFUN`. +#' +#' @param x A `Spectra` object. +#' +#' @param y A `Spectra` object. +#' +#' @param SIMPLIFY `logical(1)` defining whether the result matrix should be +#' *simplified* to a `numeric` if possible (i.e. if either `x` or `y` is +#' of length 1). +#' +#' @param ... Additional arguments passed to the internal functions. +#' +#' @importFrom MsCoreUtils ndotproduct +#' +#' @importMethodsFrom ProtGenerics compareSpectra +#' +#' @exportMethod compareSpectra +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' +#' @examples +#' +#' ## Load a `Spectra` object with LC-MS/MS data. +#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", +#' package = "msdata") +#' sps_dda <- Spectra(fl) +#' sps_dda +#' +#' ## Restrict to MS2 (fragment) spectra: +#' sps_ms2 <- filterMsLevel(sps_dda, msLevel = 2L) +#' +#' ## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using +#' ## the normalized dotproduct method. 
+#' res <- compareSpectra(sps_ms2[2:3], sps_ms2[10:20])
+#' ## first row contains comparisons of spectrum 2 with spectra 10 to 20 and
+#' ## the second row comparisons of spectrum 3 with spectra 10 to 20
+#' res
+#'
+#' ## We next calculate the pairwise similarity for the first 10 spectra
+#' compareSpectra(sps_ms2[1:10])
+#'
+#' ## Use compareSpectra to determine the number of common (matching) peaks
+#' ## with a ppm of 10:
+#' ## type = "inner" uses an *inner join* to match peaks, i.e. keeps only
+#' ## peaks that can be mapped between both spectra. The provided FUN returns
+#' ## simply the number of matching peaks.
+#' compareSpectra(sps_ms2[2:3], sps_ms2[10:20], ppm = 10, type = "inner",
+#'     FUN = function(x, y, ...) nrow(x))
+#'
+#' ## We repeat this calculation between all pairwise combinations
+#' ## of the first 20 spectra
+#' compareSpectra(sps_ms2[1:20], ppm = 10, type = "inner",
+#'     FUN = function(x, y, ...) nrow(x))
+NULL
+
+#' @rdname compareSpectra
+setMethod("compareSpectra", signature(x = "Spectra", y = "Spectra"),
+          function(x, y, MAPFUN = joinPeaks, tolerance = 0, ppm = 20,
+                   FUN = ndotproduct, ..., SIMPLIFY = TRUE) {
+              mat <- .compare_spectra_chunk(x, y, MAPFUN = MAPFUN,
+                                            tolerance = tolerance,
+                                            ppm = ppm, FUN = FUN, ...)
+              if (SIMPLIFY && (length(x) == 1 || length(y) == 1))
+                  mat <- as.vector(mat)
+              mat
+          })
+#' @rdname compareSpectra
+setMethod("compareSpectra", signature(x = "Spectra", y = "missing"),
+          function(x, y = NULL, MAPFUN = joinPeaks, tolerance = 0, ppm = 20,
+                   FUN = ndotproduct, ..., SIMPLIFY = TRUE) {
+              if (length(x) == 1)
+                  return(compareSpectra(x, x, MAPFUN = MAPFUN,
+                                        tolerance = tolerance,
+                                        ppm = ppm, FUN = FUN, ...,
+                                        SIMPLIFY = SIMPLIFY))
+              mat <- .compare_spectra_self(x, MAPFUN = MAPFUN, FUN = FUN,
+                                           tolerance = tolerance, ppm = ppm,
+                                           ...) 
+ if (SIMPLIFY && length(x) == 1) + mat <- as.vector(mat) + mat + }) + + +################################################################################ +## +## methods with documentation in Spectra-functions.R +## +################################################################################ + +#' @rdname processingChunkSize +setMethod("backendBpparam", "Spectra", function(object, BPPARAM = bpparam()) { + backendBpparam(object@backend, BPPARAM) }) diff --git a/R/countIdentifications.R b/R/countIdentifications.R index b7ddb687..2f3e8c15 100644 --- a/R/countIdentifications.R +++ b/R/countIdentifications.R @@ -40,6 +40,10 @@ #' spectra variable `countIdentifications` with the number of #' identification for each scan. #' +#' @seealso +#' +#' [addProcessing()] for other data analysis functions. +#' #' @author Laurent Gatto #' #' @export diff --git a/R/peaks-functions.R b/R/peaks-functions.R index 7419dd44..dc19e353 100644 --- a/R/peaks-functions.R +++ b/R/peaks-functions.R @@ -87,7 +87,7 @@ NULL msLevel = spectrumMsLevel, ...) { if (!spectrumMsLevel %in% msLevel || !length(x)) return(x) - x[which(between(x[, "intensity"], intensity)), , drop = FALSE] + x[which(MsCoreUtils::between(x[, "intensity"], intensity)), , drop = FALSE] } #' @description @@ -146,8 +146,9 @@ NULL keep = TRUE, ...) 
{ if (!spectrumMsLevel %in% msLevel || !length(x)) return(x) - no_match <- is.na(closest(x[, "mz"], mz, tolerance = tolerance, ppm = ppm, - duplicates = "keep", .check = FALSE)) + no_match <- is.na(MsCoreUtils::closest(x[, "mz"], mz, tolerance = tolerance, + ppm = ppm, duplicates = "keep", + .check = FALSE)) if (keep) x[!no_match, , drop = FALSE] else x[no_match, , drop = FALSE] } @@ -170,8 +171,8 @@ NULL if (!spectrumMsLevel %in% msLevel || !length(x)) return(x) if (keep) - x[between(x[, "mz"], mz), , drop = FALSE] - else x[!between(x[, "mz"], mz), , drop = FALSE] + x[MsCoreUtils::between(x[, "mz"], mz), , drop = FALSE] + else x[!MsCoreUtils::between(x[, "mz"], mz), , drop = FALSE] } #' @description @@ -307,7 +308,13 @@ NULL #' #' @author Johannes Rainer, Michael Witting #' -#' @seealso [gnps()] +#' @seealso +#' +#' - [compareSpectra()] for the function to calculate similarities between +#' spectra. +#' +#' - [gnps()] in the *MsCoreUtils* package for more information on the GNPS +#' similarity score. #' #' @importFrom MsCoreUtils join ppm #' @@ -415,14 +422,14 @@ joinPeaksNone <- function(x, y, ...) { return(x) } - n <- noise(x[, 1L], x[, 2L], method = method, ...) + n <- MsCoreUtils::noise(x[, 1L], x[, 2L], method = method, ...) - l <- localMaxima(x[, 2L], hws = halfWindowSize) + l <- MsCoreUtils::localMaxima(x[, 2L], hws = halfWindowSize) p <- which(l & x[, 2L] > (snr * n)) if (k > 0L) { - cbind(mz = refineCentroids(x = x[, 1L], y = x[, 2L], p = p, + cbind(mz = MsCoreUtils::refineCentroids(x = x[, 1L], y = x[, 2L], p = p, k = k, threshold = threshold, descending = descending), intensity = x[p, 2L]) @@ -552,9 +559,10 @@ joinPeaksNone <- function(x, y, ...) { .peaks_deisotope <- function(x, substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"), tolerance = 0, ppm = 10, charge = 1, ...) 
{ - iso_grps <- isotopologues(x, substDefinition = substDefinition, - tolerance = tolerance, ppm = ppm, - charge = charge) + iso_grps <- MetaboCoreUtils::isotopologues( + x, substDefinition = substDefinition, + tolerance = tolerance, ppm = ppm, + charge = charge) if (length(iso_grps)) { rem <- unique(unlist(lapply(iso_grps, `[`, -1), use.names = FALSE)) x[-rem, , drop = FALSE] @@ -614,7 +622,7 @@ joinPeaksNone <- function(x, y, ...) { msLevel = spectrumMsLevel, ...) { if (!spectrumMsLevel %in% msLevel || !length(x)) return(x) - grps <- group(x[, "mz"], tolerance = tolerance, ppm = ppm) + grps <- MsCoreUtils::group(x[, "mz"], tolerance = tolerance, ppm = ppm) lg <- length(grps) if (grps[lg] == lg) return(x) @@ -649,7 +657,7 @@ joinPeaksNone <- function(x, y, ...) { msLevel = spectrumMsLevel, ...) { if (!spectrumMsLevel %in% msLevel || !nrow(x)) return(x) - keep <- is.na(closest(x[, "mz"], precursorMz, ppm = ppm, + keep <- is.na(MsCoreUtils::closest(x[, "mz"], precursorMz, ppm = ppm, tolerance = tolerance, duplicates = "keep", .check = FALSE)) x[keep, , drop = FALSE] @@ -670,3 +678,72 @@ joinPeaksNone <- function(x, y, ...) { pmz <- precursorMz - tolerance - ppm(precursorMz, ppm = ppm) x[x[, "mz"] < pmz, , drop = FALSE] } + +#' filter a peak matrix `x` by (arbitrary) numeric ranges for spectra and/or +#' peaks variables. ranges for spectra and peaks variables are combined using +#' a logical AND, rows in the provided range matrices with a logical OR. +#' +#' Used by `filterPeaksRanges()` function for `Spectra`. +#' +#' @param svars `character` with the spectra variables for which filter ranges +#' where provided. +#' +#' @param pvars `character` with the peaks variables for which filter ranges +#' where provided. +#' +#' @param ranges `list` with `numeric` two-column matrices with the +#' user-provided ranges. The number of rows of all matrices is expected +#' to match. +#' +#' @param spectrumMsLevel `integer(1)` with the MS level of the peak matrix' +#' spectrum. 
+#' +#' @param keep `logical(1)` whether mass peaks that match the filters should be +#' kept or removed. +#' +#' @param ... values for all spectra variables defined in `svars` are expected +#' to be passed through `...` as `name = value` pairs. +#' +#' @author Johannes Rainer +#' +#' @noRd +.peaks_filter_ranges <- function(x, svars = character(), + pvars = character(), + ranges, spectrumMsLevel, + keep = TRUE, ...) { + svalue <- list(..., msLevel = spectrumMsLevel) + nx <- nrow(x) + sel <- rep(FALSE, nx) + for (i in seq_len(nrow(ranges[[1L]]))) { + ## check ranges for spectra variables + svars_ok <- vapply(svars, function(z) + MsCoreUtils::between(svalue[[z]], ranges[[z]][i, ]), TRUE, + USE.NAMES = FALSE) + if (!anyNA(svars_ok) && all(svars_ok)) { + if (length(pvars)) { + ## check ranges for peaks variables + tmp <- rowSums(do.call(cbind, lapply(pvars, function(z) { + MsCoreUtils::between(x[, z], ranges[[z]][i, ]) + }))) == length(pvars) + tmp[is.na(tmp)] <- FALSE + sel <- sel | tmp + } else { + ## No need to check further, because we have a match + if (keep) return(x) + else return(x[logical(), , drop = FALSE]) + } + } + } + if (keep) x[sel, , drop = FALSE] + else x[!sel, , drop = FALSE] +} + +#' Check for presence of peaks defined by their m/z value. Note that this +#' function does **not** return a peak matrix, but only a logical of length 1! +#' +#' @return `logical(1)` +#' @noRd +.peaks_contain_mz <- function(x, mz = numeric(), tolerance = 0, ppm = 20, + condFun = any, ...) { + condFun(common(mz, x[, "mz"], tolerance = tolerance, ppm = ppm)) +} diff --git a/README.md b/README.md index be839639..3df3e6d7 100644 --- a/README.md +++ b/README.md @@ -19,58 +19,81 @@ footprint. A (possibly incomplete) list of available backends (along with a link to the R package providing it) is shown below: -- `MsBackendMemory` (package: *Spectra*): *default* backend which keeps all data - in memory. Optimized for fast processing. 
+- `MsBackendCompDb` (package + [*CompoundDb*](https://github.com/rformassspectrometry/CompoundDb): provides + access to spectra data (spectra and peaks variables) from a *CompDb* + database. Has a small memory footprint because all data (except precursor m/z + values) are retrieved on-the-fly from the database. + - `MsBackendDataFrame` (package: *Spectra*): alternative to the `MsBackendMemory` also keeping all data in memory, but supporting `S4` objects as spectra variables because the data is stored internally in a `DataFrame`. -- `MsBackendMzR` (package: *Spectra*): by using the `mzR` package it supports - import of MS data from mzML, mzXML and CDF files. This backend keeps only - general spectra variables in memory and retrieves the peaks data (m/z and - intensity values) on-the-fly from the original data files. The backend has - thus a smaller memory footprint compared to in-memory backends. + - `MsBackendHdf5Peaks` (package: *Spectra*): on-disk backend similar to `MsBackendMzR`, but the peaks data is stored in HDF5 files (general spectra variables are kept in memory). -- `MsBackendMgf` (package - [*MsBackendMgf*](https://github.com/rformassspectrometry/MsBackendMgf): allows - to import/export data in mascot generic format (MGF). Extends the - `MsBackendDataFrame` and keeps thus all data, after import, in memory. -- `MsBackendMsp` (package - [*MsbackendMsp*](https://github.com/rformassspectrometry/MsBackendMsp): allows - to import/export data in NIST MSP format. Extends the `MsBackendDataFrame` and - keeps thus all data, after import, in memory. + +- `MsBackendHmdbXml` (package + [*MsbackendHmdb*](https://github.com/rformassspectrometry/MsBackendHmdb)): + allows import of MS data from xml files of the Human Metabolome Database + (HMDB). Extends the `MsBackendDataFrame` and keeps thus all data, after + import, in memory. 
+ - `MsBackendMassbank` (package [*MsBackendMassbank*](https://github.com/rformassspectrometry/MsBackendMassbank)): allows to import/export data in MassBank text file format. Extends the `MsBackendDataFrame` and keeps thus all data, after import, in memory. + - `MsBackendMassbankSql` (package [*MsBackendMassbank*](https://github.com/rformassspectrometry/MsBackendMassbank)): allows to directly connect to a MassBank SQL database to retrieve all MS data and variables. Has a minimal memory footprint because all data is retrieved on-the-fly from the SQL database. + +- `MsBackendMemory` (package: *Spectra*): *default* backend which keeps all data + in memory. Optimized for fast processing. + +- `MsBackendMetaboLights` (package + [*MsBackendMetaboLights*](https://github.com/rformassspectrometry/MsBackendMetaboLights)): + retrieves and caches MS data files from MetaboLights. + +- `MsBackendMgf` (package + [*MsBackendMgf*](https://github.com/rformassspectrometry/MsBackendMgf)): allows + to import/export data in mascot generic format (MGF). Extends the + `MsBackendDataFrame` and keeps thus all data, after import, in memory. + +- `MsBackendMsp` (package + [*MsbackendMsp*](https://github.com/rformassspectrometry/MsBackendMsp)): allows + to import/export data in NIST MSP format. Extends the `MsBackendDataFrame` and + keeps thus all data, after import, in memory. + +- `MsBackendMzR` (package: *Spectra*): by using the `mzR` package it supports + import of MS data from mzML, mzXML and CDF files. This backend keeps only + general spectra variables in memory and retrieves the peaks data (m/z and + intensity values) on-the-fly from the original data files. The backend has + thus a smaller memory footprint compared to in-memory backends. + +- `MsBackendOfflineSql` (package + [*MsBackendSql*](https://github.com/rformassspectrometry/MsBackendSql)): + stores all MS data in a SQL database and has thus a minimal memory footprint. 
+ Does, in contrast to `MsBackendSql`, not keep an active SQL database + connection and can thus support parallel processing. + - `MsBackendRawFileReader` (package [*MsBackendRawFileReader*](https://github.com/fgcz/MsBackendRawFileReader)): implements a backend for reading MS data from Thermo Fisher Scientific's raw data files using the manufacturer's NewRawFileReader .Net libraries. The package generalizes the functionality introduced by the `rawrr` package. -- `MsBackendHmdbXml` (package - [*MsbackendHmdb*](https://github.com/rformassspectrometry/MsBackendHmdb)): - allows import of MS data from xml files of the Human Metabolome Database - (HMDB). Extends the `MsBackendDataFrame` and keeps thus all data, after - import, in memory. + - `MsBackendSql` (package [*MsBackendSql*](https://github.com/rformassspectrometry/MsBackendSql)): stores all MS data in a SQL database and has thus a minimal memory footprint. -- `MsBackendCompDb` (package - [*CompoundDb*](https://github.com/rformassspectrometry/CompoundDb): provides - access to spectra data (spectra and peaks variables) from a *CompDb* - database. Has a small memory footprint because all data (except precursor m/z - values) are retrieved on-the-fly from the database. + - `MsBackendTimsTof` (package [*MsBackendTimsTof*](https://github.com/rformassspectrometry/MsBackendTimsTof): allows import of data from Bruker TimsTOF raw data files (using the `opentimsr` R package). + - `MsBackendWeizMass` (package [*MsBackendWeizMass*](https://github.com/rformassspectrometry/MsBackendWeizMass): allows to access MS data from WeizMass MS/MS spectral databases. @@ -95,4 +118,6 @@ BiocManager::install("Spectra") Contributions are highly welcome and should follow the [contribution guidelines](https://rformassspectrometry.github.io/RforMassSpectrometry/articles/RforMassSpectrometry.html#contributions). 
Also, please check the coding style guidelines in the [RforMassSpectrometry -vignette](https://rformassspectrometry.github.io/RforMassSpectrometry/articles/RforMassSpectrometry.html). +vignette](https://rformassspectrometry.github.io/RforMassSpectrometry/articles/RforMassSpectrometry.html) +and importantly, follow our [code of +conduct](https://rformassspectrometry.github.io/RforMassSpectrometry/articles/RforMassSpectrometry.html#code-of-conduct). diff --git a/inst/test_backends/test_MsBackend/test_spectra_subsetting.R b/inst/test_backends/test_MsBackend/test_spectra_subsetting.R index 84a69f60..98788c2d 100644 --- a/inst/test_backends/test_MsBackend/test_spectra_subsetting.R +++ b/inst/test_backends/test_MsBackend/test_spectra_subsetting.R @@ -49,6 +49,24 @@ test_that("[", { res <- be[integer()] expect_s4_class(res, class(be)[1L]) expect_true(length(res) == 0L) + + ## logical + l <- rep(FALSE, length(be)) + l[sample(seq_along(l), floor(length(l) / 2))] <- TRUE + res <- be[l] + expect_true(validObject(res)) + expect_true(length(res) == sum(l)) + expect_equal(res, be[which(l)]) +}) + +#' extractByIndex. 
Uses [ if not implemented +test_that("extractByIndex", { + i <- sample(seq_along(be), floor(length(be) / 2)) + res <- extractByIndex(be, i) + expect_true(validObject(res)) + expect_equal(length(res), length(i)) + expect_equal(msLevel(res), msLevel(be)[i]) + expect_equal(rtime(res), rtime(be)[i]) }) test_that("cbind2 works", { @@ -87,8 +105,9 @@ test_that("selectSpectraVariables", { if (!isReadOnly(be) || inherits(be, "MsBackendCached") || inherits(be, "MsBackendDataFrame")) { tmp <- be - res <- selectSpectraVariables(tmp, c("mz", "intensity", - "dataStorage", "scanIndex")) + res <- selectSpectraVariables( + tmp, union(c("mz", "intensity", "dataStorage", "scanIndex"), + backendRequiredSpectraVariables(be))) expect_true(all(names(coreSpectraVariables()) %in% spectraVariables(res))) expect_true(all(is.na(res$msLevel))) diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index dc410ae4..7b22f696 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -17,6 +17,14 @@ \alias{backendInitialize} \alias{backendParallelFactor,MsBackendMzR-method} \alias{backendParallelFactor,MsBackendHdf5Peaks-method} +\alias{dataStorageBasePath} +\alias{dataStorageBasePath,MsBackendMzR-method} +\alias{dataStorageBasePath<-} +\alias{dataStorageBasePath<-,MsBackendMzR-method} +\alias{extractByIndex} +\alias{msLeveL<-,MsBackend-method} +\alias{backendRequiredSpectraVariables} +\alias{backendRequiredSpectraVariables,MsBackend-method} \alias{backendBpparam,MsBackend-method} \alias{backendInitialize,MsBackend-method} \alias{backendMerge,list-method} @@ -36,6 +44,8 @@ \alias{dataStorage,MsBackend-method} \alias{dataStorage<-,MsBackend-method} \alias{dropNaSpectraVariables,MsBackend-method} +\alias{extractByIndex,MsBackend,ANY-method} +\alias{extractByIndex,MsBackend,missing-method} \alias{filterAcquisitionNum,MsBackend-method} \alias{filterDataOrigin,MsBackend-method} \alias{filterDataStorage,MsBackend-method} @@ -65,6 +75,7 @@ \alias{isReadOnly,MsBackend-method} \alias{length,MsBackend-method} 
\alias{msLevel,MsBackend-method} +\alias{msLevel<-,MsBackend-method} \alias{mz,MsBackend-method} \alias{mz<-,MsBackend-method} \alias{lengths,MsBackend-method} @@ -74,6 +85,7 @@ \alias{precursorCharge,MsBackend-method} \alias{precursorIntensity,MsBackend-method} \alias{precursorMz,MsBackend-method} +\alias{precursorMz<-,MsBackend-method} \alias{peaksData<-,MsBackend-method} \alias{reset,MsBackend-method} \alias{rtime,MsBackend-method} @@ -94,6 +106,8 @@ \alias{$<-,MsBackend-method} \alias{[[,MsBackend-method} \alias{[[<-,MsBackend-method} +\alias{dataStorageBasePath,MsBackend-method} +\alias{dataStorageBasePath<-,MsBackend-method} \alias{MsBackendDataFrame} \alias{backendInitialize,MsBackendDataFrame-method} \alias{MsBackendHdf5Peaks} @@ -140,6 +154,10 @@ \S4method{dropNaSpectraVariables}{MsBackend}(object) +\S4method{extractByIndex}{MsBackend,ANY}(object, i) + +\S4method{extractByIndex}{MsBackend,missing}(object, i) + \S4method{filterAcquisitionNum}{MsBackend}(object, n, file, ...) \S4method{filterDataOrigin}{MsBackend}(object, dataOrigin = character()) @@ -210,6 +228,8 @@ \S4method{msLevel}{MsBackend}(object) +\S4method{msLevel}{MsBackend}(object) <- value + \S4method{mz}{MsBackend}(object) \S4method{mz}{MsBackend}(object) <- value @@ -228,6 +248,8 @@ \S4method{precursorMz}{MsBackend}(object) +\S4method{precursorMz}{MsBackend}(object, ...) <- value + \S4method{peaksData}{MsBackend}(object) <- value \S4method{reset}{MsBackend}(object) @@ -272,6 +294,10 @@ \S4method{uniqueMsLevels}{MsBackend}(object, ...) +\S4method{dataStorageBasePath}{MsBackend}(object) + +\S4method{dataStorageBasePath}{MsBackend}(object) <- value + MsBackendDataFrame() \S4method{backendInitialize}{MsBackendDataFrame}(object, data, peaksVariables = c("mz", "intensity"), ...) @@ -311,6 +337,8 @@ length as the number of spectra in the backend.} \item{value}{replacement value for \verb{<-} methods. 
See individual method description or expected data type.} +\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the object.} + \item{n}{for \code{filterAcquisitionNum()}: \code{integer} with the acquisition numbers to filter for.} @@ -404,8 +432,6 @@ reported total ion current should be reported, or whether the total ion current should be (re)calculated on the actual data (\code{initial = FALSE}).} -\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the object.} - \item{j}{For \code{[}: not supported.} \item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return @@ -514,7 +540,9 @@ detailed description and examples): allowed. Parameter \code{i} should support \code{integer} indices and \code{logical} and should throw an error if \code{i} is out of bounds. The \code{MsCoreUtils::i2index} could be used to check the input \code{i}. -For \code{i = integer()} an empty backend should be returned. +For \code{i = integer()} an empty backend should be returned. Implementation +of this method is optional, as the default calls the \code{extractByIndex()} +method (which has to be implemented as the main subsetting method). \item \code{$}, \verb{$<-}: access or set/add a single spectrum variable (column) in the backend. Using a \code{value} of \code{NULL} should allow deleting the specified spectra variable. An error should be thrown if the spectra variable is not @@ -560,12 +588,20 @@ The default implementation returns a factor of length 0 (\code{factor()}) providing thus no default splitting. \code{backendParallelFactor()} for \code{MsBackendMzR} on the other hand returns \code{factor(dataStorage(object))} hence suggesting to split the object by data file. +\item \code{backendRequiredSpectraVariables()}: returns a \code{character} with spectra +variable names that are mandatory for a specific backend. The default +returns an empty \code{character()}. 
The implementation for \code{MsBackendMzR} +returns \code{c("dataStorage", "scanIndex")} as these two spectra variables +are required to load the MS data on-the-fly. This method needs only to +be implemented if a backend requires specific variables to be defined. \item \code{dataOrigin()}: gets a \code{character} of length equal to the number of spectra in \code{object} with the \emph{data origin} of each spectrum. This could e.g. be the mzML file from which the data was read. \item \code{dataStorage()}: gets a \code{character} of length equal to the number of spectra in \code{object} with the data storage of each spectrum. Note that missing values (\code{NA_character_}) are not supported for \code{dataStorage}. +\item \code{dataStorageBasePath()}, \verb{dataStorageBasePath<-: gets or sets the common *base* path of the directory containing all data files. If supported, the function is expected to return (or accept) a }character\verb{of length 1. Most backends (such as for example the}MsBackendMemory\verb{will not support this function and}dataStorageBasePath()\verb{will return}NA_character_\verb{. For }MsBackendMzR\verb{, this function allows to get or change the path to the directory containing the original data files, which is required if e.g. a serialized }MsBackendMzR` instance gets copied to another computer or +file system. \item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the object's \code{spectraData} that contain only missing values (\code{NA}). Note that while columns with only \code{NA}s are removed, a \code{spectraData()} call after @@ -599,6 +635,16 @@ queue) are applied prior to export - this would not be possible with only a for the \code{MsBackendMzR} backend that supports export of the data in \emph{mzML} or \emph{mzXML} format. See the documentation for the \code{MsBackendMzR} class below for more information. 
+\item \code{extractByIndex()}: function to subset a backend to selected elements +defined by the provided index. Similar to \code{[}, this method should allow +extracting (or to subset) the data in any order. In contrast to \code{[}, +however, \code{i} is expected to be an \code{integer} (while \code{[} should also +support \code{logical} and eventually \code{character}). While being apparently +redundant to \code{[}, this methods avoids package namespace errors/problems +that can result in implementations of \code{[} being not found by R (which +can happen sometimes in parallel processing using the \code{\link[=SnowParam]{SnowParam()}}). This +method is used internally by \code{Spectra} to extract/subset its backend. +Implementation of this method is mandatory. \item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or \code{dataStorage} is also provided, \code{object} is subsetted to the spectra with @@ -719,6 +765,7 @@ number of spectra). For empty spectra, \code{0} is returned. \item \code{msLevel()}: gets the spectra's MS level. Returns an \code{integer} vector (of length equal to the number of spectra) with the MS level for each spectrum (or \code{NA_integer_} if not available). +\item \verb{msLevel<-}: replaces the spectra's MS level. \item \code{mz()}: gets the mass-to-charge ratios (m/z) from the spectra. Returns a \code{\link[=NumericList]{NumericList()}} or length equal to the number of spectra, each element a \code{numeric} vector with the m/z values of @@ -937,7 +984,7 @@ This backend provides an \code{export()} method to export data from a \code{Spec The parameters are: \itemize{ \item \code{object}: an instance of the \code{MsBackendMzR} class. -\item \code{x}: the \linkS4class{Spectra} object to be exported. +\item \code{x}: the \link{Spectra} object to be exported. \item \code{file}: \code{character} with the (full) output file name(s). 
Should be of length 1 or equal \code{length(x)}. If a single file is specified, all spectra are exported to that file. Alternatively it is possible to specify @@ -951,7 +998,7 @@ backend and if \code{dataOrigin(x)} contains the original MS data file names. \item \code{BPPARAM}: parallel processing settings. } -See examples in \linkS4class{Spectra} or the vignette for more details and +See examples in \link{Spectra} or the vignette for more details and examples. The \code{MsBackendMzR} ignores parameter \code{columns} of the \code{peaksData()} @@ -1089,5 +1136,5 @@ be$peak_ann <- NULL peaksVariables(be) } \author{ -Johannes Rainer, Sebastian Gibb, Laurent Gatto +Johannes Rainer, Sebastian Gibb, Laurent Gatto, Philippine Louail } diff --git a/man/MsBackendCached.Rd b/man/MsBackendCached.Rd index e65e41e9..ae8c6687 100644 --- a/man/MsBackendCached.Rd +++ b/man/MsBackendCached.Rd @@ -5,6 +5,7 @@ \alias{MsBackendCached-class} \alias{backendInitialize,MsBackendCached-method} \alias{dataStorage,MsBackendCached-method} +\alias{extractByIndex,MsBackendCached,ANY-method} \alias{length,MsBackendCached-method} \alias{spectraVariables,MsBackendCached-method} \alias{spectraData,MsBackendCached-method} @@ -57,6 +58,8 @@ MsBackendCached() \S4method{dataStorage}{MsBackendCached}(object) +\S4method{extractByIndex}{MsBackendCached,ANY}(object, i) + \S4method{length}{MsBackendCached}(x) \S4method{spectraVariables}{MsBackendCached}(object) @@ -150,6 +153,8 @@ variables to keep.} \item{...}{ignored} +\item{i}{For \code{[}: \code{integer} with the indices to subset the object.} + \item{x}{A \code{MsBackendCached} object.} \item{columns}{For \code{spectraData()}: \code{character} with the names of the spectra @@ -158,8 +163,6 @@ variables to retrieve.} \item{value}{replacement value for \verb{<-} methods. 
See individual method description or expected data type.} -\item{i}{For \code{[}: \code{integer} with the indices to subset the object.} - \item{j}{For \code{[}: ignored.} \item{drop}{For \code{[}: not considered.} diff --git a/man/Spectra.Rd b/man/Spectra.Rd index 07b4be7d..6f97d33a 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -1,30 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Spectra-functions.R, R/Spectra.R -\name{applyProcessing} -\alias{applyProcessing} -\alias{concatenateSpectra} -\alias{combineSpectra} -\alias{joinSpectraData} -\alias{processingLog} -\alias{deisotopeSpectra} -\alias{reduceSpectra} -\alias{filterPrecursorMaxIntensity} -\alias{filterPrecursorIsotopes} -\alias{scalePeaks} -\alias{filterPrecursorPeaks} +% Please edit documentation in R/Spectra.R +\name{Spectra} \alias{Spectra} \alias{Spectra-class} -\alias{[,Spectra-method} -\alias{uniqueMsLevels} -\alias{uniqueMsLevels,Spectra-method} -\alias{combinePeaks} +\alias{setBackend} +\alias{export} \alias{Spectra,missing-method} \alias{Spectra,MsBackend-method} \alias{Spectra,character-method} \alias{Spectra,ANY-method} \alias{setBackend,Spectra,MsBackend-method} -\alias{c,Spectra-method} -\alias{split,Spectra,ANY-method} \alias{export,Spectra-method} \alias{acquisitionNum,Spectra-method} \alias{peaksData,Spectra-method} @@ -109,59 +94,10 @@ \alias{combinePeaks,Spectra-method} \alias{entropy,Spectra-method} \alias{entropy,ANY-method} +\alias{dataStorageBasePath,Spectra-method} +\alias{dataStorageBasePath<-,Spectra-method} \title{The Spectra class to manage and access MS data} \usage{ -applyProcessing( - object, - f = processingChunkFactor(object), - BPPARAM = bpparam(), - ... -) - -concatenateSpectra(x, ...) 
- -combineSpectra( - x, - f = x$dataStorage, - p = x$dataStorage, - FUN = combinePeaksData, - ..., - BPPARAM = bpparam() -) - -joinSpectraData(x, y, by.x = "spectrumId", by.y, suffix.y = ".y") - -processingLog(x) - -deisotopeSpectra( - x, - substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"), - tolerance = 0, - ppm = 20, - charge = 1 -) - -reduceSpectra(x, tolerance = 0, ppm = 20) - -filterPrecursorMaxIntensity(x, tolerance = 0, ppm = 20) - -filterPrecursorIsotopes( - x, - tolerance = 0, - ppm = 20, - substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL") -) - -scalePeaks(x, by = sum, msLevel. = uniqueMsLevels(x)) - -filterPrecursorPeaks( - object, - tolerance = 0, - ppm = 20, - mz = c("==", ">="), - msLevel. = uniqueMsLevels(object) -) - \S4method{Spectra}{missing}( object, processingQueue = list(), @@ -207,13 +143,9 @@ filterPrecursorPeaks( BPPARAM = bpparam() ) -\S4method{c}{Spectra}(x, ...) - -\S4method{split}{Spectra,ANY}(x, f, drop = FALSE, ...) - \S4method{export}{Spectra}(object, backend, ...) -\S4method{acquisitionNum}{Spectra}(object) +\S4method{dataStorageBasePath}{Spectra}(object) \S4method{peaksData}{Spectra}( object, @@ -621,11 +553,20 @@ filter the object. For \code{filterPrecursorMz()} and \code{filterMzRange()}: For \code{filterMzValues()} and \code{filterPrecursorMzValues()}: \code{numeric} with the m/z values to match peaks or precursor m/z against.} +\S4method{dataStorageBasePath}{Spectra}(object) <- value +} +\arguments{ +\item{object}{For \code{Spectra()}: an object to instantiate the \code{Spectra} +object and initialize the with data.. See section on creation of +\code{Spectra} objects for details. 
For all other methods a \code{Spectra} object.} + \item{processingQueue}{For \code{Spectra()}: optional \code{list} of \linkS4class{ProcessingStep} objects.} \item{metadata}{For \code{Spectra()}: optional \code{list} with metadata information.} +\item{...}{Additional arguments.} + \item{backend}{For \code{Spectra()}: \linkS4class{MsBackend} to be used as backend. See section on creation of \code{Spectra} objects for details. For \code{setBackend()}: instance of \linkS4class{MsBackend} that supports \code{setBackend()} (i.e. for @@ -635,238 +576,56 @@ passing the full spectra data to the initialize method. See section on creation of \code{Spectra} objects for details. For \code{export()}: \linkS4class{MsBackend} to be used to export the data.} -\item{source}{For \code{Spectra()}: instance of \linkS4class{MsBackend} that can be used -to import spectrum data from the provided files. See section \emph{Creation -of objects, conversion and changing the backend} for more details.} - -\item{drop}{For \code{[}, \code{split()}: not considered.} - -\item{columns}{For \code{spectraData()} accessor: optional \code{character} with -column names (spectra variables) that should be included in the -returned \code{DataFrame}. By default, all columns are returned. -For \code{peaksData()} accessor: optional \code{character} with requested columns -in the individual \code{matrix} of the returned \code{list}. Defaults to -\code{c("mz", "value")} but any values returned by \code{peaksVariables(object)} -with \code{object} being the \code{Spectra} object are supported.} - -\item{value}{replacement value for \verb{<-} methods. 
See individual -method description or expected data type.} - -\item{which}{for \code{containsMz()}: either \code{"any"} or \code{"all"} defining whether -any (the default) or all provided \code{mz} have to be present in the -spectrum.} - -\item{neutralLoss}{for \code{containsNeutralLoss()}: \code{numeric(1)} defining the -value which should be subtracted from the spectrum's precursor m/z.} - -\item{chunkSize}{For \code{spectrapply()}: size of the chunks into which \code{Spectra} -should be split. This parameter overrides parameters \code{f} and \code{BPPARAM}.} - -\item{use.names}{For \code{lengths()}: ignored.} - -\item{spectraVariables}{\itemize{ -\item For \code{selectSpectraVariables()}: \code{character} with the -names of the spectra variables to which the backend should be -subsetted. -\itemize{ -\item For \code{addProcessing()}: \code{character} with additional spectra variables -that should be passed along to the function defined with \code{FUN}. See -function description for details. -\item For \code{filterRanges()} and \code{filterValues()}: \code{character} vector -specifying the column(s) from \code{spectraData(object)} on which to filter -the data and that correspond to the the names of the spectra variables -that should be used for the filtering. -} -}} - -\item{initial}{For \code{tic()}: \code{logical(1)} whether the initially -reported total ion current should be reported, or whether the -total ion current should be (re)calculated on the actual data -(\code{initial = FALSE}, same as \code{ionCount()}).} - -\item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return -or set.} - -\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the object.} - -\item{j}{For \code{[}: not supported.} - -\item{n}{for \code{filterAcquisitionNum()}: \code{integer} with the acquisition -numbers to filter for.} - -\item{dataStorage}{For \code{filterDataStorage()}: \code{character} to define which -spectra to keep. 
-For \code{filterAcquisitionNum()}: optionally specify if filtering should -occur only for spectra of selected \code{dataStorage}.} +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method +of the \linkS4class{MsBackend}.} -\item{dataOrigin}{For \code{filterDataOrigin()}: \code{character} to define which -spectra to keep. -For \code{filterAcquisitionNum()}: optionally specify if filtering should -occurr only for spectra of selected \code{dataOrigin}.} +\item{source}{For \code{Spectra()}: instance of \linkS4class{MsBackend} that can be +used to import spectrum data from the provided files. See section +\emph{Creation of objects} for more details.} -\item{halfWindowSize}{\itemize{ -\item For \code{pickPeaks()}: \code{integer(1)}, used in the -identification of the mass peaks: a local maximum has to be the maximum -in the window from \code{(i - halfWindowSize):(i + halfWindowSize)}. -\itemize{ -\item For \code{smooth()}: \code{integer(1)}, used in the smoothing algorithm, the -window reaches from \code{(i - halfWindowSize):(i + halfWindowSize)}. -\item For \code{filterFourierTransformArtefacts()}: \code{numeric(1)} defining the m/z -window left and right of a peak where to remove fourier transform -artefacts. -} -}} +\item{f}{For \code{setBackend()}: factor defining how to split the data +for parallelized copying of the spectra data to the new backend. For +some backends changing this parameter can lead to errors. Defaults to +\code{\link[=processingChunkFactor]{processingChunkFactor()}}.} -\item{threshold}{\itemize{ -\item For \code{pickPeaks()}: a \code{double(1)} defining the proportion of the maximal -peak intensity. Just values above are used for the weighted mean -calculation. 
-\itemize{ -\item For \code{replaceIntensitiesBelow()}: a \code{numeric(1)} defining the threshold -or a \code{function} to calculate the threshold for each spectrum on its -intensity values. Defaults to \code{threshold = min}. -\item For \code{filterFourierTransformArtefacts()}: the relative intensity (to a -peak) below which peaks are considered fourier artefacts. Defaults to -\code{threshold = 0.2} hence removing peaks that have an intensity below 0.2 -times the intensity of the tested peak (within the selected -\code{halfWindowSize}). +\item{value}{For \code{dataStorageBasePath()}: A \code{character} vector that defines +the base directory where the data storage files can be found.} } -}} - -\item{keepIsotopes}{For \code{filterFourierTransformArtefacts()}: whether isotope -peaks should not be removed as fourier artefacts.} - -\item{maxCharge}{For \code{filterFourierTransformArtefacts()}: the maximum charge -to be considered for isotopes.} - -\item{isotopeTolerance}{For \code{filterFourierTransformArtefacts()}: the m/z -\code{tolerance} to be used to define whether peaks might be isotopes of -the current tested peak.} - -\item{intensity}{For \code{filterIntensity()}: \code{numeric} of length 1 or 2 -defining either the lower or the lower and upper intensity limit for the -filtering, or a \code{function} that takes the intensities as input and -returns a \code{logical} (same length then peaks in the spectrum) whether the -peak should be retained or not. 
Defaults to \code{intensity = c(0, Inf)} thus -only peaks with \code{NA} intensity are removed.} - -\item{keep}{For \code{filterMzValues()} and \code{filterMzRange()}: \code{logical(1)} -whether the matching peaks should be retained (\code{keep = TRUE}, the -default) or dropped (\code{keep = FALSE}).} - -\item{polarity}{for \code{filterPolarity()}: \code{integer} specifying the polarity to -to subset \code{object}.} - -\item{z}{For \code{filterPrecursorCharge()}: \code{integer()} with the precursor -charges to be used as filter.} - -\item{acquisitionNum}{for \code{filterPrecursorScan()}: \code{integer} with the -acquisition number of the spectra to which the object should be -subsetted.} - -\item{rt}{for \code{filterRt()}: \code{numeric(2)} defining the retention time range to -be used to subset/filter \code{object}.} - -\item{ranges}{for \code{filterRanges()}: A \code{numeric} vector of paired values -(upper and lower boundary) that define the ranges to filter the \code{object}. -These paired values need to be in the same order as the -\code{spectraVariables} parameter (see below).} - -\item{match}{For \code{filterRanges()} and \code{filterValues()}: \code{character(1) } -defining whether the condition has to match for all provided -\code{ranges}/\code{values} (\code{match = "all"}; the default), or for any of them -(\code{match = "any"}) for spectra to be retained.} - -\item{values}{for \code{filterValues()}: A \code{numeric} vector that define the -values to filter the Spectra data. These values need to be in the same -order as the \code{spectraVariables} parameter.} - -\item{binSize}{For \code{bin()}: \code{numeric(1)} defining the size for the m/z bins. -Defaults to \code{binSize = 1}.} - -\item{breaks}{For \code{bin()}: \code{numeric} defining the m/z breakpoints between -bins.} - -\item{zero.rm}{\code{logical}. For \code{bin()}: indicating whether to remove bins -with zero intensity. 
Defaults to \code{TRUE}, meaning the function will
-discard bins created with an intensity of 0 to enhance memory efficiency.}
-
-\item{MAPFUN}{For \code{compareSpectra()}: function to map/match peaks between the
-two compared spectra. See \code{\link[=joinPeaks]{joinPeaks()}} for more information and possible
-functions.}
-
-\item{SIMPLIFY}{For \code{compareSpectra()} whether the result matrix should be
-\emph{simplified} to a \code{numeric} if possible (i.e. if either \code{x} or \code{y} is
-of length 1).}
-
-\item{method}{\itemize{
-\item For \code{pickPeaks()}: \code{character(1)}, the noise estimators that
-should be used, currently the the \emph{M}edian \emph{A}bsolute \emph{D}eviation
-(\code{method = "MAD"}) and Friedman's Super Smoother
-(\code{method = "SuperSmoother"}) are supported.
+\description{
+The \code{Spectra} class encapsulates spectral mass spectrometry (MS) data and
+related metadata. The MS data is represented by a \emph{backend} extending the
+virtual \link{MsBackend} class which provides the data to the \code{Spectra} object.
+The \code{Spectra} class implements only data accessor, filtering and analysis
+methods for the MS data and relies on its \emph{backend} to provide the MS data.
+This allows to change data representations of a \code{Spectra} object depending
+on the user's needs and properties of the data. Different backends and
+their properties are explained in the \link{MsBackend} documentation.
+
+Documentation on other topics and functionality of \code{Spectra} can be found in:
 \itemize{
-\item For \code{smooth()}: \code{character(1)}, the smoothing function that should be
-used, currently, the Moving-Average- (\code{method = "MovingAverage"}),
-Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")},
-Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported.
-}
-}}
-
-\item{snr}{For \code{pickPeaks()}: \code{double(1)} defining the
-\emph{S}ignal-to-\emph{N}oise-\emph{R}atio.
The intensity of a local maximum has to be
-higher than \code{snr * noise} to be considered as peak.}
-
-\item{k}{For \code{pickPeaks()}: \code{integer(1)}, number of values left and right of
-the peak that should be considered in the weighted mean calculation.}
-
-\item{descending}{For \code{pickPeaks()}: \code{logical}, if \code{TRUE} just values between
-the nearest valleys around the peak centroids are used.}
-
-\item{intensityFun}{For \code{combinePeaks()}: function to be used to aggregate
-intensities for all peaks in each peak group into a single intensity
-value.}
-
-\item{mzFun}{For \code{combinePeaks()}: function to aggregate m/z values for all
-peaks within each peak group into a single m/z value. This parameter
-is ignored if \code{weighted = TRUE} (the default).}
-
-\item{weighted}{For \code{combinePeaks()}: \code{logical(1)} whether m/z values of
-peaks within each peak group should be aggregated into a single m/z
-value using an intensity-weighted mean. Defaults to \code{weighted = TRUE}.}
-
-\item{normalized}{for \code{entropy()}: \code{logical(1)} whether the normalized
-entropy should be calculated (default). See also \code{\link[=nentropy]{nentropy()}} for
-details.}
-}
-\value{
-See individual method description for the return value.
+\item \code{\link[=spectraData]{spectraData()}} for accessing and using MS data through \code{Spectra} objects.
+\item \code{\link[=filterMsLevel]{filterMsLevel()}} to subset and filter \code{Spectra} objects.
+\item \code{\link[=plotSpectra]{plotSpectra()}} for visualization of \code{Spectra} objects.
+\item \code{\link[=processingChunkSize]{processingChunkSize()}} for information on parallel and chunk-wise data
+processing.
+\item \code{\link[=combineSpectra]{combineSpectra()}} for merging, aggregating and splitting of \code{Spectra}
+objects.
+\item \code{\link[=combinePeaks]{combinePeaks()}} for merging and aggregating \code{Spectra}'s mass peaks data.
+\item \code{\link[=addProcessing]{addProcessing()}} for data analysis functions. +\item \code{\link[=compareSpectra]{compareSpectra()}} for spectra similarity calculations. } -\description{ -The \code{Spectra} class encapsules spectral mass spectrometry data and -related metadata. - -It supports multiple data backends, e.g. in-memory (\link{MsBackendMemory}, -\code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}}), on-disk as mzML (\code{\link[=MsBackendMzR]{MsBackendMzR()}}) or HDF5 -(\code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}). } \details{ The \code{Spectra} class uses by default a lazy data manipulation strategy, i.e. data manipulations such as performed with \code{replaceIntensitiesBelow()} are not applied immediately to the data, but applied on-the-fly to the -spectrum data once it is retrieved. For some backends that allow to write -data back to the data storage (such as the \code{\link[=MsBackendMemory]{MsBackendMemory()}}, -\code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} and \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) it is possible to apply -to queue with the \code{applyProcessing} function. See the *Data manipulation and -analysis \emph{methods} section below for more details. - -For more information on parallel or chunk-wise processing (especially -helpful for very large data sets) see \code{\link[=processingChunkSize]{processingChunkSize()}}. - -To apply arbitrary functions to a \code{Spectra} use the \code{spectrapply()} function -(or directly \code{\link[=chunkapply]{chunkapply()}} for chunk-wise processing). See description of -the \code{spectrapply()} function below for details. - -For details on plotting spectra, see \code{\link[=plotSpectra]{plotSpectra()}}. +spectrum data once it is retrieved. This enables data manipulation +operations also for \emph{read only} data representations. 
For some backends that +allow to write data back to the data storage (such as the +\code{\link[=MsBackendMemory]{MsBackendMemory()}}, \code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} and \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) it +is possible to apply to queue with the \code{\link[=applyProcessing]{applyProcessing()}} function (see +the \code{\link[=applyProcessing]{applyProcessing()}} function for details). Clarifications regarding scan/acquisition numbers and indices: \itemize{ @@ -883,15 +642,33 @@ the \code{acquisitionNum}) See also \href{https://github.com/lgatto/MSnbase/issues/525}{this issue}. } -\section{Creation of objects, conversion, changing the backend and export}{ +\section{Data stored in a \code{Spectra} object}{ + + +The \code{Spectra} object is a container for MS data that includes mass peak +data (\emph{m/z} and related intensity values, also referred to as \emph{peaks data} +in the context of \code{Spectra}) and metadata of individual spectra (so called +\emph{spectra variables}). While a core set of spectra variables (the +\code{coreSpectraVariables()}) are guaranteed to be provided by a +\code{Spectra}, it is possible to add arbitrary additional spectra variables to +a \code{Spectra} object. + +The \code{Spectra} object is designed to contain MS data of a (large) set of mass +spectra. The data is organized \emph{linearly} and can be thought of a list of +mass spectra, i.e. each element in the \code{Spectra} is one spectrum. +} + +\section{Creation of objects}{ \code{Spectra} classes can be created with the \code{Spectra()} constructor function which supports the following formats: \itemize{ \item parameter \code{object} is a \code{data.frame} or \code{DataFrame} containing the -spectrum data. The provided \code{backend} (by default a -\linkS4class{MsBackendMemory}) will be initialized with that data. 
+full spectrum data (spectra variables in columns as well as columns
+with the individual MS peak data, \emph{m/z} and intensity). The provided
+\code{backend} (by default a \linkS4class{MsBackendMemory}) will be initialized
+with that data.
 \item parameter \code{object} is a \linkS4class{MsBackend} (assumed to be already
 initialized).
 \item parameter \code{object} is missing, in which case it is supposed that the data
@@ -906,41 +683,79 @@ which allows to import spectra data from mzML, mzXML or CDF files.
 With \code{...} additional arguments can be passed to the backend's
 \code{\link[=backendInitialize]{backendInitialize()}} method. Parameter \code{backend} allows to specify which
-\linkS4class{MsBackend} should be used for data storage.
+\linkS4class{MsBackend} should be used for data representation and storage.
+}
+
+\section{Data representation of a \code{Spectra}}{
+
+
+The MS data which can be accessed through the \code{Spectra} object is
+\emph{represented} by its backend, which means that this backend defines how
+and where the data is stored (e.g. in memory or on disk). The \code{Spectra}
+object relies on the backend to provide the MS data whenever it needs it
+for data processing.
+Different backends with different properties, such as minimal memory
+requirement or fast data access, are defined in the \emph{Spectra} package or
+one of the MsBackend* packages. More information on backends and their
+properties is provided in the documentation of \link{MsBackend}.
+
+On-disk backends keep only a limited amount of data in memory retrieving
+most of the data (usually the MS peak data) upon request on-the-fly from
+their on-disk data representations. Moving the on-disk data storage of such
+a backend or a serialized object to a different location in the file
+system will cause data corruption.
The \code{dataStorageBasePath()} and
+\verb{dataStorageBasePath<-} functions allow in such cases (and if the backend
+classes support this operation), to get or change the \emph{base}
+path to the directory of the backend's data storage. In-memory backends
+such as \link{MsBackendMemory} or \link{MsBackendDataFrame} keeping all MS data in
+memory don't support, nor need, this function, but for \link{MsBackendMzR} this
+function can be used to update/adapt the path to the directory containing
+the original data files. Thus, for \code{Spectra} objects (using this backend)
+that were moved to another file system or computer, these functions allow to
+adjust/adapt the base file path.
+}
+
+\section{Changing data representation of a \code{Spectra}}{
-The backend of a \code{Spectra} object can be changed with the \code{setBackend()}
-method that takes an instance of the new backend as second parameter
-\code{backend}. A call to \code{setBackend(sps, backend = MsBackendDataFrame())}
+
+The data representation, i.e. the backend of a \code{Spectra} object can be
+changed with the \code{setBackend()} method that takes an instance of the new
+backend as second parameter \code{backend}. A call to
+\code{setBackend(sps, backend = MsBackendDataFrame())}
 would for example change the backend of \code{sps} to the \emph{in-memory}
 \code{MsBackendDataFrame}. Changing to a backend is only supported if that
 backend has a \code{data} parameter in its \code{backendInitialize()} method and if
 \code{supportsSetBackend()} returns \code{TRUE} for that backend. \code{setBackend()} will
-transfer the full spectra data from the originating backend as a
-\code{DataFrame} to the new backend.
-Most \emph{read-only} backends do not support \code{setBackend()}. It is for example
-not possible to change the backend to a \emph{read-only} backend (such as
-the \code{\link[=MsBackendMzR]{MsBackendMzR()}} backend).
+transfer the full spectra data from the originating backend as a \code{DataFrame} +to the new backend. + +Generally, it is not possible to change \strong{to} a read-only backend such as +the \code{\link[=MsBackendMzR]{MsBackendMzR()}} backend. The definition of the function is: \code{setBackend(object, backend, ..., f = dataStorage(object), BPPARAM = bpparam())} and its parameters are: \itemize{ -\item parameter \code{object}: the \code{Spectra} object. -\item parameter \code{backend}: an instance of the new backend, e.g. -\verb{[MsBackendMemory()]}. -\item parameter \code{f}: factor allowing to parallelize the change of the backends. -By default the process of copying the spectra data from the original to the +\item \code{object}: the \code{Spectra} object. +\item \code{backend}: an instance of the new backend, e.g. \verb{[MsBackendMemory()]}. +\item \code{f}: factor allowing to parallelize the change of the backends. By +default the process of copying the spectra data from the original to the new backend is performed separately (and in parallel) for each file. Users are advised to use the default setting. -\item parameter \code{...}: optional additional arguments passed to the -\code{\link[=backendInitialize]{backendInitialize()}} method of the new \code{backend}. -\item parameter \code{BPPARAM}: setup for the parallel processing. See \code{\link[=bpparam]{bpparam()}} for +\item \code{...}: optional additional arguments passed to the \code{\link[=backendInitialize]{backendInitialize()}} +method of the new \code{backend}. +\item \code{BPPARAM}: setup for the parallel processing. See \code{\link[=bpparam]{bpparam()}} for details. } +} + +\section{Exporting data from a \code{Spectra} object}{ + Data from a \code{Spectra} object can be \strong{exported} to a file with the -\code{export()} function. The actual export of the data has to be performed by +\code{export()} function. 
The actual export of the data is performed by the \code{export} method of the \link{MsBackend} class defined with the mandatory -parameter \code{backend}. Note however that not all backend classes support +parameter \code{backend} which defines also the format in which the data +is exported. Note however that not all backend classes support export of data. From the \code{MsBackend} classes in the \code{Spectra} package currently only the \code{MsBackendMzR} backend supports data export (to mzML/mzXML file(s)); see the help page of the \linkS4class{MsBackend} for @@ -1539,6 +1354,8 @@ level(s). \examples{ +## -------- CREATION OF SPECTRA OBJECTS -------- + ## Create a Spectra providing a `DataFrame` containing the spectrum data. spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) @@ -1548,12 +1365,6 @@ spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) data <- Spectra(spd) data -## Get the number of spectra -length(data) - -## Get the number of peaks per spectrum -lengths(data) - ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk ## backend. sciex_file <- dir(system.file("sciex", package = "msdata"), @@ -1561,6 +1372,9 @@ sciex_file <- dir(system.file("sciex", package = "msdata"), sciex <- Spectra(sciex_file, backend = MsBackendMzR()) sciex + +## -------- CHANGING DATA REPRESENTATIONS -------- + ## The MS data is on disk and will be read into memory on-demand. We can ## however change the backend to a MsBackendMemory backend which will ## keep all of the data in memory. @@ -1904,7 +1718,7 @@ head(res) ## parameter spectrapply(sciex_im[1:20], lengths, chunkSize = 5L) -## ---- DATA EXPORT ---- +## -------- DATA EXPORT -------- ## Some `MsBackend` classes provide an `export()` method to export the data ## to the file format supported by the backend. 
@@ -1933,42 +1747,7 @@ res <- Spectra(backendInitialize(MsBackendMzR(), fls[1])) mz(res) mz(data) - -## ---- PEAKS VARIABLES AND DATA ---- - -## Some `MsBackend` classes provide support for arbitrary peaks variables -## (in addition to the mandatory `"mz"` and `"intensity"` values. Below -## we create a simple data frame with an additional peak variable `"pk_ann"` -## and create a `Spectra` with a `MsBackendMemory` for that data. -## Importantly the number of values (per spectrum) need to be the same -## for all peak variables. - -tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5)) -tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1)) -tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45)) -tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P")) - -## Create the Spectra. With parameter `peaksVariables` we can define -## the columns in `tmp` that contain peaks variables. -sps <- Spectra(tmp, source = MsBackendMemory(), - peaksVariables = c("mz", "intensity", "pk_ann")) -peaksVariables(sps) - -## Extract just the m/z and intensity values -peaksData(sps)[[1L]] - -## Extract the full peaks data -peaksData(sps, columns = peaksVariables(sps))[[1L]] - -## Access just the pk_ann variable -sps$pk_ann } \author{ -Nir Shahaf, Johannes Rainer - -Nir Shahaf - -Johannes Rainer - -Sebastian Gibb, Johannes Rainer, Laurent Gatto +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail } diff --git a/man/addProcessing.Rd b/man/addProcessing.Rd new file mode 100644 index 00000000..787aeabe --- /dev/null +++ b/man/addProcessing.Rd @@ -0,0 +1,547 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{applyProcessing} +\alias{applyProcessing} +\alias{processingLog} +\alias{scalePeaks} +\alias{addProcessing} +\alias{bin} +\alias{containsMz} +\alias{containsNeutralLoss} +\alias{entropy} +\alias{pickPeaks} +\alias{replaceIntensitiesBelow} +\alias{reset} +\alias{smooth} +\alias{spectrapply} 
+\alias{addProcessing,Spectra-method} +\alias{bin,Spectra-method} +\alias{containsMz,Spectra-method} +\alias{containsNeutralLoss,Spectra-method} +\alias{entropy,Spectra-method} +\alias{entropy,ANY-method} +\alias{pickPeaks,Spectra-method} +\alias{replaceIntensitiesBelow,Spectra-method} +\alias{reset,Spectra-method} +\alias{smooth,Spectra-method} +\alias{spectrapply,Spectra-method} +\title{Data manipulation and analysis methods} +\usage{ +applyProcessing( + object, + f = processingChunkFactor(object), + BPPARAM = bpparam(), + ... +) + +processingLog(x) + +scalePeaks(x, by = sum, msLevel. = uniqueMsLevels(x)) + +\S4method{addProcessing}{Spectra}(object, FUN, ..., spectraVariables = character()) + +\S4method{bin}{Spectra}( + x, + binSize = 1L, + breaks = NULL, + msLevel. = uniqueMsLevels(x), + FUN = sum, + zero.rm = TRUE +) + +\S4method{containsMz}{Spectra}( + object, + mz = numeric(), + tolerance = 0, + ppm = 20, + which = c("any", "all"), + BPPARAM = bpparam() +) + +\S4method{containsNeutralLoss}{Spectra}( + object, + neutralLoss = 0, + tolerance = 0, + ppm = 20, + BPPARAM = bpparam() +) + +\S4method{entropy}{Spectra}(object, normalized = TRUE) + +\S4method{entropy}{ANY}(object, ...) + +\S4method{pickPeaks}{Spectra}( + object, + halfWindowSize = 2L, + method = c("MAD", "SuperSmoother"), + snr = 0, + k = 0L, + descending = FALSE, + threshold = 0, + msLevel. = uniqueMsLevels(object), + ... +) + +\S4method{replaceIntensitiesBelow}{Spectra}( + object, + threshold = min, + value = 0, + msLevel. = uniqueMsLevels(object) +) + +\S4method{reset}{Spectra}(object, ...) + +\S4method{smooth}{Spectra}( + x, + halfWindowSize = 2L, + method = c("MovingAverage", "WeightedMovingAverage", "SavitzkyGolay"), + msLevel. = uniqueMsLevels(x), + ... 
+) + +\S4method{spectrapply}{Spectra}( + object, + FUN, + ..., + chunkSize = integer(), + f = factor(), + BPPARAM = SerialParam() +) +} +\arguments{ +\item{object}{A \code{Spectra} object.} + +\item{f}{For \code{spectrapply()} and \code{applyProcessing()}: \code{factor} defining +how \code{object} should be splitted for eventual parallel processing. +Defaults to \code{factor()} for \code{spectrapply()} hence the object is not +splitted while it defaults to \code{f = processingChunkSize(object)} for +\code{applyProcessing()} splitting thus the object by default into chunks +depending on \code{\link[=processingChunkSize]{processingChunkSize()}}.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method +of the \linkS4class{MsBackend}. See also \code{\link[=processingChunkSize]{processingChunkSize()}} for +additional information on parallel processing.} + +\item{...}{Additional arguments passed to internal and downstream functions.} + +\item{x}{A \code{Spectra}.} + +\item{by}{For \code{scalePeaks()}: function to calculate a single \code{numeric} from +intensity values of a spectrum by which all intensities (of +that spectrum) should be divided by. The default \code{by = sum} will +divide intensities of each spectrum by the sum of intensities of that +spectrum.} + +\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which +the function should be applied (defaults to all MS levels of \code{object}.} + +\item{FUN}{For \code{addProcessing()}: function to be applied to the peak matrix +of each spectrum in \code{object}. +For \code{bin()}: function to aggregate intensity values of peaks falling +into the same bin. Defaults to \code{FUN = sum} thus summing up intensities. 
+For \code{spectrapply()} and \code{chunkapply()}: function to be applied to +each individual or each chunk of \code{Spectra}.} + +\item{spectraVariables}{For \code{addProcessing()}: \code{character} with additional +spectra variables that should be passed along to the function defined +with \code{FUN}. See function description for details.} + +\item{binSize}{For \code{bin()}: \code{numeric(1)} defining the size for the m/z bins. +Defaults to \code{binSize = 1}.} + +\item{breaks}{For \code{bin()}: \code{numeric} defining the m/z breakpoints between +bins.} + +\item{zero.rm}{For \code{bin()}: \code{logical(1)} indicating whether to remove bins +with zero intensity. Defaults to \code{TRUE}, meaning the function will +discard bins created with an intensity of 0 to enhance memory +efficiency.} + +\item{mz}{For \code{containsMz()}: \code{numeric} with the m/z value(s) of the mass +peaks to check.} + +\item{tolerance}{For \code{containsMz()} and \code{neutralLoss()}: +\code{numeric(1)} allowing to define a constant maximal accepted difference +between m/z values for peaks to be matched.} + +\item{ppm}{For \code{containsMz()} and \code{neutralLoss()}: \code{numeric(1)} defining a +relative, m/z-dependent, maximal accepted difference between m/z values +for peaks to be matched.} + +\item{which}{For \code{containsMz()}: either \code{"any"} or \code{"all"} defining whether +any (the default) or all provided \code{mz} have to be present in the +spectrum.} + +\item{neutralLoss}{for \code{containsNeutralLoss()}: \code{numeric(1)} defining the +value which should be subtracted from the spectrum's precursor m/z.} + +\item{normalized}{for \code{entropy()}: \code{logical(1)} whether the normalized +entropy should be calculated (default). 
See also \code{\link[=nentropy]{nentropy()}} for
+details.}
+
+\item{halfWindowSize}{For \code{pickPeaks()}: \code{integer(1)}, used in the
+identification of the mass peaks: a local maximum has to be the
+maximum in the window from \code{(i - halfWindowSize):(i + halfWindowSize)}.
+For \code{smooth()}: \code{integer(1)}, used in the smoothing algorithm, the
+window reaches from \code{(i - halfWindowSize):(i + halfWindowSize)}.}
+
+\item{method}{For \code{pickPeaks()}: \code{character(1)}, the noise estimators that
+should be used, currently the \emph{M}edian \emph{A}bsolute \emph{D}eviation
+(\code{method = "MAD"}) and Friedman's Super Smoother
+(\code{method = "SuperSmoother"}) are supported.
+For \code{smooth()}: \code{character(1)}, the smoothing function that should be
+used, currently, the Moving-Average- (\code{method = "MovingAverage"}),
+Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")},
+Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported.}
+
+\item{snr}{For \code{pickPeaks()}: \code{double(1)} defining the
+\emph{S}ignal-to-\emph{N}oise-\emph{R}atio. The intensity of a local maximum has to be
+higher than \code{snr * noise} to be considered as peak.}
+
+\item{k}{For \code{pickPeaks()}: \code{integer(1)}, number of values left and right of
+the peak that should be considered in the weighted mean calculation.}
+
+\item{descending}{For \code{pickPeaks()}: \code{logical}, if \code{TRUE} just values
+between the nearest valleys around the peak centroids are used.}
+
+\item{threshold}{For \code{pickPeaks()}: a \code{numeric(1)} defining the proportion
+of the maximal peak intensity. Only values above the threshold are
+used for the weighted mean calculation.
+For \code{replaceIntensitiesBelow()}: a \code{numeric(1)} defining the threshold
+or a \code{function} to calculate the threshold for each spectrum on its
+intensity values.
Defaults to \code{threshold = min}.}
+
+\item{value}{For \code{replaceIntensitiesBelow()}: \code{numeric(1)} defining the
+value with which intensities should be replaced.}
+
+\item{chunkSize}{For \code{spectrapply()}: size of the chunks into which the
+\code{Spectra} should be split. This parameter overrides parameters
+\code{f} and \code{BPPARAM}.}
+}
+\value{
+See the documentation of the individual functions for a description of the
+return value.
+}
+\description{
+Various data analysis functions are available for \code{Spectra} objects. These
+can be categorized into functions that either return a \code{Spectra} object
+(with the manipulated data) or functions that directly return the
+result from the calculation. For the former category, the data manipulations
+are cached in the result object's \emph{processing queue} and only executed
+on-the-fly when the respective data gets extracted from the \code{Spectra} (see
+section \emph{The processing queue} for more information).
+
+For the second category, the calculations are directly executed and the
+result, usually one value per spectrum, returned. Generally, to reduce
+memory demand, a chunk-wise processing of the data is performed.
+}
+\section{Data analysis methods returning a \code{Spectra}}{
+
+
+The methods listed here return a \code{Spectra} object as a result.
+\itemize{
+\item \code{addProcessing()}: adds an arbitrary function that should be applied to the
+peaks matrix of every spectrum in \code{object}. The function (can be passed
+with parameter \code{FUN}) is expected to take a peaks matrix as input and to
+return a peaks matrix. A peaks matrix is a numeric matrix with two columns,
+the first containing the m/z values of the peaks and the second the
+corresponding intensities. The function has to have \code{...} in its
+definition. Additional arguments can be passed with \code{...}.
With parameter
+\code{spectraVariables} it is possible to define additional spectra variables
+from \code{object} that should be passed to the function \code{FUN}. These will be
+passed by their name (e.g. specifying \code{spectraVariables = "precursorMz"}
+will pass the spectra's precursor m/z as a parameter named \code{precursorMz}
+to the function). The only exception is the spectra's MS level, these will
+be passed to the function as a parameter called \code{spectrumMsLevel} (i.e.
+with \code{spectraVariables = "msLevel"} the MS levels of each spectrum will be
+submitted to the function as a parameter called \code{spectrumMsLevel}).
+Examples are provided in the package vignette.
+\item \code{bin()}: aggregates individual spectra into discrete (m/z) bins. Binning is
+performed only on spectra of the specified MS level(s) (parameter
+\code{msLevel}, by default all MS levels of \code{x}). The bins can be defined with
+parameter \code{breaks} which by default are equally sized bins, with size
+being defined by parameter \code{binSize}, from the minimal to the maximal m/z
+of all spectra (of MS level \code{msLevel}) within \code{x}. The same bins are used
+for all spectra in \code{x}. All intensity values for peaks falling into the
+same bin are aggregated using the function provided with parameter \code{FUN}
+(defaults to \code{FUN = sum}, i.e. all intensities are summed up). Note that
+the binning operation is applied to the peak data on-the-fly upon data
+access and it is possible to \emph{revert} the operation with the \code{reset()}
+function (see description of \code{reset()} below).
+\item \code{countIdentifications}: counts the number of identifications each scan has
+led to. See \code{\link[=countIdentifications]{countIdentifications()}} for more details.
+\item \code{pickPeaks()}: picks peaks on individual spectra using a moving
+window-based approach (window size = \code{2 * halfWindowSize}).
For noisy
+spectra there are currently two different noise estimators available,
+the \emph{M}edian \emph{A}bsolute \emph{D}eviation (\code{method = "MAD"}) and
+Friedman's Super Smoother (\code{method = "SuperSmoother"}),
+as implemented in \code{\link[MsCoreUtils:noise]{MsCoreUtils::noise()}}.
+The method supports also to optionally \emph{refine} the m/z value of
+the identified centroids by considering data points that belong (most
+likely) to the same mass peak. Therefore the m/z value is calculated as an
+intensity weighted average of the m/z values within the peak region.
+The peak region is defined as the m/z values (and their respective
+intensities) of the \code{2 * k} closest signals to the centroid or the closest
+valleys (\code{descending = TRUE}) in the \code{2 * k} region. For the latter the \code{k}
+has to be chosen generally larger. See \code{\link[MsCoreUtils:refineCentroids]{MsCoreUtils::refineCentroids()}} for
+details.
+If the ratio of the signal to the highest intensity of the peak is below
+\code{threshold} it will be ignored for the weighted average.
+\item \code{replaceIntensitiesBelow()}: replaces intensities below a specified
+threshold with the provided \code{value}. Parameter \code{threshold} can be either
+a single numeric value or a function which is applied to all non-\code{NA}
+intensities of each spectrum to determine a threshold value for each
+spectrum. The default is \code{threshold = min} which replaces all values
+which are <= the minimum intensity in a spectrum with \code{value} (the
+default for \code{value} is \code{0}). Note that the function specified with
+\code{threshold} is expected to have a parameter \code{na.rm} since \code{na.rm = TRUE}
+will be passed to the function. If the spectrum is in profile mode,
+ranges of successive non-0 peaks <= \code{threshold} are set to 0.
+Parameter \code{msLevel.} allows to apply this to only spectra of certain MS
+level(s). 
+\item \code{scalePeaks()}: scales intensities of peaks within each spectrum depending +on parameter \code{by}. With \code{by = sum} (the default) peak intensities are +divided by the sum of peak intensities within each spectrum. The sum of +intensities is thus 1 for each spectrum after scaling. Parameter +\code{msLevel.} allows to apply the scaling of spectra of a certain MS level. +By default (\code{msLevel. = uniqueMsLevels(x)}) intensities for all +spectra will be scaled. +\item \code{smooth()}: smooths individual spectra using a moving window-based approach +(window size = \code{2 * halfWindowSize}). Currently, the +Moving-Average- (\code{method = "MovingAverage"}), +Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")}, +weights depending on the distance of the center and calculated +\code{1/2^(-halfWindowSize:halfWindowSize)}) and +Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported. +For details how to choose the correct \code{halfWindowSize} please see +\code{\link[MsCoreUtils:smooth]{MsCoreUtils::smooth()}}. +} +} + +\section{Data analysis methods returning the result from the calculation}{ + + +The functions listed in this section return immediately the result from the +calculation. To reduce memory demand (and allow parallel processing) the +calculations a chunk-wise processing is generally performed. +\itemize{ +\item \code{chunkapply()}: apply an arbitrary function to chunks of spectra. See +\code{\link[=chunkapply]{chunkapply()}} for details and examples. +\item \code{containsMz()}: checks for each of the spectra whether they contain mass +peaks with an m/z equal to \code{mz} (given acceptable difference as defined by +parameters \code{tolerance} and \code{ppm} - see \code{\link[=common]{common()}} for details). Parameter +\code{which} allows to define whether any (\code{which = "any"}, the default) or +all (\code{which = "all"}) of the \code{mz} have to match. 
The function returns +\code{NA} if \code{mz} is of length 0 or is \code{NA}. +\item \code{containsNeutralLoss()}: checks for each spectrum in \code{object} if it has a +peak with an m/z value equal to its precursor m/z - \code{neutralLoss} (given +acceptable difference as defined by parameters \code{tolerance} and \code{ppm}). +Returns \code{NA} for MS1 spectra (or spectra without a precursor m/z). +\item \code{entropy()}: calculates the entropy of each spectra based on the metrics +suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z). +See also \code{\link[=nentropy]{nentropy()}} in the \emph{MsCoreUtils} package for details. +\item \code{estimatePrecursorIntensity()}: defines the precursor intensities for MS2 +spectra using the intensity of the matching MS1 peak from the +closest MS1 spectrum (i.e. the last MS1 spectrum measured before the +respective MS2 spectrum). With \code{method = "interpolation"} it is also +possible to calculate the precursor intensity based on an interpolation of +intensity values (and retention times) of the matching MS1 peaks from the +previous and next MS1 spectrum. See \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}} for +examples and more details. +\item \code{estimatePrecursorMz()}: \strong{for DDA data}: allows to estimate a fragment +spectra's precursor m/z based on the reported precursor m/z and the data +from the previous MS1 spectrum. See \code{\link[=estimatePrecursorMz]{estimatePrecursorMz()}} for details. +\item \code{neutralLoss()}: calculates neutral loss spectra for fragment spectra. See +\code{\link[=neutralLoss]{neutralLoss()}} for detailed documentation. +\item \code{spectrapply()}: applies a given function to each individual spectrum or +sets of a \code{Spectra} object. By default, the \code{Spectra} is split into +individual spectra (i.e. \code{Spectra} of length 1) and the function \code{FUN} +is applied to each of them. 
An alternative splitting can be defined with
+parameter \code{f}. Parameters for \code{FUN} can be passed using \code{...}.
+The returned result and its order depend on the function \code{FUN} and how
+\code{object} is split (hence on \code{f}, if provided). Parallel processing is
+supported and can be configured with parameter \code{BPPARAM}, is however only
+suggested for computationally intensive \code{FUN}.
+As an alternative to the (eventual parallel) processing of the full
+\code{Spectra}, \code{spectrapply()} supports also a chunk-wise processing. For this,
+parameter \code{chunkSize} needs to be specified. \code{object} is then split into
+chunks of size \code{chunkSize} which are then (stepwise) processed by \code{FUN}.
+This guarantees a lower memory demand (especially for on-disk backends)
+since only the data for one chunk needs to be loaded into memory in each
+iteration. Note that by specifying \code{chunkSize}, parameters \code{f} and
+\code{BPPARAM} will be ignored.
+See also \code{chunkapply()} above or examples below for details on chunk-wise
+processing.
+}
+}
+
+\section{The processing queue}{
+
+
+Operations that modify mass peak data, i.e. the m/z and intensity values of
+a \code{Spectra}, are generally not applied immediately to the data but are
+\emph{cached} within the object's \emph{processing queue}. These operations are then
+applied to the data only upon request, for example when m/z and/or
+intensity values are extracted. This lazy execution guarantees that the
+same functionality can be applied to any \code{Spectra} object, regardless of
+the type of backend that is used. Thus, data manipulation operations can
+also be applied to data that is \emph{read only}. As a side effect, this enables
+also to \emph{undo} operations using the \code{reset()} function. 
+
+Functions related to the processing queue are:
+\itemize{
+\item \code{applyProcessing()}: for \code{Spectra} objects that use a \strong{writeable} backend
+only: apply all steps from the lazy processing queue to the peak data and
+write it back to the data storage. Parameter \code{f} allows to specify how
+\code{object} should be split for parallel processing. This should either be
+equal to the \code{dataStorage}, or \code{f = rep(1, length(object))} to disable
+parallel processing altogether. Other partitionings might result in
+errors (especially if a \code{MsBackendHdf5Peaks} backend is used).
+\item \code{processingLog()}: returns a \code{character} vector with the processing log
+messages.
+\item \code{reset()}: restores the data to its original state (as much as possible):
+removes any processing steps from the lazy processing queue and calls
+\code{reset()} on the backend which, depending on the backend, can also undo
+e.g. data filtering operations. Note that a \code{reset()} call after
+\code{applyProcessing()} will not have any effect. See examples below for more
+information.
+}
+}
+
+\examples{
+
+## Load a `Spectra` object with LC-MS/MS data.
+fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML",
+    package = "msdata")
+sps_dda <- Spectra(fl)
+sps_dda
+
+
+## -------- FUNCTIONS RETURNING A SPECTRA --------
+
+## Replace peak intensities below 20 with a value of 1
+sps_mod <- replaceIntensitiesBelow(sps_dda, threshold = 20, value = 1)
+sps_mod
+
+## Get the intensities of the first spectrum before and after the
+## operation
+intensity(sps_dda[1])
+intensity(sps_mod[1])
+
+## Remove all peaks with an intensity below 5.
+sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf))
+
+intensity(sps_mod)
+
+## In addition it is possible to pass a function to `filterIntensity()`: in
+## the example below we want to keep only peaks that have an intensity which
+## is larger than one third of the maximal peak intensity in that spectrum. 
+keep_peaks <- function(x, prop = 3) { + x > max(x, na.rm = TRUE) / prop +} +sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks) +intensity(sps_mod) + +## We can also change the proportion by simply passing the `prop` parameter +## to the function. To keep only peaks that have an intensity which is +## larger than half of the maximum intensity: +sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks, prop = 2) +intensity(sps_mod) + +## With the `scalePeaks()` function we can alternatively scale the +## intensities of mass peaks per spectrum to relative intensities. This +## is specifically useful for fragment (MS2) spectra. We below thus +## scale the intensities per spectrum by the total sum of intensities +## (such that the sum of all intensities per spectrum is 1). +## Below we scale the intensities of all MS2 spectra in our data set. +sps_mod <- scalePeaks(sps_dda, msLevel = 2L) + +## MS1 spectra were not affected +sps_mod |> + filterMsLevel(1L) |> + intensity() + +## Intensities of MS2 spectra were scaled +sps_mod |> + filterMsLevel(2L) |> + intensity() + +## Since data manipulation operations are by default not directly applied to +## the data but only cached in the internal processing queue, it is also +## possible to remove these data manipulations with the `reset()` function: +tmp <- reset(sps_mod) +tmp +lengths(sps_dda) |> head() +lengths(sps_mod) |> head() +lengths(tmp) |> head() + +## Data manipulation operations cached in the processing queue can also be +## applied to the mass peaks data with the `applyProcessing()` function, if +## the `Spectra` uses a backend that supports that (i.e. allows replacing +## the mass peaks data). 
Below we first change the backend to a +## `MsBackendMemory()` and then use the `applyProcessing()` to modify the +## mass peaks data +sps_dda <- setBackend(sps_dda, MsBackendMemory()) +sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf)) +sps_mod <- applyProcessing(sps_mod) +sps_mod + +## While we can't *undo* this filtering operation now using the `reset()` +## function, accessing the data would now be faster, because the operation +## does no longer to be applied to the original data before returning to the +## user. + + +## -------- FUNCTIONS RETURNING THE RESULT -------- + +## With the `spectrapply()` function it is possible to apply an +## arbitrary function to each spectrum in a Spectra. +## In the example below we calculate the mean intensity for each spectrum +## in a subset of the sciex_im data. Note that we can access all variables +## of each individual spectrum either with the `$` operator or the +## corresponding method. +res <- spectrapply(sps_dda[1:20], FUN = function(x) mean(x$intensity[[1]])) +head(res) + +## As an alternative, applying a function `FUN` to a `Spectra` can be +## performed *chunk-wise*. The advantage of this is, that only the data for +## one chunk at a time needs to be loaded into memory reducing the memory +## demand. This type of processing can be performed by specifying the size +## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` +## parameter +spectrapply(sps_dda[1:20], lengths, chunkSize = 5L) + +## Precursor intensity estimation. 
Some manufacturers don't report the +## precursor intensity for MS2 spectra: +sps_dda |> + filterMsLevel(2L) |> + precursorIntensity() + +## This intensity can however be estimated from the previously measured +## MS1 scan with the `estimatePrecursorIntensity()` function: +pi <- estimatePrecursorIntensity(sps_dda) + +## This function returned the result as a `numeric` vector with one +## value per spectrum: +pi + +## We can replace the precursor intensity values of the originating +## object: +sps_dda$precursorIntensity <- pi +sps_dda |> + filterMsLevel(2L) |> + precursorIntensity() + +} +\seealso{ +\itemize{ +\item \code{\link[=compareSpectra]{compareSpectra()}} for calculation of spectra similarity scores. +\item \code{\link[=processingChunkSize]{processingChunkSize()}} for information on parallel and chunk-wise data +processing. +\item \link{Spectra} for a general description of the \code{Spectra} object. +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf, Mar Garcia-Aloy +} diff --git a/man/combinePeaks.Rd b/man/combinePeaks.Rd new file mode 100644 index 00000000..a59b8f24 --- /dev/null +++ b/man/combinePeaks.Rd @@ -0,0 +1,110 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{combinePeaks} +\alias{combinePeaks} +\alias{combinePeaks,Spectra-method} +\title{Aggregating and combining mass peaks data} +\usage{ +\S4method{combinePeaks}{Spectra}( + object, + tolerance = 0, + ppm = 20, + intensityFun = base::mean, + mzFun = base::mean, + weighted = TRUE, + msLevel. = uniqueMsLevels(object), + ... +) +} +\arguments{ +\item{object}{A \code{Spectra} object.} + +\item{tolerance}{\code{numeric(1)} allowing to define a constant maximal +accepted difference between m/z values for peaks to be grouped. Default +is \code{tolerance = 0}.} + +\item{ppm}{\code{numeric(1)} defining a relative, m/z-dependent, maximal +accepted difference between m/z values for peaks to be grouped. 
Default
+is \code{ppm = 20}.}
+
+\item{intensityFun}{Function to aggregate intensities for all peaks in
+each peak group into a single intensity value.}
+
+\item{mzFun}{Function to aggregate m/z values for all mass peaks within
+each peak group into a single m/z value. This parameter is ignored if
+\code{weighted = TRUE} (the default).}
+
+\item{weighted}{\code{logical(1)} whether m/z values of peaks within each peak
+group should be aggregated into a single m/z value using an
+intensity-weighted mean. Defaults to \code{weighted = TRUE}.}
+
+\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which
+the function should be applied (defaults to all MS levels of \code{object}).}
+
+\item{...}{ignored.}
+}
+\description{
+In addition to aggregating content of spectra variables (described in
+\code{\link[=combineSpectra]{combineSpectra()}}) it is also possible to aggregate and combine mass peaks
+data from individual spectra within a \code{Spectra}. This \code{combinePeaks()}
+function combines mass peaks \strong{within each spectrum} with a difference in
+their m/z values that is smaller than the maximal acceptable difference
+defined by \code{ppm} and \code{tolerance}. Parameters \code{intensityFun} and \code{mzFun}
+allow to define functions to aggregate the intensity and m/z values for
+each such group of peaks. With \code{weighted = TRUE} (the default), the m/z
+value of the combined peak is calculated using an intensity-weighted mean
+and parameter \code{mzFun} is ignored. The \code{\link[MsCoreUtils:group]{MsCoreUtils::group()}} function is
+used for the grouping of mass peaks. Parameter \code{msLevel.} allows to define
+selected MS levels for which peaks should be combined. This function
+returns a \code{Spectra} with the same number of spectra as the input object,
+but with possibly combined peaks within each spectrum.
+Additional peak variables (other than \code{"mz"} and \code{"intensity"}) are
+dropped (i.e. 
their values are replaced with \code{NA}) for combined peaks +unless they are constant across the combined peaks. See also +\code{\link[=reduceSpectra]{reduceSpectra()}} for a function to select a single \emph{representative} +mass peak for each peak group. +} +\examples{ + +## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +## backend. +sciex_file <- dir(system.file("sciex", package = "msdata"), + full.names = TRUE) +sciex <- Spectra(sciex_file, backend = MsBackendMzR()) + +## Combine mass peaks per spectrum with a difference in their m/z value +## that is smaller than 20 ppm. The intensity values of such peaks are +## combined by summing their values, while for the m/z values the median +## is reported +sciex_comb <- combinePeaks(sciex, ppm = 20, + intensityFun = sum, mzFun = median) + +## Comparing the number of mass peaks before and after aggregation +lengths(sciex) |> head() +lengths(sciex_comb) |> head() + +## Plotting the first spectrum before and after aggregation +par(mfrow = c(1, 2)) +plotSpectra(sciex[2L]) +plotSpectra(sciex_comb[2L]) + +## Using `reduceSpectra()` to keep for each group of mass peaks with a +## difference in their m/z values < 20ppm the one with the highest intensity. +sciex_red <- reduceSpectra(sciex, ppm = 20) + +## Comparing the number of mass peaks before and after the operation +lengths(sciex) |> head() +lengths(sciex_red) |> head() +} +\seealso{ +\itemize{ +\item \code{\link[=combineSpectra]{combineSpectra()}} for functions to combine or aggregate \code{Spectra}'s +spectra data. +\item \code{\link[=combinePeaksData]{combinePeaksData()}} for the function to combine the mass peaks data. +\item \code{\link[=reduceSpectra]{reduceSpectra()}} and similar functions to filter mass peaks data. +\item \link{Spectra} for a general description of the \code{Spectra} object. 
+} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/combineSpectra.Rd b/man/combineSpectra.Rd new file mode 100644 index 00000000..d4f7bdb0 --- /dev/null +++ b/man/combineSpectra.Rd @@ -0,0 +1,240 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{concatenateSpectra} +\alias{concatenateSpectra} +\alias{combineSpectra} +\alias{joinSpectraData} +\alias{split} +\alias{c,Spectra-method} +\alias{split,Spectra,ANY-method} +\title{Merging, aggregating and splitting Spectra} +\usage{ +concatenateSpectra(x, ...) + +combineSpectra( + x, + f = x$dataStorage, + p = x$dataStorage, + FUN = combinePeaksData, + ..., + BPPARAM = bpparam() +) + +joinSpectraData(x, y, by.x = "spectrumId", by.y, suffix.y = ".y") + +\S4method{c}{Spectra}(x, ...) + +\S4method{split}{Spectra,ANY}(x, f, drop = FALSE, ...) +} +\arguments{ +\item{x}{A \code{Spectra} object.} + +\item{...}{Additional arguments.} + +\item{f}{For \code{split()}: factor defining how to split \code{x}. See \code{\link[base:split]{base::split()}} +for details. +For \code{combineSpectra()}: \code{factor} defining the grouping of the spectra +that should be combined. Defaults to \code{x$dataStorage}.} + +\item{p}{For \code{combineSpectra()}: \code{factor} defining how to split the input +\code{Spectra} for parallel processing. Defaults to \code{x$dataStorage}, i.e., +depending on the used backend, per-file parallel processing will be +performed.} + +\item{FUN}{For \code{combineSpectra()}: function to combine the (peak matrices) +of the spectra. Defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. 
This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method
+of the \linkS4class{MsBackend}.}
+
+\item{y}{A \code{DataFrame} with the spectra variables to join/add.}
+
+\item{by.x}{A \code{character(1)} specifying the spectra variable used
+for merging. Default is \code{"spectrumId"}.}
+
+\item{by.y}{A \code{character(1)} specifying the column used for
+merging. Set to \code{by.x} if missing.}
+
+\item{suffix.y}{A \code{character(1)} specifying the suffix to be used
+for making the names of columns in the merged spectra variables
+unique. This suffix will be used to amend \code{names(y)}, while
+\code{spectraVariables(x)} will remain unchanged.}
+
+\item{drop}{For \code{split()}: not considered.}
+}
+\description{
+Various functions are available to combine, aggregate or split data from one
+or more \code{Spectra} objects. These are:
+\itemize{
+\item \code{c()} and \code{concatenateSpectra()}: combines several \code{Spectra} objects into
+a single object. The resulting \code{Spectra} contains all data from all
+individual \code{Spectra}, i.e. the union of all their spectra variables.
+Concatenation will fail if the processing queue of any of the \code{Spectra}
+objects is not empty or if different backends are used for the \code{Spectra}
+objects. In such cases it is suggested to first change the backends of
+all \code{Spectra} to the same type of backend (using the \code{\link[=setBackend]{setBackend()}}
+function) and to eventually (if needed) apply the processing queue using
+the \code{\link[=applyProcessing]{applyProcessing()}} function.
+\item \code{combineSpectra()}: combines sets of spectra (defined with parameter \code{f})
+into a single spectrum per set aggregating their MS data (i.e. their
+\emph{peaks data} matrices with the \emph{m/z} and intensity values of their
+mass peaks). The spectra variable values of the first spectrum per set
+are reported for the combined spectrum. 
The peak matrices of the spectra +per set are combined using the function specified with parameter \code{FUN} +which uses by default the \code{\link[=combinePeaksData]{combinePeaksData()}} function. See the +documentation of \code{\link[=combinePeaksData]{combinePeaksData()}} for details on the aggregation of +the peak data and the package vignette for examples. +The sets of spectra can be specified with parameter \code{f} which is expected +to be a \code{factor} or \code{vector} of length equal to the length of the +\code{Spectra} specifying to which set a spectrum belongs to. The function +returns a \code{Spectra} of length equal to the unique levels of \code{f}. The +optional parameter \code{p} allows to define how the \code{Spectra} should be +split for potential parallel processing. The default is +\code{p = x$dataStorage} and hence a per storage file parallel processing is +applied for \code{Spectra} with on disk data representations (such as the +\code{\link[=MsBackendMzR]{MsBackendMzR()}}). This also prevents that spectra from different data +files/samples are combined (eventually use e.g. \code{p = x$dataOrigin} or any +other spectra variables defining the originating samples for a spectrum). +Before combining the peaks data, all eventual present processing steps are +applied (by calling \code{\link[=applyProcessing]{applyProcessing()}} on the \code{Spectra}). This function +will replace the original \emph{m/z} and intensity values of a \code{Spectra} hence +it can not be called on a \code{Spectra} with a \emph{read-only} backend. In such +cases, the backend should be changed to a \emph{writeable} backend before +using the \code{\link[=setBackend]{setBackend()}} function (to e.g. a \code{\link[=MsBackendMemory]{MsBackendMemory()}} backend). +\item \code{joinSpectraData()}: Individual spectra variables can be directly +added with the \verb{$<-} or \verb{[[<-} syntax. 
The \code{joinSpectraData()}
+function allows to merge a \code{DataFrame} to the existing spectra
+data of a \code{Spectra}. This function diverges from the \code{\link[=merge]{merge()}} method in
+two main ways:
+\itemize{
+\item The \code{by.x} and \code{by.y} column names must be of length 1.
+\item If variable names are shared in \code{x} and \code{y}, the spectra
+variables of \code{x} are not modified. It's only the \code{y}
+variables that are appended with the suffix defined in
+\code{suffix.y}. This is to avoid modifying any core spectra
+variables that would lead to an invalid object.
+\item Duplicated Spectra keys (i.e. \code{x[[by.x]]}) are not
+allowed. Duplicated keys in the \code{DataFrame} (i.e. \code{y[[by.y]]})
+throw a warning and only the last occurrence is kept. These
+should be explored and ideally be removed using e.g.
+\code{QFeatures::reduceDataFrame()}, \code{PMS::reducePSMs()} or similar
+functions.
+}
+\item \code{split()}: splits the \code{Spectra} object based on parameter \code{f} into a \code{list}
+of \code{Spectra} objects.
+}
+}
+\examples{
+
+## Create a Spectra providing a `DataFrame` containing MS data.
+
+spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
+spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2))
+spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8))
+
+s <- Spectra(spd)
+s
+
+## Create a second Spectra from mzML files and use the `MsBackendMzR`
+## on-disk backend.
+sciex_file <- dir(system.file("sciex", package = "msdata"),
+    full.names = TRUE)
+sciex <- Spectra(sciex_file, backend = MsBackendMzR())
+sciex
+
+## Subset to the first 100 spectra to reduce running time of the examples
+sciex <- sciex[1:100]
+
+
+## -------- COMBINE SPECTRA --------
+
+## Combining the `Spectra` object `s` with the MS data from `sciex`.
+## Calling directly `c(s, sciex)` would result in an error because
+## the two objects use different backends. 
We thus have to first change +## the backends to the same backend. We change the backend of the `sciex` +## `Spectra` to a `MsBackendMemory`, the backend used by `s`. + +sciex <- setBackend(sciex, MsBackendMemory()) + +## Combine the two `Spectra` +all <- c(s, sciex) +all + +## The new `Spectra` objects contains the union of spectra variables from +## both: +spectraVariables(all) + +## The spectra variables that were not present in `s`: +setdiff(spectraVariables(all), spectraVariables(s)) + +## The values for these were filled with missing values for spectra from +## `s`: +all$peaksCount |> head() + + +## -------- AGGREGATE SPECTRA -------- + +## Sets of spectra can be combined into a single, representative spectrum +## per set using `combineSpectra()`. This aggregates the peaks data (i.e. +## the spectra's m/z and intensity values) while using the values for all +## spectra variables from the first spectrum per set. Below we define the +## sets as all spectra measured in the *same second*, i.e. rounding their +## retention time to the next closer integer value. +f <- round(rtime(sciex)) +head(f) + +cmp <- combineSpectra(sciex, f = f) + +## The length of `cmp` is now equal to the length of unique levels in `f`: +length(cmp) + +## The spectra variable value from the first spectrum per set is used in +## the representative/combined spectrum: +cmp$rtime + +## The peaks data was aggregated: the number of mass peaks of the first six +## spectra from the original `Spectra`: +lengths(sciex) |> head() + +## and for the first aggreagated spectra: +lengths(cmp) |> head() + +## The default peaks data aggregation method joins all mass peaks. See +## documentation of the `combinePeaksData()` function for more options. + + +## -------- SPLITTING DATA -------- + +## A `Spectra` can be split into a `list` of `Spectra` objects using the +## `split()` function defining the sets into which the `Spectra` should +## be splitted into with parameter `f`. 
+sciex_split <- split(sciex, f) + +length(sciex_split) +sciex_split |> head() + + +## -------- ADDING SPECTRA DATA -------- + +## Adding new spectra variables +sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) +spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging + var1 = rnorm(10), + var2 = sample(letters, 10)) +spv + +sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") + +spectraVariables(sciex2) +spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] +} +\seealso{ +\itemize{ +\item \code{\link[=combinePeaks]{combinePeaks()}} for functions to aggregate mass peaks data. +\item \link{Spectra} for a general description of the \code{Spectra} object. +} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/compareSpectra.Rd b/man/compareSpectra.Rd new file mode 100644 index 00000000..375671c4 --- /dev/null +++ b/man/compareSpectra.Rd @@ -0,0 +1,131 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{compareSpectra} +\alias{compareSpectra} +\alias{compareSpectra,Spectra,Spectra-method} +\alias{compareSpectra,Spectra,missing-method} +\title{Spectra similarity calculations} +\usage{ +\S4method{compareSpectra}{Spectra,Spectra}( + x, + y, + MAPFUN = joinPeaks, + tolerance = 0, + ppm = 20, + FUN = ndotproduct, + ..., + SIMPLIFY = TRUE +) + +\S4method{compareSpectra}{Spectra,missing}( + x, + y = NULL, + MAPFUN = joinPeaks, + tolerance = 0, + ppm = 20, + FUN = ndotproduct, + ..., + SIMPLIFY = TRUE +) +} +\arguments{ +\item{x}{A \code{Spectra} object.} + +\item{y}{A \code{Spectra} object.} + +\item{MAPFUN}{For \code{compareSpectra()}: function to map/match peaks between +the two compared spectra. See \code{\link[=joinPeaks]{joinPeaks()}} for more information and +possible functions. 
Defaults to \code{\link[=joinPeaks]{joinPeaks()}}.} + +\item{tolerance}{\code{numeric(1)} allowing to define a constant maximal +accepted difference between m/z values for peaks to be matched. This +parameter is directly passed to \code{MAPFUN}.} + +\item{ppm}{\code{numeric(1)} defining a relative, m/z-dependent, maximal +accepted difference between m/z values for peaks to be matched. This +parameter is directly passed to \code{MAPFUN}.} + +\item{FUN}{function to compare intensities of peaks between two spectra. +Defaults to \code{\link[=ndotproduct]{ndotproduct()}}.} + +\item{...}{Additional arguments passed to the internal functions.} + +\item{SIMPLIFY}{\code{logical(1)} defining whether the result matrix should be +\emph{simplified} to a \code{numeric} if possible (i.e. if either \code{x} or \code{y} is +of length 1).} +} +\description{ +\code{compareSpectra()} compares each spectrum in \code{x} with each spectrum in \code{y} +using the function provided with \code{FUN} (defaults to \code{\link[=ndotproduct]{ndotproduct()}}). If +\code{y} is missing, each spectrum in \code{x} is compared with each other spectrum +in \code{x}. +The matching/mapping of peaks between the compared spectra is done with the +\code{MAPFUN} function. The default \code{\link[=joinPeaks]{joinPeaks()}} matches peaks of both spectra +and allows to keep all peaks from the first spectrum (\code{type = "left"}), +from the second (\code{type = "right"}), from both (\code{type = "outer"}) and to +keep only matching peaks (\code{type = "inner"}); see \code{\link[=joinPeaks]{joinPeaks()}} for more +information and examples). The \code{MAPFUN} function should have parameters +\code{x}, \code{y}, \code{xPrecursorMz} and \code{yPrecursorMz} as these values are passed to +the function. + +In addition to \code{joinPeaks()} also \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} is supported for +GNPS-like similarity score calculations. 
Note that \code{joinPeaksGnps()} should +only be used in combination with \code{FUN = MsCoreUtils::gnps} +(see \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} for more information and details). Use +\code{MAPFUN = joinPeaksNone} to disable internal peak matching/mapping if a +similarity scoring function is used that performs the matching internally. + +\code{FUN} is supposed to be a function to compare intensities of (matched) +peaks of the two spectra that are compared. The function needs to take two +matrices with columns \code{"mz"} and \code{"intensity"} as input and is supposed +to return a single numeric as result. In addition to the two peak matrices +the spectra's precursor m/z values are passed to the function as parameters +\code{xPrecursorMz} (precursor m/z of the \code{x} peak matrix) and \code{yPrecursorMz} +(precursor m/z of the \code{y} peak matrix). Additional parameters to functions +\code{FUN} and \code{MAPFUN} can be passed with \code{...}. Parameters \code{ppm} and +\code{tolerance} are passed to both \code{MAPFUN} and \code{FUN}. +The function returns a \code{matrix} with the results of \code{FUN} for each +comparison, number of rows equal to \code{length(x)} and number of columns +equal \code{length(y)} (i.e. element in row 2 and column 3 is the result from +the comparison of \code{x[2]} with \code{y[3]}). If \code{SIMPLIFY = TRUE} the \code{matrix} +is \emph{simplified} to a \code{numeric} if length of \code{x} or \code{y} is one. See also +the vignette for additional examples, such as using spectral entropy +similarity in the scoring. +} +\examples{ + +## Load a `Spectra` object with LC-MS/MS data. +fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", + package = "msdata") +sps_dda <- Spectra(fl) +sps_dda + +## Restrict to MS2 (fragment) spectra: +sps_ms2 <- filterMsLevel(sps_dda, msLevel = 2L) + +## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using +## the normalized dotproduct method. 
+res <- compareSpectra(sps_ms2[2:3], sps_ms2[10:20]) +## first row contains comparisons of spectrum 2 with spectra 10 to 20 and +## the second row comparisons of spectrum 3 with spectra 10 to 20 +res + +## We next calculate the pairwise similarity for the first 10 spectra +compareSpectra(sps_ms2[1:10]) + +## Use compareSpectra to determine the number of common (matching) peaks +## with a ppm of 10: +## type = "inner" uses an *inner join* to match peaks, i.e. keeps only +## peaks that can be mapped between both spectra. The provided FUN returns +## simply the number of matching peaks. +compareSpectra(sps_ms2[2:3], sps_ms2[10:20], ppm = 10, type = "inner", + FUN = function(x, y, ...) nrow(x)) + +## We repeat this calculation between all pairwise combinations +## of the first 20 spectra +compareSpectra(sps_ms2[1:20], ppm = 10, type = "inner", + FUN = function(x, y, ...) nrow(x)) +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto +} diff --git a/man/countIdentifications.Rd b/man/countIdentifications.Rd index c7904ef6..08afd04b 100644 --- a/man/countIdentifications.Rd +++ b/man/countIdentifications.Rd @@ -109,6 +109,9 @@ sp <- countIdentifications(sp) ## and three PSMs respectively. table(sp$countIdentifications, sp$msLevel) } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis functions. 
+} \author{ Laurent Gatto } diff --git a/man/estimatePrecursorIntensity.Rd b/man/estimatePrecursorIntensity.Rd index e4a7efd9..8780aab4 100644 --- a/man/estimatePrecursorIntensity.Rd +++ b/man/estimatePrecursorIntensity.Rd @@ -1,21 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Spectra-functions.R -\name{estimatePrecursorIntensity} +% Please edit documentation in R/Spectra.R +\name{estimatePrecursorIntensity,Spectra-method} +\alias{estimatePrecursorIntensity,Spectra-method} \alias{estimatePrecursorIntensity} \title{Estimate Precursor Intensities} \usage{ -estimatePrecursorIntensity( - x, +\S4method{estimatePrecursorIntensity}{Spectra}( + object, ppm = 20, tolerance = 0, method = c("previous", "interpolation"), msLevel. = 2L, - f = dataOrigin(x), + f = dataOrigin(object), BPPARAM = bpparam() ) } \arguments{ -\item{x}{\code{Spectra} with MS1 and MS2 spectra.} +\item{object}{\code{Spectra} with MS1 and MS2 spectra.} \item{ppm}{\code{numeric(1)} with the maximal allowed relative difference of m/z values between the precursor m/z of a spectrum and the m/z of the diff --git a/man/estimatePrecursorMz.Rd b/man/estimatePrecursorMz.Rd index f79bfa24..7bc9e6cd 100644 --- a/man/estimatePrecursorMz.Rd +++ b/man/estimatePrecursorMz.Rd @@ -83,6 +83,9 @@ plot(precursorMz(s), precursorMz(s) - pmz, xlab = "precursor m/z", ## we could then replace the reported precursor m/z values s$precursorMz <- pmz } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis and manipulation functions. 
+} \author{ Mar Garcia-Aloy, Johannes Rainer } diff --git a/man/filterMsLevel.Rd b/man/filterMsLevel.Rd new file mode 100644 index 00000000..0ea3698b --- /dev/null +++ b/man/filterMsLevel.Rd @@ -0,0 +1,689 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R +\name{deisotopeSpectra} +\alias{deisotopeSpectra} +\alias{reduceSpectra} +\alias{filterPrecursorMaxIntensity} +\alias{filterPrecursorIsotopes} +\alias{filterPrecursorPeaks} +\alias{filterMsLevel} +\alias{[,Spectra-method} +\alias{filterAcquisitionNum} +\alias{filterDataOrigin} +\alias{filterDataStorage} +\alias{filterEmptySpectra} +\alias{filterIsolationWindow} +\alias{filterPolarity} +\alias{filterPrecursorCharge} +\alias{filterPrecursorMzRange} +\alias{filterPrecursorMzValues} +\alias{filterPrecursorScan} +\alias{filterRanges} +\alias{filterRt} +\alias{filterValues} +\alias{dropNaSpectraVariables} +\alias{selectSpectraVariables} +\alias{filterIntensity} +\alias{filterMzRange} +\alias{filterMzValues} +\alias{dropNaSpectraVariables,Spectra-method} +\alias{selectSpectraVariables,Spectra-method} +\alias{filterAcquisitionNum,Spectra-method} +\alias{filterEmptySpectra,Spectra-method} +\alias{filterDataOrigin,Spectra-method} +\alias{filterDataStorage,Spectra-method} +\alias{filterFourierTransformArtefacts,Spectra-method} +\alias{filterIntensity,Spectra-method} +\alias{filterIsolationWindow,Spectra-method} +\alias{filterMsLevel,Spectra-method} +\alias{filterMzRange,Spectra-method} +\alias{filterMzValues,Spectra-method} +\alias{filterPolarity,Spectra-method} +\alias{filterPrecursorMz,Spectra-method} +\alias{filterPrecursorMzRange,Spectra-method} +\alias{filterPrecursorMzValues,Spectra-method} +\alias{filterPrecursorCharge,Spectra-method} +\alias{filterPrecursorScan,Spectra-method} +\alias{filterRt,Spectra-method} +\alias{filterRanges,Spectra-method} +\alias{filterValues,Spectra-method} +\title{Filter and subset Spectra objects} +\usage{ +deisotopeSpectra( 
+ x, + substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"), + tolerance = 0, + ppm = 20, + charge = 1 +) + +reduceSpectra(x, tolerance = 0, ppm = 20) + +filterPrecursorMaxIntensity(x, tolerance = 0, ppm = 20) + +filterPrecursorIsotopes( + x, + tolerance = 0, + ppm = 20, + substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL") +) + +filterPrecursorPeaks( + object, + tolerance = 0, + ppm = 20, + mz = c("==", ">="), + msLevel. = uniqueMsLevels(object) +) + +\S4method{dropNaSpectraVariables}{Spectra}(object) + +\S4method{selectSpectraVariables}{Spectra}( + object, + spectraVariables = union(spectraVariables(object), peaksVariables(object)) +) + +\S4method{[}{Spectra}(x, i, j, ..., drop = FALSE) + +\S4method{filterAcquisitionNum}{Spectra}( + object, + n = integer(), + dataStorage = character(), + dataOrigin = character() +) + +\S4method{filterEmptySpectra}{Spectra}(object) + +\S4method{filterDataOrigin}{Spectra}(object, dataOrigin = character()) + +\S4method{filterDataStorage}{Spectra}(object, dataStorage = character()) + +\S4method{filterFourierTransformArtefacts}{Spectra}( + object, + halfWindowSize = 0.05, + threshold = 0.2, + keepIsotopes = TRUE, + maxCharge = 5, + isotopeTolerance = 0.005 +) + +\S4method{filterIntensity}{Spectra}( + object, + intensity = c(0, Inf), + msLevel. = uniqueMsLevels(object), + ... +) + +\S4method{filterIsolationWindow}{Spectra}(object, mz = numeric()) + +\S4method{filterMsLevel}{Spectra}(object, msLevel. = integer()) + +\S4method{filterMzRange}{Spectra}( + object, + mz = numeric(), + msLevel. = uniqueMsLevels(object), + keep = TRUE +) + +\S4method{filterMzValues}{Spectra}( + object, + mz = numeric(), + tolerance = 0, + ppm = 20, + msLevel. 
= uniqueMsLevels(object), + keep = TRUE +) + +\S4method{filterPolarity}{Spectra}(object, polarity = integer()) + +\S4method{filterPrecursorMz}{Spectra}(object, mz = numeric()) + +\S4method{filterPrecursorMzRange}{Spectra}(object, mz = numeric()) + +\S4method{filterPrecursorMzValues}{Spectra}(object, mz = numeric(), ppm = 20, tolerance = 0) + +\S4method{filterPrecursorCharge}{Spectra}(object, z = integer()) + +\S4method{filterPrecursorScan}{Spectra}(object, acquisitionNum = integer(), f = dataOrigin(object)) + +\S4method{filterRt}{Spectra}(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) + +\S4method{filterRanges}{Spectra}( + object, + spectraVariables = character(), + ranges = numeric(), + match = c("all", "any") +) + +\S4method{filterValues}{Spectra}( + object, + spectraVariables = character(), + values = numeric(), + ppm = 0, + tolerance = 0, + match = c("all", "any") +) +} +\arguments{ +\item{x}{\code{Spectra} object.} + +\item{substDefinition}{For \code{deisotopeSpectra()} and +\code{filterPrecursorIsotopes()}: \code{matrix} or \code{data.frame} with definitions +of isotopic substitutions. Uses by default isotopic substitutions +defined from all compounds in the Human Metabolome Database (HMDB). See +\code{\link[=isotopologues]{isotopologues()}} or \code{\link[=isotopicSubstitutionMatrix]{isotopicSubstitutionMatrix()}} in the +\emph{MetaboCoreUtils} for details.} + +\item{tolerance}{For \code{filterMzValues()} and \code{reduceSpectra()}: +\code{numeric(1)} allowing to define a constant maximal accepted difference +between m/z values for peaks to be matched (or grouped). For +\code{containsMz()} it can also be of length equal \code{mz} to specify a different +tolerance for each m/z value. +For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the +(constant) maximal accepted difference of precursor m/z values of +spectra for grouping them into \emph{precursor groups}. 
For +\code{filterPrecursorIsotopes()}: passed directly to the \code{\link[=isotopologues]{isotopologues()}} +function. For \code{filterValues()}: \code{numeric} of any length allowing to +define a maximal accepted difference between user input \code{values} and the +\code{spectraVariables} values. If it is not equal to the length of the +value provided with parameter \code{spectraVariables}, \code{tolerance[1]} will be +recycled. Default is \code{tolerance = 0}.} + +\item{ppm}{For \code{filterMzValues()} and \code{reduceSpectra()}: \code{numeric(1)} +defining a relative, m/z-dependent, maximal accepted difference between +m/z values for peaks to be matched (or grouped). +For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the relative +maximal accepted difference of precursor m/z values of spectra for +grouping them into \emph{precursor groups}. For \code{filterPrecursorIsotopes()}: +passed directly to the \code{\link[=isotopologues]{isotopologues()}} function. +For \code{filterValues()}: \code{numeric} of any length allowing to define +a maximal accepted difference between user input \code{values} and the +\code{spectraVariables} values. If it is not equal to the length of the +value provided with parameter \code{spectraVariables}, \code{ppm[1]} will be +recycled.} + +\item{charge}{For \code{deisotopeSpectra()}: expected charge of the ionized +compounds. See \code{\link[=isotopologues]{isotopologues()}} for details.} + +\item{object}{\code{Spectra} object.} + +\item{mz}{For \code{filterIsolationWindow()}: \code{numeric(1)} with the m/z value to +filter the object. For \code{filterPrecursorMz()} and \code{filterMzRange()}: +\code{numeric(2)} defining the lower and upper m/z boundary. +For \code{filterMzValues()} and \code{filterPrecursorMzValues()}: \code{numeric} with +the m/z values to match peaks or precursor m/z against. 
+For \code{filterPrecursorPeaks()}: \code{character(1)} defining whether mass peaks +with an m/z matching the spectrum's precursor m/z (\code{mz = "=="}, +the default) or mass peaks with an m/z that is equal or larger +(\code{mz = ">="}) should be removed.} + +\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which +the function should be applied (defaults to all MS levels of \code{object}). +For \code{filterMsLevel()}: the MS level to which \code{object} should be +subsetted.} + +\item{spectraVariables}{For \code{selectSpectraVariables()}: \code{character} with the +names of the spectra variables to which the backend should be +subsetted. For \code{filterRanges()} and \code{filterValues()}: \code{character} +vector specifying the column(s) from \code{spectraData(object)} on which +to filter the data and that correspond to the names of the +spectra variables that should be used for the filtering.} + +\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the +object.} + +\item{j}{For \code{[}: not supported.} + +\item{...}{Additional arguments.} + +\item{drop}{For \code{[}: not considered.} + +\item{n}{for \code{filterAcquisitionNum()}: \code{integer} with the acquisition +numbers to filter for.} + +\item{dataStorage}{For \code{filterDataStorage()}: \code{character} to define which +spectra to keep. +For \code{filterAcquisitionNum()}: optionally specify if filtering should +occur only for spectra of selected \code{dataStorage}.} + +\item{dataOrigin}{For \code{filterDataOrigin()}: \code{character} to define which +spectra to keep. 
+For \code{filterAcquisitionNum()}: optionally specify if filtering should +occur only for spectra of selected \code{dataOrigin}.} + +\item{halfWindowSize}{For \code{filterFourierTransformArtefacts()}: \code{numeric(1)} +defining the m/z window left and right of a peak where to remove +fourier transform artefacts.} + +\item{threshold}{For \code{filterFourierTransformArtefacts()}: the relative +intensity (to a peak) below which peaks are considered fourier +artefacts. Defaults to \code{threshold = 0.2} hence removing peaks that +have an intensity below 0.2 times the intensity of the tested peak +(within the selected \code{halfWindowSize}).} + +\item{keepIsotopes}{For \code{filterFourierTransformArtefacts()}: whether isotope +peaks should not be removed as fourier artefacts.} + +\item{maxCharge}{For \code{filterFourierTransformArtefacts()}: the maximum charge +to be considered for isotopes.} + +\item{isotopeTolerance}{For \code{filterFourierTransformArtefacts()}: the m/z +\code{tolerance} to be used to define whether peaks might be isotopes of +the current tested peak.} + +\item{intensity}{For \code{filterIntensity()}: \code{numeric} of length 1 or 2 +defining either the lower or the lower and upper intensity limit for the +filtering, or a \code{function} that takes the intensities as input and +returns a \code{logical} (same length as peaks in the spectrum) whether the +peak should be retained or not. 
Defaults to \code{intensity = c(0, Inf)} thus +only peaks with \code{NA} intensity are removed. + +\item{keep}{For \code{filterMzValues()} and \code{filterMzRange()}: \code{logical(1)} +whether the matching peaks should be retained (\code{keep = TRUE}, the +default) or dropped (\code{keep = FALSE}).} + +\item{polarity}{for \code{filterPolarity()}: \code{integer} specifying the polarity +to subset \code{object}.} + +\item{z}{For \code{filterPrecursorCharge()}: \code{integer()} with the precursor +charges to be used as filter.} + +\item{acquisitionNum}{for \code{filterPrecursorScan()}: \code{integer} with the +acquisition number of the spectra to which the object should be +subsetted.} + +\item{f}{For \code{filterPrecursorScan()}: defining which spectra +belong to the same original data file (sample): Defaults to +\code{f = dataOrigin(x)}.} + +\item{rt}{for \code{filterRt()}: \code{numeric(2)} defining the retention time range to +be used to subset/filter \code{object}.} + +\item{ranges}{for \code{filterRanges()}: A \code{numeric} vector of paired values +(upper and lower boundary) that define the ranges to filter the \code{object}. +These paired values need to be in the same order as the +\code{spectraVariables} parameter (see below).} + +\item{match}{For \code{filterRanges()} and \code{filterValues()}: \code{character(1) } +defining whether the condition has to match for all provided +\code{ranges}/\code{values} (\code{match = "all"}; the default), or for any of them +(\code{match = "any"}) for spectra to be retained.} + +\item{values}{for \code{filterValues()}: A \code{numeric} vector that defines the +values to filter the Spectra data. These values need to be in the same +order as the \code{spectraVariables} parameter.} +} +\description{ +A variety of functions to filter or subset \code{Spectra} objects are available. 
+These can be generally separated into two main classes: I) \emph{classical} +subset operations that immediately reduce the number of spectra in the +object and II) filters that reduce the \strong{content} of the object without +changing its length (i.e. the number of spectra). The latter can be further +subdivided into functions that affect the content of the \code{spectraData} (i.e. +the general spectrum metadata) and those that reduce the content of the +object's \code{peaksData} (i.e. the m/z and intensity values of a spectrum's +mass peaks). + +A description of functions from these 3 different categories are given below +in sections \emph{Subset \code{Spectra}}, \emph{Filter content of \code{spectraData()}} and +\emph{Filter content of \code{peaksData()}}, respectively. +} +\section{Subset \code{Spectra}}{ + + +These functions affect the number of spectra in a \code{Spectra} object creating +a subset of the original object without affecting its content. +\itemize{ +\item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method +\strong{always} returns a \code{Spectra} object. +\item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching +the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or +\code{dataStorage} is also provided, \code{object} is subsetted to the spectra with +an acquisition number equal to \code{n} \strong{in spectra with matching dataOrigin +or dataStorage values} retaining all other spectra. +Returns the filtered \code{Spectra}. +\item \code{filterDataOrigin()}: filters the object retaining spectra matching the +provided \code{dataOrigin}. Parameter \code{dataOrigin} has to be of type +\code{character} and needs to match exactly the data origin value of the +spectra to subset. +Returns the filtered \code{Spectra} object (with spectra ordered according to +the provided \code{dataOrigin} parameter). 
+\item \code{filterDataStorage()}: filters the object retaining spectra stored in the +specified \code{dataStorage}. Parameter \code{dataStorage} has to be of type +\code{character} and needs to match exactly the data storage value of the +spectra to subset. +Returns the filtered \code{Spectra} object (with spectra ordered according to +the provided \code{dataStorage} parameter). +\item \code{filterEmptySpectra()}: removes empty spectra (i.e. spectra without peaks). +Returns the filtered \code{Spectra} object (with spectra in their +original order). +\item \code{filterIsolationWindow()}: retains spectra that contain \code{mz} in their +isolation window m/z range (i.e. with an \code{isolationWindowLowerMz} <= \code{mz} +and \code{isolationWindowUpperMz} >= \code{mz}. Returns the filtered \code{Spectra} +object (with spectra in their original order). +\item \code{filterMsLevel()}: filters object by MS level keeping only spectra matching +the MS level specified with argument \code{msLevel}. Returns the filtered +\code{Spectra} (with spectra in their original order). +\item \code{filterPolarity()}: filters the object keeping only spectra matching the +provided polarity. Returns the filtered \code{Spectra} (with spectra in their +original order). +\item \code{filterPrecursorCharge()}: retains spectra with the defined precursor +charge(s). +\item \code{filterPrecursorIsotopes()}: groups MS2 spectra based on their precursor +m/z and precursor intensity into predicted isotope groups and keep for each +only the spectrum representing the monoisotopic precursor. MS1 spectra +are returned as is. See documentation for \code{deisotopeSpectra()} below for +details on isotope prediction and parameter description. +\item \code{filterPrecursorMaxIntensity()}: filters the \code{Spectra} keeping for groups +of (MS2) spectra with similar precursor m/z values (given parameters +\code{ppm} and \code{tolerance}) the one with the highest precursor intensity. 
The +function filters only MS2 spectra and returns all MS1 spectra. If +precursor intensities are \code{NA} for all spectra within a spectra group, the +first spectrum of that groups is returned. +Note: some manufacturers don't provide precursor intensities. These can +however also be estimated with \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}}. +\item \code{filterPrecursorMzRange()} (previously \code{filterPrecursorMz()} which is now +deprecated): retains spectra with a precursor m/z within the +provided m/z range. See examples for details on selecting spectra with +a precursor m/z for a target m/z accepting a small difference in \emph{ppm}. +\item \code{filterPrecursorMzValues()}: retains spectra with precursor m/z matching +any of the provided m/z values (given \code{ppm} and \code{tolerance}). Spectra with +missing precursor m/z value (e.g. MS1 spectra) are dropped. +\item \code{filterPrecursorScan()}: retains parent (e.g. MS1) and children scans (e.g. +MS2) of acquisition number \code{acquisitionNum}. Returns the filtered +\code{Spectra} (with spectra in their original order). Parameter \code{f} allows to +define which spectra belong to the same sample or original data file ( +defaults to \code{f = dataOrigin(object)}). +\item \code{filterRanges()}: allows filtering of the \code{Spectra} object based on user +defined \emph{numeric} ranges (parameter \code{ranges}) for one or more available +spectra variables in object (spectra variable names can be specified with +parameter \code{spectraVariables}). Spectra for which the value of a spectra +variable is within it's defined range are retained. If multiple +ranges/spectra variables are defined, the \code{match} parameter can be used +to specify whether all conditions (\code{match = "all"}; the default) or if +any of the conditions must match (\code{match = "any"}; all spectra for which +values are within any of the provided ranges are retained). 
+\item \code{filterRt()}: retains spectra of MS level \code{msLevel} with retention +times (in seconds) within (\code{>=}) \code{rt[1]} and (\code{<=}) +\code{rt[2]}. Returns the filtered \code{Spectra} (with spectra in their +original order). +\item \code{filterValues()}: allows filtering of the \code{Spectra} object based on +similarities of \emph{numeric} values of one or more \code{spectraVariables(object)} +(parameter \code{spectraVariables}) to provided values (parameter \code{values}) +given acceptable differences (parameters tolerance and ppm). If multiple +values/spectra variables are defined, the \code{match} parameter can be used +to specify whether all conditions (\code{match = "all"}; the default) or if +any of the conditions must match (\code{match = "any"}; all spectra for which +values are within any of the provided ranges are retained). +} +} + +\section{Filter content of \code{spectraData()}}{ + + +The functions described in this section filter the content from a +\code{Spectra}'s spectra data, i.e. affect values of, or complete, spectra +variables. None of these functions reduces the object's number of spectra. +\itemize{ +\item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the +object's \code{spectraData} that contain only missing values (\code{NA}). Note that +while columns with only \code{NA}s are removed, a \code{spectraData()} call after +\code{dropNaSpectraVariables()} might still show columns containing \code{NA} values +for \emph{core} spectra variables. The total number of spectra is not changed +by this function. +\item \code{selectSpectraVariables()}: reduces the information within the object to +the selected spectra variables: all data for variables not specified will +be dropped. For mandatory columns (i.e., those listed by +\code{\link[=coreSpectraVariables]{coreSpectraVariables()}}, such as \emph{msLevel}, \emph{rtime} ...) only +the values will be dropped but not the variable itself. 
Additional (or +user defined) spectra variables will be completely removed. +Returns the filtered \code{Spectra}. +} +} + +\section{Filter content of \code{peaksData()}}{ + + +The functions described in this section filter the content of the +\code{Spectra}'s peaks data, i.e. either the number or the values (\emph{m/z} or +intensity values) of the mass peaks. Also, the actual operation is only +executed once peaks data is accessed (through \code{peaksData()}, +\code{mz()} or \code{intensity()}) or \code{applyProcessing()} is called. +These operations don't affect the number of spectra in the \code{Spectra} object. +\itemize{ +\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the +monoisotopic peak for groups of isotopologues. Isotopologues are +estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the +\emph{MetaboCoreUtils} package. Note that +the default parameters for isotope prediction/detection have been +determined using data from the Human Metabolome Database (HMDB) and +isotopes for elements other than CHNOPS might not be detected. See +parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for +more information. The approach and code to define the parameters for +isotope prediction is described +\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. +\item \code{filterFourierTransformArtefacts()}: removes (Orbitrap) fast fourier +artefact peaks from spectra (see examples below). The function iterates +through all intensity ordered peaks in a spectrum and removes all peaks +with an m/z within +/- \code{halfWindowSize} of the current peak if their +intensity is lower than \code{threshold} times the current peak's intensity. 
+Additional parameters \code{keepIsotopes}, \code{maxCharge} and \code{isotopeTolerance} +allow to avoid removing of potential \verb{[13]C} isotope peaks (\code{maxCharge} +being the maximum charge that should be considered and \code{isotopeTolerance} +the absolute acceptable tolerance for matching their m/z). +See \code{\link[=filterFourierTransformArtefacts]{filterFourierTransformArtefacts()}} for details and background and +\code{deisotopeSpectra()} for an alternative. +\item \code{filterIntensity()}: filters mass peaks in each spectrum keeping only +those with intensities that are within the provided range or match the +criteria of the provided function. For the former, parameter \code{intensity} +has to be a \code{numeric} defining the intensity range, for the latter a +\code{function} that takes the intensity values of the spectrum and returns +a \code{logical} whether the peak should be retained or not (see examples +below for details) - additional parameters to the function can be passed +with \code{...}. +To remove only peaks with intensities below a certain threshold, say +100, use \code{intensity = c(100, Inf)}. Note: also a single value can be +passed with the \code{intensity} parameter in which case an upper limit of +\code{Inf} is used. +Note that this function removes also peaks with missing intensities +(i.e. an intensity of \code{NA}). Parameter \code{msLevel.} allows to restrict the +filtering to spectra of the specified MS level(s). +\item \code{filterMzRange()}: filters mass peaks in the object keeping or removing +those in each spectrum that are within the provided m/z range. Whether +peaks are retained or removed can be configured with parameter \code{keep} +(default \code{keep = TRUE}). +\item \code{filterMzValues()}: filters mass peaks in the object keeping all +peaks in each spectrum that match the provided m/z value(s) (for +\code{keep = TRUE}, the default) or removing all of them (for \code{keep = FALSE}). 
+The m/z matching considers also the absolute \code{tolerance} and m/z-relative +\code{ppm} values. \code{tolerance} and \code{ppm} have to be of length 1. +\item \code{filterPeaksRanges()}: filters mass peaks of a \code{Spectra} object using any +set of range-based filters on numeric spectra or peaks variables. See +\code{\link[=filterPeaksRanges]{filterPeaksRanges()}} for more information. +\item \code{filterPrecursorPeaks()}: removes peaks from each spectrum in \code{object} with +an m/z equal or larger than the m/z of the precursor, depending on the +value of parameter \code{mz}: for \code{mz = "=="} (the default) peaks with matching m/z (considering an absolute and relative acceptable difference depending on \code{tolerance} and \code{ppm}, respectively) are removed. For \code{mz = ">="} all peaks with an m/z larger or equal to the precursor m/z (minus \code{tolerance} and the \code{ppm} of the precursor m/z) are removed. Parameter \code{msLevel.} allows to restrict the filter to certain MS levels (by default the filter is applied to all MS levels). Note that no peaks are removed if the precursor m/z is \code{NA} (e.g. typically for MS1 spectra). +\item \code{reduceSpectra()}: keeps for groups of peaks with similar m/z values +(given \code{ppm} and \code{tolerance}) in each spectrum only the mass peak with the +highest intensity removing all other peaks hence \emph{reducing} each +spectrum to the highest intensity peaks per \emph{peak group}. +Peak groups are defined using the \code{\link[=group]{group()}} function from the +\emph{MsCoreUtils} package. See also the \code{\link[=combinePeaks]{combinePeaks()}} function for an +alternative function to combine peaks within each spectrum. +} +} + +\examples{ + +## Load a `Spectra` object with LC-MS/MS data. 
+fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", + package = "msdata") +sps_dda <- Spectra(fl) +sps_dda + + +## -------- SUBSET SPECTRA -------- + +## Subset to the first 3 spectra +tmp <- sps_dda[1:3] +tmp +length(tmp) + +## Subset to all MS2 spectra; this could be done with [, or, more +## efficiently, with the `filterMsLevel` function: +sps_dda[msLevel(sps_dda) == 2L] +filterMsLevel(sps_dda, 2L) + +## Filter the object keeping only MS2 spectra with a precursor m/z value +## within a specified range: +filterPrecursorMzRange(sps_dda, c(80, 90)) + +## Filter the object to MS2 spectra with a precursor m/z matching a +## pre-defined value (given ppm and tolerance) +filterPrecursorMzValues(sps_dda, 85, ppm = 5, tolerance = 0.1) + +## The `filterRanges()` function allows to filter a `Spectra` based on +## numerical ranges of any of its (numerical) spectra variables. +## First, determine the variable(s) on which to base the filtering: +sv <- c("rtime", "precursorMz", "peaksCount") +## Note that ANY variables can be chosen here, and as many as wanted. + +## Define the ranges (pairs of values with lower and upper boundary) to be +## used for the individual spectra variables. The first two values will be +## used for the first spectra variable (e.g., `"rtime"` here), the next two +## for the second (e.g. 
`"precursorMz"` here) and so on: +ranges <- c(30, 350, 200, 500, 350, 600) + +## Input the parameters within the filterRanges function: +filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, + ranges = ranges) +filt_spectra + +## `filterRanges()` can also be used to filter a `Spectra` object with +## multiple ranges for the same `spectraVariable` (e.g, here `"rtime"`) +sv <- c("rtime", "rtime") +ranges <- c(30, 100, 200, 300) +filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, + ranges = ranges, match = "any") +filt_spectra + +## While `filterRanges()` filtered on numeric ranges, `filterValues()` +## allows to filter an object matching spectra variable values to user +## provided values (allowing to configure allowed differences using the +## `ppm` and `tolerance` parameters). +## First determine the variable(s) on which to base the filtering: +sv <- c("rtime", "precursorMz") +## Note that ANY variables can be chosen here, and as many as wanted. + +## Define the values that will be used to filter the spectra based on their +## similarities to their respective `spectraVariables`. +## The first values in the parameters values, tolerance and ppm will be +## used for the first spectra variable (e.g. `"rtime"` here), the next for +## the second (e.g. `"precursorMz"` here) and so on: +values <- c(350, 80) +tolerance <- c(100, 0.1) +ppm <- c(0, 50) + +## Input the parameters within the `filterValues()` function: +filt_spectra <- filterValues(sps_dda, spectraVariables = sv, + values = values, tolerance = tolerance, ppm = ppm) +filt_spectra + + +## -------- FILTER SPECTRA DATA -------- + +## Remove spectra variables without content (i.e. 
with only missing values) +sps_noNA <- dropNaSpectraVariables(sps_dda) + +## This reduced the size of the object slightly +print(object.size(sps_dda), units = "MB") +print(object.size(sps_noNA), units = "MB") + +## With the `selectSpectraVariables()` function it is in addition possible +## to subset the data of a `Spectra` to the selected columns/variables, +## keeping only their data: +tmp <- selectSpectraVariables(sps_dda, c("msLevel", "mz", "intensity", + "scanIndex")) +print(object.size(tmp), units = "MB") + +## Except the selected variables, all data is now removed. Accessing +## core spectra variables still works, but returns only NA +rtime(tmp) |> head() + + +## -------- FILTER PEAKS DATA -------- + +## `filterMzValues()` filters the mass peaks data of a `Spectra` retaining +## only those mass peaks with an m/z value matching the provided value(s). +sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), tolerance = 0.3) + +## The filtered `Spectra` has the same length +length(sps_dda) +length(sps_sub) + +## But the number of mass peaks changed +lengths(sps_dda) |> head() +lengths(sps_sub) |> head() + +## This function can also be used to remove specific peaks from a spectrum +## by setting `keep = FALSE`. +sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), + tolerance = 0.3, keep = FALSE) +lengths(sps_sub) |> head() + +## With the `filterMzRange()` function it is possible to keep (or remove) +## mass peaks with m/z values within a specified numeric range. +sps_sub <- filterMzRange(sps_dda, mz = c(100, 150)) +lengths(sps_sub) |> head() + +## See also the `filterPeaksRanges()` function for a more flexible framework +## to filter mass peaks + + +## Removing fourier transform artefacts seen in Orbitrap data. + +## Loading an Orbitrap spectrum with artefacts. 
+data(fft_spectrum) +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + +fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) +fft_spectrum +plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + +## Using a few examples peaks in your data you can optimize the parameters +fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, + halfWindowSize = 0.2, + threshold = 0.005, + keepIsotopes = TRUE, + maxCharge = 5, + isotopeTolerance = 0.005 + ) + +fft_spectrum_filtered +length(mz(fft_spectrum_filtered)[[1]]) +plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) + + +## *Reducing* a `Spectra` keeping for groups of mass peaks (characterized +## by similarity of their m/z values) only one representative peak. This +## function helps cleaning fragment spectra. +## Filter the data set to MS2 spectra +ms2 <- filterMsLevel(sps_dda, 2L) + +## For groups of fragment peaks with a difference in m/z < 0.1, keep only +## the largest one. +ms2_red <- reduceSpectra(ms2, ppm = 0, tolerance = 0.1) +lengths(ms2) |> tail() +lengths(ms2_red) |> tail() +} +\seealso{ +\itemize{ +\item \code{\link[=combineSpectra]{combineSpectra()}} for functions to combine or aggregate \code{Spectra}. 
+\item \code{\link[=combinePeaks]{combinePeaks()}} for functions to combine or aggregate a \code{Spectra}'s
+\code{peaksData()}
+}
+}
+\author{
+Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf
+}
diff --git a/man/filterPeaksRanges.Rd b/man/filterPeaksRanges.Rd
new file mode 100644
index 00000000..db713c3b
--- /dev/null
+++ b/man/filterPeaksRanges.Rd
@@ -0,0 +1,142 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Spectra-functions.R
+\name{filterPeaksRanges}
+\alias{filterPeaksRanges}
+\title{Filter peaks based on spectra and peaks variable ranges}
+\usage{
+filterPeaksRanges(object, ..., keep = TRUE)
+}
+\arguments{
+\item{object}{A \link{Spectra} object.}

+\item{...}{the ranges for the spectra and/or peaks variables. Has to be
+provided as \verb{<name> = <range>} pairs with \verb{<name>} being the name of a
+spectra or peaks variable (of numeric data type) and \verb{<range>} being
+either a \code{numeric} of length 2 or a \code{numeric} two column matrix (see
+function description above for details),}
+
+\item{keep}{\code{logical(1)} whether to keep (default) or remove peaks that
+match the provided range(s).}
+}
+\description{
+The \code{filterPeaksRanges()} function allows to filter the peaks matrices of a
+\link{Spectra} object using any set of range-based filters on numeric spectra
+variables or peaks variables. These ranges can be passed to the function
+using the \code{...} as \verb{<name> = <range>} pairs. \verb{<name>}
+has to be an available spectra or peaks variable. \verb{<range>} can be a
+\code{numeric} of length 2 defining the lower and upper boundary, or a \code{numeric}
+two-column matrix (multi-row matrices are also supported, see further
+below). \code{filterPeaksRanges(s, mz = c(200, 300))} would for example reduce
+the peaks matrices of the \code{Spectra} object \code{s} to mass peaks with an m/z
+value between 200 and 300. 
\code{filterPeaksRanges()} returns the original +\code{Spectra} object with the filter operation added to the processing queue. +Thus, the filter gets \strong{only} applied when the peaks data gets extracted +with \code{mz()}, \code{intensity()} or \code{peaksData()}. If ranges for both spectra +\strong{and} peaks variables are defined, the function evaluates first whether +the spectra variable value for a spectrum is within the provided range and, +if so, applies also the peaks variable-based filter (otherwise an empty +peaks matrix is returned). + +If more than one spectra variable and/or peaks variable are defined, their +filter results are combined with a logical AND: a peak matrix is only +returned for a spectrum if all values of spectra variables are within the +provided (respective) ranges for spectra variables, and this matrix is +further filtered to contain only those peaks which values are within the +provided peaks variable ranges. + +\strong{Filtering with multiple ranges} per spectra and peaks variables is also +supported: ranges can also be provided as multi-row numeric (two-column) +matrices. In this case, the above described procedure is applied for each +row separately and their results are combined with a logical OR, i.e. +peaks matrices are returned that match any of the conditions/filters +of a row. The number of rows of the provided ranges (being it for spectra +or peaks variables) have to match. + +\strong{Missing value handling}: any comparison which involves a missing value +(being it a spectra variable value, a peaks variable value or a value +in one of the provided ranges) is treated as a logical \code{FALSE}. For +example, if the retention time of a spectrum is \code{NA} and the data is +filtered using a retention time range, an empty peaks matrix is returned +(for \code{keep = TRUE}, for \code{keep = FALSE} the full peaks matrix is returned). 
+} +\note{ +In contrast to some other \emph{filter} functions, this function does not provide +a \code{msLevel} parameter that allows to define the MS level of spectra on which +the filter should be applied. The filter(s) will always be applied to +\strong{all} spectra (irrespectively of their MS level). Through combination of +multiple filter ranges it is however possible to apply MS level-dependent +filters (see examples below for details). + +The filter will not be applied immediately to the data but only executed when +the mass peak data is accessed (through \code{peaksData()}, \code{mz()} or +\code{intensity()}) or by calling \code{applyProcessing()}. +} +\examples{ + +## Define a test Spectra +d <- data.frame(rtime = c(123.2, 134.2), msLevel = c(1L, 2L)) +d$mz <- list(c(100.1, 100.2, 100.3, 200.1, 200.2, 300.3), + c(100.3, 100.4, 200.2, 400.3, 400.4)) +## Use the index of the mass peak within the spectrum as index for +## better illustration of filtering results +d$intensity <- list(c(1:6), 1:5) +s <- Spectra(d) +s + +## Filter peaks removing all mass peaks with an m/z between 200 and 300 +res <- filterPeaksRanges(s, mz = c(200, 300), keep = FALSE) +res + +## The Spectra object has still the same length and spectra variables +length(res) +res$rtime + +## The filter gets applied when mass peak data gets extracted, using either +## `mz()`, `intensity()` or `peaksData()`. The filtered peaks data does +## not contain any mass peaks with m/z values between 200 and 300: +peaksData(res)[[1L]] +peaksData(res)[[2L]] + +## We next combine spectra and filter variables. We want to keep only mass +## peaks of MS2 spectra that have an m/z between 100 and 110. +res <- filterPeaksRanges(s, mz = c(100, 110), msLevel = c(2, 2)) +res +length(res) + +## Only data for peaks are returned for which the spectra's MS level is +## between 2 and 2 and with an m/z between 100 and 110. 
The peaks data for
+## the first spectrum, that has MS level 1, is thus empty:
+peaksData(res)[[1L]]
+
+## While the peaks matrix for the second spectrum (with MS level 2) contains
+## the mass peaks with m/z between 100 and 110.
+peaksData(res)[[2L]]
+
+## To keep also the peaks data for the first spectrum, we need to define
+## an additional set of ranges, which we define using a second row in each
+## ranges matrix. We use the same filter as above, i.e. keeping only mass
+## peaks with an m/z between 100 and 110 for spectra with MS level 2, but
+## add an additional row for MS level 1 spectra keeping mass peaks with an
+## m/z between 0 and 1000. Filter results of different rows are combined
+## using a logical OR, i.e. peaks matrices with mass peaks are returned
+## matching either the first, or the second row.
+res <- filterPeaksRanges(s, mz = rbind(c(100, 110), c(0, 1000)),
+                         msLevel = rbind(c(2, 2), c(1, 1)))
+
+## The results for the MS level 2 spectrum are the same as before, but with
+## the additional row we keep the full peaks matrix of the MS1 spectrum:
+peaksData(res)[[1L]]
+peaksData(res)[[2L]]
+
+## As a last example we define a filter that keeps all mass peaks with an
+## m/z either between 100 and 200, or between 300 and 400.
+res <- filterPeaksRanges(s, mz = rbind(c(100, 200), c(300, 400)))
+peaksData(res)[[1L]]
+peaksData(res)[[2L]]
+
+## Such filters could thus be defined to restrict/filter the MS data to
+## specific e.g. retention time and m/z ranges. 
+} +\author{ +Johannes Rainer +} diff --git a/man/hidden_aliases.Rd b/man/hidden_aliases.Rd index de5a31b8..c03adb62 100644 --- a/man/hidden_aliases.Rd +++ b/man/hidden_aliases.Rd @@ -8,17 +8,9 @@ \alias{[,MsBackendDataFrame-method} \alias{ppm} \alias{bin,numeric-method} -\alias{containsMz} -\alias{containsNeutralLoss} -\alias{dropNaSpectraVariables} -\alias{entropy} -\alias{export} -\alias{pickPeaks} -\alias{replaceIntensitiesBelow} -\alias{reset} -\alias{selectSpectraVariables} \alias{show,MsBackendDataFrame-method} \alias{backendMerge,MsBackendDataFrame-method} +\alias{backendRequiredSpectraVariables,MsBackendDataFrame-method} \alias{acquisitionNum,MsBackendDataFrame-method} \alias{peaksData,MsBackendDataFrame-method} \alias{centroided,MsBackendDataFrame-method} @@ -29,6 +21,7 @@ \alias{dataOrigin<-,MsBackendDataFrame-method} \alias{dataStorage,MsBackendDataFrame-method} \alias{dataStorage<-,MsBackendDataFrame-method} +\alias{extractByIndex,MsBackendDataFrame,ANY-method} \alias{intensity,MsBackendDataFrame-method} \alias{intensity<-,MsBackendDataFrame-method} \alias{isEmpty,MsBackendDataFrame-method} @@ -69,6 +62,7 @@ \alias{cbind2,MsBackendDataFrame,dataframeOrDataFrameOrmatrix-method} \alias{split,MsBackendDataFrame,ANY-method} \alias{filterAcquisitionNum,MsBackendDataFrame-method} +\alias{backendRequiredSpectraVariables,MsBackendHdf5Peaks-method} \alias{backendInitialize,MsBackendHdf5Peaks-method} \alias{show,MsBackendHdf5Peaks-method} \alias{peaksData,MsBackendHdf5Peaks-method} @@ -85,9 +79,11 @@ \alias{spectraData<-,MsBackendHdf5Peaks-method} \alias{$<-,MsBackendHdf5Peaks-method} \alias{[,MsBackendHdf5Peaks-method} +\alias{extractByIndex,MsBackendHdf5Peaks,ANY-method} \alias{backendMerge,MsBackendHdf5Peaks-method} \alias{show,MsBackendMemory-method} \alias{backendMerge,MsBackendMemory-method} +\alias{backendRequiredSpectraVariables,MsBackendMemory-method} \alias{acquisitionNum,MsBackendMemory-method} \alias{centroided,MsBackendMemory-method} 
\alias{centroided<-,MsBackendMemory-method} @@ -97,6 +93,7 @@ \alias{dataOrigin<-,MsBackendMemory-method} \alias{dataStorage,MsBackendMemory-method} \alias{dataStorage<-,MsBackendMemory-method} +\alias{extractByIndex,MsBackendMemory,ANY-method} \alias{intensity,MsBackendMemory-method} \alias{intensity<-,MsBackendMemory-method} \alias{ionCount,MsBackendMemory-method} @@ -140,6 +137,7 @@ \alias{cbind2,MsBackendMemory,dataframeOrDataFrameOrmatrix-method} \alias{split,MsBackendMemory,ANY-method} \alias{filterAcquisitionNum,MsBackendMemory-method} +\alias{backendRequiredSpectraVariables,MsBackendMzR-method} \alias{backendInitialize,MsBackendMzR-method} \alias{show,MsBackendMzR-method} \alias{peaksData,MsBackendMzR-method} @@ -172,28 +170,12 @@ .check = TRUE ) -containsMz(object, ...) - -containsNeutralLoss(object, ...) - -dropNaSpectraVariables(object, ...) - -entropy(object, ...) - -export(object, ...) - -pickPeaks(object, ...) - -replaceIntensitiesBelow(object, threshold = min, ...) - -reset(object, ...) - -selectSpectraVariables(object, ...) - \S4method{show}{MsBackendDataFrame}(object) \S4method{backendMerge}{MsBackendDataFrame}(object, ...) +\S4method{backendRequiredSpectraVariables}{MsBackendDataFrame}(object, ...) + \S4method{acquisitionNum}{MsBackendDataFrame}(object) \S4method{peaksData}{MsBackendDataFrame}(object, columns = c("mz", "intensity")) @@ -214,6 +196,8 @@ selectSpectraVariables(object, ...) \S4method{dataStorage}{MsBackendDataFrame}(object) <- value +\S4method{extractByIndex}{MsBackendDataFrame,ANY}(object, i) + \S4method{intensity}{MsBackendDataFrame}(object) \S4method{intensity}{MsBackendDataFrame}(object) <- value @@ -301,6 +285,8 @@ selectSpectraVariables(object, ...) dataOrigin = character() ) +\S4method{backendRequiredSpectraVariables}{MsBackendHdf5Peaks}(object, ...) + \S4method{backendInitialize}{MsBackendHdf5Peaks}( object, files = character(), @@ -340,12 +326,16 @@ selectSpectraVariables(object, ...) 
\S4method{[}{MsBackendHdf5Peaks}(x, i, j, ..., drop = FALSE) +\S4method{extractByIndex}{MsBackendHdf5Peaks,ANY}(object, i) + \S4method{backendMerge}{MsBackendHdf5Peaks}(object, ...) \S4method{show}{MsBackendMemory}(object) \S4method{backendMerge}{MsBackendMemory}(object, ...) +\S4method{backendRequiredSpectraVariables}{MsBackendMemory}(object, ...) + \S4method{acquisitionNum}{MsBackendMemory}(object) \S4method{centroided}{MsBackendMemory}(object) @@ -364,6 +354,8 @@ selectSpectraVariables(object, ...) \S4method{dataStorage}{MsBackendMemory}(object) <- value +\S4method{extractByIndex}{MsBackendMemory,ANY}(object, i) + \S4method{intensity}{MsBackendMemory}(object) \S4method{intensity}{MsBackendMemory}(object) <- value @@ -455,6 +447,8 @@ selectSpectraVariables(object, ...) dataOrigin = character() ) +\S4method{backendRequiredSpectraVariables}{MsBackendMzR}(object, ...) + \S4method{backendInitialize}{MsBackendMzR}(object, files, ..., BPPARAM = bpparam()) \S4method{show}{MsBackendMzR}(object) diff --git a/man/joinPeaks.Rd b/man/joinPeaks.Rd index 29cabc8d..bc1fa688 100644 --- a/man/joinPeaks.Rd +++ b/man/joinPeaks.Rd @@ -142,7 +142,12 @@ joinPeaksGnps(x, y, pmz_x, pmz_y) joinPeaksGnps(x, y, pmz_x, yPrecursorMz = NA) } \seealso{ -\code{\link[=gnps]{gnps()}} +\itemize{ +\item \code{\link[=compareSpectra]{compareSpectra()}} for the function to calculate similarities between +spectra. +\item \code{\link[=gnps]{gnps()}} in the \emph{MsCoreUtils} package for more information on the GNPS +similarity score. 
+} } \author{ Johannes Rainer, Michael Witting diff --git a/man/neutralLoss.Rd b/man/neutralLoss.Rd index da1a887e..d27cd3c8 100644 --- a/man/neutralLoss.Rd +++ b/man/neutralLoss.Rd @@ -1,13 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/AllGenerics.R, R/Spectra-neutralLoss.R +% Please edit documentation in R/Spectra-neutralLoss.R \name{neutralLoss} \alias{neutralLoss} \alias{PrecursorMzParam} \alias{neutralLoss,Spectra,PrecursorMzParam-method} \title{Calculate Neutral Loss Spectra} \usage{ -neutralLoss(object, param, ...) - PrecursorMzParam( filterPeaks = c("none", "abovePrecursor", "belowPrecursor", "removePrecursor"), msLevel = c(2L, NA_integer_), @@ -18,13 +16,6 @@ PrecursorMzParam( \S4method{neutralLoss}{Spectra,PrecursorMzParam}(object, param, ...) } \arguments{ -\item{object}{\code{\link[=Spectra]{Spectra()}} object with the fragment spectra for which neutral -loss spectra should be calculated.} - -\item{param}{One of the \emph{parameter} objects discussed below.} - -\item{...}{Currently ignored.} - \item{filterPeaks}{For \code{PrecursorMzParam()}: \code{character(1)} or \code{function} defining if and how fragment peaks should be filtered before calculation. Pre-defined options are: \code{"none"} (keep all peaks), \code{"abovePrecursor"} @@ -47,6 +38,13 @@ for details.} \item{tolerance}{\code{numeric(1)} with absolute acceptable difference in m/z values to filter peaks. Defaults to \code{tolerance = 0}. See function description for details.} + +\item{object}{\code{\link[=Spectra]{Spectra()}} object with the fragment spectra for which neutral +loss spectra should be calculated.} + +\item{param}{One of the \emph{parameter} objects discussed below.} + +\item{...}{Currently ignored.} } \value{ A \code{\link[=Spectra]{Spectra()}} object with calculated neutral loss spectra. @@ -136,6 +134,9 @@ Aisporna A, Benton PH, Chen A, Derks RJE, Galano JM, Giera M and Siuzdak G Analysis in METLIN. 
Journal of the American Society for Mass Spectrometry. \doi{10.1021/jasms.1c00343} } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis and manipulation functions. +} \author{ Johannes Rainer } diff --git a/man/processingChunkSize.Rd b/man/processingChunkSize.Rd index b47d8c69..a9382611 100644 --- a/man/processingChunkSize.Rd +++ b/man/processingChunkSize.Rd @@ -1,9 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Spectra-functions.R +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R \name{processingChunkSize} \alias{processingChunkSize} \alias{processingChunkSize<-} \alias{processingChunkFactor} +\alias{backendBpparam,Spectra-method} \title{Parallel and chunk-wise processing of \code{Spectra}} \usage{ processingChunkSize(x) @@ -11,11 +12,18 @@ processingChunkSize(x) processingChunkSize(x) <- value processingChunkFactor(x) + +\S4method{backendBpparam}{Spectra}(object, BPPARAM = bpparam()) } \arguments{ \item{x}{\code{Spectra}.} \item{value}{\code{integer(1)} defining the chunk size.} + +\item{object}{\code{Spectra} object.} + +\item{BPPARAM}{Parallel setup configuration. 
See \code{\link[=bpparam]{bpparam()}} for more +information.} } \value{ \code{processingChunkSize()} returns the currently defined processing diff --git a/man/spectraData.Rd b/man/spectraData.Rd new file mode 100644 index 00000000..2aad735f --- /dev/null +++ b/man/spectraData.Rd @@ -0,0 +1,601 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Spectra.R +\name{spectraData} +\alias{spectraData} +\alias{acquisitionNum} +\alias{centroided} +\alias{collisionEnergy} +\alias{dataOrigin} +\alias{dataStorage} +\alias{intensity} +\alias{ionCount} +\alias{isCentroided} +\alias{isEmpty} +\alias{isolationWindowLowerMz} +\alias{isolationWindowUpperMz} +\alias{isolationWindowTargetMz} +\alias{lengths} +\alias{msLevel} +\alias{mz} +\alias{peaksData} +\alias{peaksVariables} +\alias{polarity} +\alias{precursorCharge} +\alias{precursorIntensity} +\alias{precursorMz} +\alias{rtime} +\alias{scanIndex} +\alias{smoothed} +\alias{spectraNames} +\alias{spectraVariables} +\alias{tic} +\alias{uniqueMsLevels} +\alias{asDataFrame} +\alias{acquisitionNum,Spectra-method} +\alias{centroided,Spectra-method} +\alias{centroided<-,Spectra-method} +\alias{collisionEnergy,Spectra-method} +\alias{collisionEnergy<-,Spectra-method} +\alias{coreSpectraVariables} +\alias{dataOrigin,Spectra-method} +\alias{dataOrigin<-,Spectra-method} +\alias{dataStorage,Spectra-method} +\alias{intensity,Spectra-method} +\alias{ionCount,Spectra-method} +\alias{isCentroided,Spectra-method} +\alias{isEmpty,Spectra-method} +\alias{isolationWindowLowerMz,Spectra-method} +\alias{isolationWindowLowerMz<-,Spectra-method} +\alias{isolationWindowTargetMz,Spectra-method} +\alias{isolationWindowTargetMz<-,Spectra-method} +\alias{isolationWindowUpperMz,Spectra-method} +\alias{isolationWindowUpperMz<-,Spectra-method} +\alias{length,Spectra-method} +\alias{lengths,Spectra-method} +\alias{msLevel,Spectra-method} +\alias{mz,Spectra-method} +\alias{peaksData,Spectra-method} 
+\alias{peaksVariables,Spectra-method} +\alias{polarity,Spectra-method} +\alias{polarity<-,Spectra-method} +\alias{precScanNum,Spectra-method} +\alias{precursorCharge,Spectra-method} +\alias{precursorIntensity,Spectra-method} +\alias{precursorMz,Spectra-method} +\alias{precursorMz<-,Spectra-method} +\alias{rtime,Spectra-method} +\alias{rtime<-,Spectra-method} +\alias{scanIndex,Spectra-method} +\alias{smoothed,Spectra-method} +\alias{smoothed<-,Spectra-method} +\alias{spectraData,Spectra-method} +\alias{spectraData<-,Spectra-method} +\alias{spectraNames,Spectra-method} +\alias{spectraNames<-,Spectra-method} +\alias{spectraVariables,Spectra-method} +\alias{tic,Spectra-method} +\alias{uniqueMsLevels,Spectra-method} +\alias{$,Spectra-method} +\alias{$<-,Spectra-method} +\alias{[[,Spectra-method} +\alias{[[<-,Spectra-method} +\title{Accessing mass spectrometry data} +\usage{ +asDataFrame( + object, + i = seq_along(object), + spectraVars = spectraVariables(object) +) + +\S4method{acquisitionNum}{Spectra}(object) + +\S4method{centroided}{Spectra}(object) + +\S4method{centroided}{Spectra}(object) <- value + +\S4method{collisionEnergy}{Spectra}(object) + +\S4method{collisionEnergy}{Spectra}(object) <- value + +coreSpectraVariables() + +\S4method{dataOrigin}{Spectra}(object) + +\S4method{dataOrigin}{Spectra}(object) <- value + +\S4method{dataStorage}{Spectra}(object) + +\S4method{intensity}{Spectra}(object, f = processingChunkFactor(object), ...) + +\S4method{ionCount}{Spectra}(object) + +\S4method{isCentroided}{Spectra}(object, ...) 
+ +\S4method{isEmpty}{Spectra}(x) + +\S4method{isolationWindowLowerMz}{Spectra}(object) + +\S4method{isolationWindowLowerMz}{Spectra}(object) <- value + +\S4method{isolationWindowTargetMz}{Spectra}(object) + +\S4method{isolationWindowTargetMz}{Spectra}(object) <- value + +\S4method{isolationWindowUpperMz}{Spectra}(object) + +\S4method{isolationWindowUpperMz}{Spectra}(object) <- value + +\S4method{length}{Spectra}(x) + +\S4method{lengths}{Spectra}(x, use.names = FALSE) + +\S4method{msLevel}{Spectra}(object) + +\S4method{mz}{Spectra}(object, f = processingChunkFactor(object), ...) + +\S4method{peaksData}{Spectra}( + object, + columns = c("mz", "intensity"), + f = processingChunkFactor(object), + ..., + BPPARAM = bpparam() +) + +\S4method{peaksVariables}{Spectra}(object) + +\S4method{polarity}{Spectra}(object) + +\S4method{polarity}{Spectra}(object) <- value + +\S4method{precScanNum}{Spectra}(object) + +\S4method{precursorCharge}{Spectra}(object) + +\S4method{precursorIntensity}{Spectra}(object) + +\S4method{precursorMz}{Spectra}(object) + +\S4method{precursorMz}{Spectra}(object, ...) <- value + +\S4method{rtime}{Spectra}(object) + +\S4method{rtime}{Spectra}(object) <- value + +\S4method{scanIndex}{Spectra}(object) + +\S4method{smoothed}{Spectra}(object) + +\S4method{smoothed}{Spectra}(object) <- value + +\S4method{spectraData}{Spectra}(object, columns = spectraVariables(object)) + +\S4method{spectraData}{Spectra}(object) <- value + +\S4method{spectraNames}{Spectra}(object) + +\S4method{spectraNames}{Spectra}(object) <- value + +\S4method{spectraVariables}{Spectra}(object) + +\S4method{tic}{Spectra}(object, initial = TRUE) + +\S4method{uniqueMsLevels}{Spectra}(object, ...) + +\S4method{$}{Spectra}(x, name) + +\S4method{$}{Spectra}(x, name) <- value + +\S4method{[[}{Spectra}(x, i, j, ...) + +\S4method{[[}{Spectra}(x, i, j, ...) 
<- value
+}
+\arguments{
+\item{object}{A \code{Spectra} object.}
+
+\item{i}{For \code{asDataFrame()}: A \code{numeric} indicating which scans to coerce
+to a \code{DataFrame} (default is \code{seq_along(object)}).}
+
+\item{spectraVars}{\code{character()} indicating what spectra variables to add to
+the \code{DataFrame}. Default is \code{spectraVariables(object)}, i.e. all
+available variables.}
+
+\item{value}{A vector with values to replace the respective spectra
+variable. Needs to be of the correct data type for the spectra variable.}
+
+\item{f}{For \code{intensity()}, \code{mz()} and \code{peaksData()}: factor defining how
+data should be chunk-wise loaded and processed. Defaults to
+\code{\link[=processingChunkFactor]{processingChunkFactor()}}.}
+
+\item{...}{Additional arguments.}
+
+\item{x}{A \code{Spectra} object.}
+
+\item{use.names}{For \code{lengths()}: ignored.}
+
+\item{columns}{For \code{spectraData()} accessor: optional \code{character} with
+column names (spectra variables) that should be included in the
+returned \code{DataFrame}. By default, all columns are returned.
+For \code{peaksData()} accessor: optional \code{character} with requested columns
+in the individual \code{matrix} of the returned \code{list}. Defaults to
+\code{c("mz", "intensity")} but any values returned by \code{peaksVariables(object)}
+with \code{object} being the \code{Spectra} object are supported.}
+
+\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more
+information. 
See also \code{\link[=processingChunkSize]{processingChunkSize()}} for more information +on parallel processing.} + +\item{initial}{For \code{tic()}: \code{logical(1)} whether the initially +reported total ion current should be reported, or whether the +total ion current should be (re)calculated on the actual data +(\code{initial = FALSE}, same as \code{ionCount()}).} + +\item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return +or set.} + +\item{j}{For \code{[}: not supported.} +} +\description{ +As detailed in the documentation of the \link{Spectra} class, a \code{Spectra} object +is a container for mass spectrometry (MS) data that includes both the mass +peaks data (or \emph{peaks data}, generally \emph{m/z} and intensity values) as well +as spectra metadata (so called \emph{spectra variables}). Spectra variables +generally define one value per spectrum, while for peaks variables one value +per mass peak is defined and hence multiple values per spectrum (depending +on the number of mass peaks of a spectrum). + +Data can be extracted from a \code{Spectra} object using dedicated accessor +functions or also using the \code{$} operator. Depending on the backend class +used by the \code{Spectra} to represent the data, data can also be added or +replaced (again, using dedicated functions or using \verb{$<-}). +} +\section{Spectra variables}{ + + +A common set of \emph{core spectra variables} are defined for \code{Spectra}. These +have a pre-defined data type and each \code{Spectra} will return a value for +these if requested. If no value for a spectra variable is defined, a missing +value (of the correct data type) is returned. The list of core spectra +variables and their respective data type is: +\itemize{ +\item \emph{acquisitionNum} \code{integer(1)}: the index of acquisition of a spectrum +during an MS run. +\item \emph{centroided} \code{logical(1)}: whether the spectrum is in profile or centroid +mode. 
+\item \emph{collisionEnergy} \code{numeric(1)}: collision energy used to create an MSn +spectrum. +\item \emph{dataOrigin} \code{character(1)}: the \emph{origin} of the spectrum's data, e.g. the +mzML file from which it was read. +\item \emph{dataStorage} \code{character(1)}: the (current) storage location of the +spectrum data. This value depends on the backend used to handle and +provide the data. For an \emph{in-memory} backend like the \code{MsBackendDataFrame} +this will be \code{""}, for an on-disk backend such as the +\code{MsBackendHdf5Peaks} it will be the name of the HDF5 file where the +spectrum's peak data is stored. +\item \emph{isolationWindowLowerMz} \code{numeric(1)}: lower m/z for the isolation +window in which the (MSn) spectrum was measured. +\item \emph{isolationWindowTargetMz} \code{numeric(1)}: the target m/z for the isolation +window in which the (MSn) spectrum was measured. +\item \emph{isolationWindowUpperMz} \code{numeric(1)}: upper m/z for the isolation window +in which the (MSn) spectrum was measured. +\item \emph{msLevel} \code{integer(1)}: the MS level of the spectrum. +\item \emph{polarity} \code{integer(1)}: the polarity of the spectrum (\code{0} and \code{1} +representing negative and positive polarity, respectively). +\item \emph{precScanNum} \code{integer(1)}: the scan (acquisition) number of the precursor +for an MSn spectrum. +\item \emph{precursorCharge} \code{integer(1)}: the charge of the precursor of an MSn +spectrum. +\item \emph{precursorIntensity} \code{numeric(1)}: the intensity of the precursor of an +MSn spectrum. +\item \emph{precursorMz} \code{numeric(1)}: the m/z of the precursor of an MSn spectrum. +\item \emph{rtime} \code{numeric(1)}: the retention time of a spectrum. +\item \emph{scanIndex} \code{integer(1)}: the index of a spectrum within a (raw) file. +\item \emph{smoothed} \code{logical(1)}: whether the spectrum was smoothed. 
+}
+
+For each of these spectra variables a dedicated accessor function is defined
+(such as \code{msLevel()} or \code{rtime()}) that allows to extract the values of
+that spectra variable for all spectra in a \code{Spectra} object. Also,
+replacement functions are defined, but not all backends might support
+replacing values for spectra variables. As described above, additional
+spectra variables can be defined or added. The \code{spectraVariables()} function
+can be used to list all available spectra variables.
+
+Values for multiple spectra variables, or all spectra variables, can be
+extracted with the \code{spectraData()} function.
+}
+
+\section{Peaks variables}{
+
+
+\code{Spectra} also provide mass peak data with the \emph{m/z} and intensity values
+being the \emph{core} peaks variables:
+\itemize{
+\item \emph{intensity} \code{numeric}: intensity values for the spectrum's peaks.
+\item \emph{mz} \code{numeric}: the m/z values for the spectrum's peaks.
+}
+
+Values for these can be extracted with the \code{mz()} and \code{intensity()}
+functions, or the \code{peaksData()} function. The former functions return a
+\code{NumericList} with the respective values, while the latter returns a \code{List}
+with \code{numeric} two-column matrices. The list of peaks matrices can also
+be extracted using \code{as(x, "list")} or \code{as(x, "SimpleList")} with \code{x} being
+a \code{Spectra} object.
+
+Some \code{Spectra}/backends provide also values for additional peaks variables.
+The set of available peaks variables can be extracted with the
+\code{peaksVariables()} function.
+}
+
+\section{Functions to access MS data}{
+
+
+The set of available functions to extract data from, or set data in, a
+\code{Spectra} object are (in alphabetical order) listed below. Note that there
+are also other functions to extract information from a \code{Spectra} object
+documented in \code{\link[=addProcessing]{addProcessing()}}. 
+\itemize{
+\item \code{$}, \verb{$<-}: gets (or sets) a spectra variable for all spectra in \code{object}.
+See examples for details. Note that replacing values of a peaks variable
+is not supported with a non-empty processing queue, i.e. if any filtering
+or data manipulations on the peaks data was performed. In these cases
+\code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all cached data
+operations.
+\item \code{[[}, \verb{[[<-}: access or set/add a single spectrum variable (column) in the
+backend.
+\item \code{acquisitionNum()}: returns the acquisition number of each
+spectrum. Returns an \code{integer} of length equal to the number of
+spectra (with \code{NA_integer_} if not available).
+\item \code{asDataFrame()}: converts the \code{Spectra} to a \code{DataFrame} (in long format)
+containing all data. Returns a \code{DataFrame}.
+\item \code{centroided()}, \verb{centroided<-}: gets or sets the centroiding
+information of the spectra. \code{centroided()} returns a \code{logical}
+vector of length equal to the number of spectra with \code{TRUE} if a
+spectrum is centroided, \code{FALSE} if it is in profile mode and \code{NA}
+if it is undefined. See also \code{isCentroided()} for estimating from
+the spectrum data whether the spectrum is centroided. \code{value}
+for \verb{centroided<-} is either a single \code{logical} or a \code{logical} of
+length equal to the number of spectra in \code{object}.
+\item \code{collisionEnergy()}, \verb{collisionEnergy<-}: gets or sets the
+collision energy for all spectra in \code{object}. \code{collisionEnergy()}
+returns a \code{numeric} with length equal to the number of spectra
+(\code{NA_real_} if not present/defined), \verb{collisionEnergy<-} takes a
+\code{numeric} of length equal to the number of spectra in \code{object}.
+\item \code{coreSpectraVariables()}: returns the \emph{core} spectra variables along with
+their expected data type. 
+\item \code{dataOrigin()}, \verb{dataOrigin<-}: gets or sets the \emph{data origin} for each +spectrum. \code{dataOrigin()} returns a \code{character} vector (same length than +\code{object}) with the origin of the spectra. \verb{dataOrigin<-} expects a +\code{character} vector (same length than \code{object}) with the replacement +values for the data origin of each spectrum. +\item \code{dataStorage()}: returns a \code{character} vector (same length than \code{object}) +with the data storage location of each spectrum. +\item \code{intensity()}: gets the intensity values from the spectra. Returns +a \code{\link[=NumericList]{NumericList()}} of \code{numeric} vectors (intensity values for each +spectrum). The length of the list is equal to the number of +\code{spectra} in \code{object}. +\item \code{ionCount()}: returns a \code{numeric} with the sum of intensities for +each spectrum. If the spectrum is empty (see \code{isEmpty()}), +\code{NA_real_} is returned. +\item \code{isCentroided()}: a heuristic approach assessing if the spectra in +\code{object} are in profile or centroided mode. The function takes +the \code{qtl}th quantile top peaks, then calculates the difference +between adjacent m/z value and returns \code{TRUE} if the first +quartile is greater than \code{k}. (See \code{Spectra:::.isCentroided()} for +the code.) +\item \code{isEmpty()}: checks whether a spectrum in \code{object} is empty +(i.e. does not contain any peaks). Returns a \code{logical} vector of +length equal number of spectra. +\item \code{isolationWindowLowerMz()}, \verb{isolationWindowLowerMz<-}: gets or sets the +lower m/z boundary of the isolation window. +\item \code{isolationWindowTargetMz()}, \verb{isolationWindowTargetMz<-}: gets or sets the +target m/z of the isolation window. +\item \code{isolationWindowUpperMz()}, \verb{isolationWindowUpperMz<-}: gets or sets the +upper m/z boundary of the isolation window. +\item \code{length()}: gets the number of spectra in the object. 
+\item \code{lengths()}: gets the number of peaks (m/z-intensity values) per
+spectrum. Returns an \code{integer} vector (length equal to the
+number of spectra). For empty spectra, \code{0} is returned.
+\item \code{msLevel()}: gets the spectra's MS level. Returns an integer vector (names
+being spectrum names, length equal to the number of spectra) with the MS
+level for each spectrum.
+\item \code{mz()}: gets the mass-to-charge ratios (m/z) from the
+spectra. Returns a \code{\link[=NumericList]{NumericList()}} of length equal to the number of
+spectra, each element a \code{numeric} vector with the m/z values of
+one spectrum.
+\item \code{peaksData()}: gets the \emph{peaks} data for all spectra in \code{object}. Peaks
+data consist of the m/z and intensity values as well as possible additional
+annotations (variables) of all peaks of each spectrum. The function
+returns a \code{\link[=SimpleList]{SimpleList()}} of two dimensional arrays (either \code{matrix} or
+\code{data.frame}), with each array providing the values for the requested
+\emph{peak variables} (by default \code{"mz"} and \code{"intensity"}). Optional parameter
+\code{columns} is passed to the backend's \code{peaksData()} function to allow
+the selection of specific (or additional) peaks variables (columns) that
+should be extracted (if available). Importantly,
+it is \strong{not} guaranteed that each backend supports this parameter (while
+each backend must support extraction of \code{"mz"} and \code{"intensity"} columns).
+Parameter \code{columns} defaults to \code{c("mz", "intensity")} but any value
+returned by \code{peaksVariables(object)} is supported.
+Note also that it is possible to extract the peak data with
+\code{as(x, "list")} and \code{as(x, "SimpleList")} as a \code{list} and \code{SimpleList},
+respectively. Note however that, in contrast to \code{peaksData()}, \code{as()}
+does not support the parameter \code{columns}. 
+\item \code{peaksVariables()}: lists the available variables for mass peaks provided
+by the backend. Default peak variables are \code{"mz"} and \code{"intensity"} (which
+all backends need to support and provide), but some backends might provide
+additional variables.
+These variables correspond to the column names of the peak data array
+returned by \code{peaksData()}.
+\item \code{polarity()}, \verb{polarity<-}: gets or sets the polarity for each
+spectrum. \code{polarity()} returns an \code{integer} vector (length equal
+to the number of spectra), with \code{0} and \code{1} representing negative
+and positive polarities, respectively. \verb{polarity<-} expects an
+\code{integer} vector of length 1 or equal to the number of spectra.
+\item \code{precursorCharge()}, \code{precursorIntensity()}, \code{precursorMz()},
+\code{precScanNum()}, \code{precAcquisitionNum()}: gets the charge (\code{integer}),
+intensity (\code{numeric}), m/z (\code{numeric}), scan index (\code{integer})
+and acquisition number (\code{integer}) of the precursor for MS level >=
+2 spectra from the object. Returns a vector of length equal to
+the number of spectra in \code{object}. \code{NA} are reported for MS1
+spectra or if no precursor information is available.
+\item \code{rtime()}, \verb{rtime<-}: gets or sets the retention times (in seconds)
+for each spectrum. \code{rtime()} returns a \code{numeric} vector (length
+equal to the number of spectra) with the retention time for each
+spectrum. \verb{rtime<-} expects a numeric vector with length equal
+to the number of spectra.
+\item \code{scanIndex()}: returns an \code{integer} vector with the \emph{scan index}
+for each spectrum. This represents the relative index of the
+spectrum within each file. Note that this can be different to the
+\code{acquisitionNum} of the spectrum which represents the index of the
+spectrum during acquisition/measurement (as reported in the mzML file). 
+\item \code{smoothed()}, \verb{smoothed<-}: gets or sets whether a spectrum is
+\emph{smoothed}. \code{smoothed()} returns a \code{logical} vector of length equal
+to the number of spectra. \verb{smoothed<-} takes a \code{logical} vector
+of length 1 or equal to the number of spectra in \code{object}.
+\item \code{spectraData()}: gets general spectrum metadata (annotation, also called
+header). \code{spectraData()} returns a \code{DataFrame}. Note that this
+method does by default \strong{not} return m/z or intensity values.
+\item \verb{spectraData<-}: \strong{replaces} the full spectra data of the \code{Spectra}
+object with the one provided with \code{value}. The \verb{spectraData<-} function
+expects a \code{DataFrame} to be passed as value with the same number of rows
+as there are spectra in \code{object}. Note that replacing values of
+peaks variables is not supported with a non-empty processing queue, i.e.
+if any filtering or data manipulations on the peaks data was performed.
+In these cases \code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all
+cached data operations and empty the processing queue.
+\item \code{spectraNames()}, \verb{spectraNames<-}: gets or sets the spectra names.
+\item \code{spectraVariables()}: returns a \code{character} vector with the
+available spectra variables (columns, fields or attributes of each
+spectrum) available in \code{object}. Note that \code{spectraVariables()} does not
+list the \emph{peak variables} (\code{"mz"}, \code{"intensity"} and eventual additional
+annotations for each MS peak). Peak variables are returned by
+\code{peaksVariables()}.
+\item \code{tic()}: gets the total ion current/count (sum of signal of a
+spectrum) for all spectra in \code{object}. By default, the value
+reported in the original raw data file is returned. For an empty
+spectrum, \code{0} is returned.
+\item \code{uniqueMsLevels()}: get the unique MS levels available in \code{object}. 
This +function is supposed to be more efficient than \code{unique(msLevel(object))}. +} +} + +\examples{ + +## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +## backend. +sciex_file <- dir(system.file("sciex", package = "msdata"), + full.names = TRUE) +sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +sciex + +## Get the number of spectra in the data set +length(sciex) + +## Get the number of mass peaks per spectrum - limit to the first 6 +lengths(sciex) |> head() + +## Get the MS level for each spectrum - limit to the first 6 spectra +msLevel(sciex) |> head() + +## Alternatively, we could also use $ to access a specific spectra variable. +## This could also be used to add additional spectra variables to the +## object (see further below). +sciex$msLevel |> head() + +## Get the intensity and m/z values. +intensity(sciex) +mz(sciex) + +## Convert a subset of the Spectra object to a long DataFrame. +asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) + +## Create a Spectra providing a `DataFrame` containing the spectrum data. + +spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) + +s <- Spectra(spd) +s + +## List all available spectra variables (i.e. spectrum data and metadata). +spectraVariables(s) + +## For all *core* spectrum variables accessor functions are available. These +## return NA if the variable was not set. +centroided(s) +dataStorage(s) +rtime(s) +precursorMz(s) + +## The core spectra variables are: +coreSpectraVariables() + +## Add an additional metadata column. +s$spectrum_id <- c("sp_1", "sp_2") + +## List spectra variables, "spectrum_id" is now also listed +spectraVariables(s) + +## Get the values for the new spectra variable +s$spectrum_id + +## Extract specific spectra variables. 
+spectraData(s, columns = c("spectrum_id", "msLevel"))
+
+
+## -------- PEAKS VARIABLES AND DATA --------
+
+## Get the peak data (m/z and intensity values).
+pks <- peaksData(s)
+pks
+pks[[1]]
+pks[[2]]
+
+## Note that we could get the same result by coercing the `Spectra` to
+## a `list` or `SimpleList`:
+as(s, "list")
+as(s, "SimpleList")
+
+## Or use `mz()` and `intensity()` to extract the m/z and intensity values
+## separately
+mz(s)
+intensity(s)
+
+## Some `MsBackend` classes provide support for arbitrary peaks variables
+## (in addition to the mandatory `"mz"` and `"intensity"` values). Below
+## we create a simple data frame with an additional peak variable `"pk_ann"`
+## and create a `Spectra` with a `MsBackendMemory` for that data.
+## Importantly the number of values (per spectrum) need to be the same
+## for all peak variables.
+
+tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5))
+tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1))
+tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45))
+tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P"))
+
+## Create the Spectra. With parameter `peaksVariables` we can define
+## the columns in `tmp` that contain peaks variables.
+sps <- Spectra(tmp, source = MsBackendMemory(),
+ peaksVariables = c("mz", "intensity", "pk_ann"))
+peaksVariables(sps)
+
+## Extract just the m/z and intensity values
+peaksData(sps)[[1L]]
+
+## Extract the full peaks data
+peaksData(sps, columns = peaksVariables(sps))[[1L]]
+
+## Access just the pk_ann variable
+sps$pk_ann
+
+
+}
+\seealso{
+\itemize{
+\item \code{\link[=addProcessing]{addProcessing()}} for functions to analyze \code{Spectra}.
+\item \link{Spectra} for a general description of the \code{Spectra} object. 
+} +} +\author{ +Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail +} diff --git a/tests/testthat/_snaps/plotMzDelta/plotmzdelta-1000.svg b/tests/testthat/_snaps/plotMzDelta/plotmzdelta-1000.svg index e16506da..e041fc61 100644 --- a/tests/testthat/_snaps/plotMzDelta/plotmzdelta-1000.svg +++ b/tests/testthat/_snaps/plotMzDelta/plotmzdelta-1000.svg @@ -1,579 +1,254 @@ - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + +Histogram of Mass Delta Distributions +M/Z delta +Frequency + + + + + +50 +100 +150 +200 + + + + + + + +0 +500 +1000 +1500 +2000 +2500 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +peg +A +R +N +D +C +E +Q/K +G +H +I/L +M +F +P +S +T +W +Y +V diff --git a/tests/testthat/test_MsBackend.R b/tests/testthat/test_MsBackend.R index 3d3f7e28..5e91a0fe 100644 --- a/tests/testthat/test_MsBackend.R +++ b/tests/testthat/test_MsBackend.R @@ -56,6 +56,33 @@ test_that("MsBackend methods throw errors", { expect_error(dm[1], "implemented for") expect_error(dm$a, "implemented for") expect_error(dm$a <- "a", "implemented for") + expect_error(extractByIndex(dm, 1), "implemented for") + expect_equal(backendRequiredSpectraVariables(dm), character()) + expect_error(precursorMz(dm) <- 12.3, "implemented for") +}) + +test_that("extractByIndex not implemented fallback", { + ## Backends that don't implement a dedicated `extractByIndex` method should + ## fall back to the [ method. + setClass("DummyBackend", + contains = "MsBackend", + slots = c(d = "integer")) + dm <- new("DummyBackend") + expect_error(extractByIndex(dm, 1L), "'extractByIndex' not implemented") + + dm@d <- 1:4 + + ## Have an implementation for [ but not extractByIndex: + setMethod("[", "DummyBackend", function(x, i, j, ..., drop = FALSE) { + x@d <- x@d[i] + x + }) + + res <- dm[c(3, 1)] + expect_equal(res@d, c(3L, 1L)) + + res <- extractByIndex(dm, c(3, 1)) + expect_equal(res@d, c(3L, 1L)) }) test_that("reset,MsBackend works", { @@ -75,3 +102,9 @@ test_that("backendBpparam,MsBackend works", { test_that("backendParallelFactor,MsBackend works", { expect_equal(backendParallelFactor(MsBackendMemory()), factor()) }) + +test_that("dataStorageBasePath,MsExperiment works", { + expect_identical(dataStorageBasePath(MsBackendMemory()), NA_character_) + tmp <- MsBackendMemory() + expect_warning(dataStorageBasePath(tmp) <- "/", "not support") +}) diff --git a/tests/testthat/test_MsBackendCached.R b/tests/testthat/test_MsBackendCached.R index 6ff1b7ee..e547b190 100644 --- a/tests/testthat/test_MsBackendCached.R +++ 
b/tests/testthat/test_MsBackendCached.R @@ -87,12 +87,24 @@ test_that("[,MsBackendCached works", { res <- be[c(1, 4, 3), ] expect_true(length(res) == 3) expect_true(nrow(res@localData) == 3) + res_2 <- extractByIndex(be, c(1, 4, 3)) + expect_equal(res, res_2) df <- data.frame(msLevel = 1L, b = 1:6) be <- backendInitialize(be, data = df) res <- be[c(6, 1, 3)] expect_true(length(res) == 3) expect_equal(res@localData$b, c(6, 1, 3)) + res_2 <- extractByIndex(be, c(6, 1, 3)) + expect_equal(res, res_2) + + res <- be[c(6, 1, 3, 1)] + expect_true(length(res) == 4) + expect_equal(res@localData$b, c(6, 1, 3, 1)) + res_2 <- extractByIndex(be, c(6, 1, 3, 1)) + expect_equal(res, res_2) + + expect_equal(extractByIndex(be), be) }) test_that("$,MsBackendCached works", { @@ -290,3 +302,10 @@ test_that("lengths,MsBackendCached works", { res <- lengths(be) expect_true(all(res == 0)) }) + +test_that("precursorMz<-,MsBackendCached works", { + be <- backendInitialize(MsBackendCached(), nspectra = 4) + expect_true(all(is.na(precursorMz(be)))) + precursorMz(be) <- c(1.1, 1.2, 1.3, 1.34) + expect_equal(precursorMz(be), c(1.1, 1.2, 1.3, 1.34)) +}) diff --git a/tests/testthat/test_MsBackendDataFrame.R b/tests/testthat/test_MsBackendDataFrame.R index ec3aeec6..2cc04795 100644 --- a/tests/testthat/test_MsBackendDataFrame.R +++ b/tests/testthat/test_MsBackendDataFrame.R @@ -576,24 +576,42 @@ test_that("show,MsBackendDataFrame works", { test_that("[,MsBackendDataFrame works", { be <- MsBackendDataFrame() expect_error(be[1]) + + expect_equal(extractByIndex(be), be) + df <- DataFrame(scanIndex = 1:2, a = "a", b = "b") be <- backendInitialize(be, df) res <- be[1] expect_true(validObject(res)) expect_equal(be@spectraData[1, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 1) + expect_equal(res, res_2) res <- be[2] expect_true(validObject(res)) expect_equal(be@spectraData[2, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 2) + expect_equal(res, res_2) res <- be[2:1] 
expect_true(validObject(res)) expect_equal(be@spectraData[2:1, ], res@spectraData) + res_2 <- extractByIndex(be, 2:1) + expect_equal(res, res_2) + + res <- be[c(2, 1, 2)] + expect_equal(res$scanIndex, c(2, 1, 2)) + res_2 <- extractByIndex(be, c(2, 1, 2)) + expect_equal(res, res_2) res <- be[c(FALSE, FALSE)] expect_true(validObject(res)) expect_true(length(res) == 0) + res_2 <- extractByIndex(be, integer()) + expect_equal(res, res_2) res <- be[c(FALSE, TRUE)] expect_true(validObject(res)) expect_equal(be@spectraData[2, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 2) + expect_equal(res, res_2) expect_error(be[TRUE], "match the length of") expect_error(be["a"], "does not have names") @@ -606,11 +624,15 @@ test_that("[,MsBackendDataFrame works", { expect_true(validObject(res)) expect_equal(dataStorage(res), "2") expect_equal(res@spectraData$file, "b") + res_2 <- extractByIndex(be, 3) + expect_equal(res, res_2) res <- be[c(3, 1)] expect_true(validObject(res)) expect_equal(dataStorage(res), c("2", "1")) expect_equal(res@spectraData$file, c("b", "a")) + res_2 <- extractByIndex(be, c(3, 1)) + expect_equal(res, res_2) }) test_that("cbind2, MsBackendDataFrame works", { @@ -638,15 +660,32 @@ test_that("selectSpectraVariables,MsBackendDataFrame works", { be <- backendInitialize(MsBackendDataFrame(), df) res <- selectSpectraVariables(be, c("dataStorage", "other_col")) + + expect_equal(res@peaksVariables, be@peaksVariables) expect_equal(colnames(res@spectraData), c("dataStorage", "other_col")) expect_equal(msLevel(res), c(NA_integer_, NA_integer_)) res <- selectSpectraVariables(be, c("dataStorage", "rtime")) expect_equal(colnames(res@spectraData), c("dataStorage", "rtime")) + expect_equal(res@peaksVariables, be@peaksVariables) - expect_error(selectSpectraVariables(be, "rtime"), "dataStorage is/are missing") + expect_error(selectSpectraVariables(be, "rtime"), "are required") expect_error(selectSpectraVariables(be, "something"), "something not available") + + df$mz <- 
list(c(1.2, 1.4), c(5.3, 34.5, 52.1)) + df$intensity <- list(c(123, 121.1), c(1231.1, 343.1, 21.1)) + be <- backendInitialize(MsBackendDataFrame(), df) + res <- selectSpectraVariables(be, c("dataStorage", "other_col")) + expect_equal(colnames(res@spectraData), c("dataStorage", "other_col")) + expect_equal(msLevel(res), c(NA_integer_, NA_integer_)) + expect_equal(res@peaksVariables, character()) + + be <- backendInitialize(MsBackendDataFrame(), df) + res <- selectSpectraVariables(be, c("dataStorage", "mz", "intensity")) + expect_equal(colnames(res@spectraData), c("dataStorage", "mz", "intensity")) + expect_equal(msLevel(res), c(NA_integer_, NA_integer_)) + expect_equal(res@peaksVariables, c("mz", "intensity")) }) test_that("$,$<-,MsBackendDataFrame works", { @@ -1001,3 +1040,8 @@ test_that("[[,[[<-,MsBackendDataFrame works", { test_that("supportsSetBackend,MsBackendDataFrame", { expect_true(supportsSetBackend(MsBackendDataFrame())) }) + +test_that("backendRequiredSpectraVariables,MsBackendDataFrame works", { + expect_equal(backendRequiredSpectraVariables(MsBackendDataFrame()), + "dataStorage") +}) diff --git a/tests/testthat/test_MsBackendHdf5Peaks.R b/tests/testthat/test_MsBackendHdf5Peaks.R index b7afdf37..17495169 100644 --- a/tests/testthat/test_MsBackendHdf5Peaks.R +++ b/tests/testthat/test_MsBackendHdf5Peaks.R @@ -334,12 +334,16 @@ test_that("[,MsBackendHdf5Peaks works", { expect_identical(peaksData(res), sciex_pks[idx]) expect_identical(rtime(res), rtime(sciex_mzr)[idx]) expect_identical(msLevel(res), msLevel(sciex_mzr)[idx]) + res_2 <- extractByIndex(be, idx) + expect_equal(res, res_2) idx <- dataStorage(be) == fls[2] res <- be[idx, ] expect_true(validObject(res)) expect_true(all(dataStorage(res) == fls[2])) expect_identical(peaksData(res), sciex_pks[idx]) + res_2 <- extractByIndex(be, idx) + expect_equal(res, res_2) }) test_that("backendMerge,MsBackendHdf5Peaks works", { @@ -409,3 +413,8 @@ test_that("backendParallelFactor,MsBackendHdf5Peaks", { 
factor(dataStorage(sciex_hd5), levels = unique(dataStorage(sciex_hd5)))) }) + +test_that("backendRequiredSpectraVariables,MsBackendHdf5Peaks works", { + expect_equal(backendRequiredSpectraVariables(MsBackendHdf5Peaks()), + c("dataStorage", "scanIndex")) +}) diff --git a/tests/testthat/test_MsBackendMemory.R b/tests/testthat/test_MsBackendMemory.R index 2cdacc37..c4df695f 100644 --- a/tests/testthat/test_MsBackendMemory.R +++ b/tests/testthat/test_MsBackendMemory.R @@ -501,41 +501,67 @@ test_that("$<-,MsBackendMemory works", { test_that("[,MsBackendMemory works", { be <- new("MsBackendMemory") + res <- extractByIndex(be) + expect_equal(res, be) + df <- data.frame(scanIndex = 1:2, a = "a", b = "b") be <- backendInitialize(be, df) res <- be[1] expect_true(validObject(res)) expect_equal(be@spectraData[1, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 1) + expect_equal(res, res_2) + res <- be[2] expect_true(validObject(res)) expect_equal(be@spectraData[2, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 2) + expect_equal(res, res_2) + res <- be[2:1] expect_true(validObject(res)) expect_equal(be@spectraData[2:1, ], res@spectraData) + res_2 <- extractByIndex(be, 2:1) + expect_equal(res, res_2) res <- be[c(FALSE, FALSE)] expect_true(validObject(res)) expect_true(length(res) == 0) + res_2 <- extractByIndex(be, integer()) + expect_equal(res, res_2) + res <- be[c(FALSE, TRUE)] expect_true(validObject(res)) expect_equal(be@spectraData[2, ], res@spectraData[1, ]) + res_2 <- extractByIndex(be, 2) + expect_equal(res, res_2) expect_error(be[TRUE], "match the length of") expect_error(be["a"], "names") df <- data.frame(scanIndex = c(1L, 2L, 1L, 2L), - file = c("a", "a", "b", "b")) + file = c("a", "a", "b", "b"), + idx = 1:4) be <- backendInitialize(be, df) dataStorage(be) <- c("1", "1", "2", "2") res <- be[3] expect_true(validObject(res)) expect_equal(dataStorage(res), "2") expect_equal(res@spectraData$file, "b") + res_2 <- extractByIndex(be, 3) + expect_equal(res, 
res_2) res <- be[c(3, 1)] expect_true(validObject(res)) expect_equal(dataStorage(res), c("2", "1")) expect_equal(res@spectraData$file, c("b", "a")) + res_2 <- extractByIndex(be, c(3, 1)) + expect_equal(res, res_2) + + res <- be[c(3, 1, 3)] + expect_equal(res$idx, c(3, 1, 3)) + res_2 <- extractByIndex(be, c(3, 1, 3)) + expect_equal(res, res_2) }) test_that("cbind2, MsBackendMemory works", { @@ -934,3 +960,8 @@ test_that("tic,MsBackendMemory works", { test_that("supportsSetBackend,MsBackendMemory", { expect_true(supportsSetBackend(MsBackendMemory())) }) + +test_that("backendRequiredSpectraVariables,MsBackendMemory works", { + expect_equal(backendRequiredSpectraVariables(MsBackendMemory()), + "dataStorage") +}) diff --git a/tests/testthat/test_MsBackendMzR.R b/tests/testthat/test_MsBackendMzR.R index ff891738..44d38cd2 100644 --- a/tests/testthat/test_MsBackendMzR.R +++ b/tests/testthat/test_MsBackendMzR.R @@ -474,6 +474,8 @@ test_that("[,MsBackendMzR works", { expect_equal(length(tmp), 13) expect_equal(tmp@spectraData$scanIndex, 13:25) expect_true(all(is.na(smoothed(tmp)))) + tmp_2 <- extractByIndex(sciex_mzr, 13:25) + expect_equal(tmp, tmp_2) ints <- intensity(tmp) spd <- spectraData(tmp) @@ -493,8 +495,16 @@ test_that("selectSpectraVariables,MsBackendMzR works", { "scanIndex")) expect_equal(colnames(res@spectraData), c("dataStorage", "msLevel", "rtime", "scanIndex")) + expect_equal(res@peaksVariables, character()) + + res <- selectSpectraVariables(be, c("dataStorage", "msLevel", "rtime", + "scanIndex", "mz", "intensity")) + expect_equal(colnames(res@spectraData), c("dataStorage", "msLevel", "rtime", + "scanIndex")) + expect_equal(res@peaksVariables, c("mz", "intensity")) + expect_error(selectSpectraVariables(be, c("dataStorage", "msLevel")), - "scanIndex is/are missing") + "required") }) test_that("$,$<-,MsBackendMzR works", { @@ -559,6 +569,7 @@ test_that("dropNaSpectraVariables works with MsBackendMzR", { expect_equal(mz(res[1]), mz(sciex_mzr[1])) 
expect_true(length(spectraVariables(res)) < length(spectraVariables(sciex_mzr))) + expect_equal(res@peaksVariables, sciex_mzr@peaksVariables) }) test_that("supportsSetBackend,MsBackendMzR", { @@ -570,3 +581,31 @@ test_that("backendParallelFactor,MsBackendMzR", { factor(dataStorage(sciex_mzr), levels = unique(dataStorage(sciex_mzr)))) }) + +test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { + tmpd <- normalizePath(tempdir()) + file.copy(sciex_file, tmpd) + + expect_equal(dataStorageBasePath(sciex_mzr), + MsCoreUtils::common_path(sciex_file)) + tmp <- sciex_mzr + dataStorageBasePath(tmp) <- tmpd + expect_true(validObject(tmp)) + bp <- normalizePath(dataStorageBasePath(tmp)) + expect_equal(bp, tmpd) + + #' errors + expect_error(dataStorageBasePath(tmp) <- "some path", "Provided path") +}) + +test_that("backendRequiredSpectraVariables,MsBackendMzR works", { + tmp <- MsBackendMzR() + expect_equal(backendRequiredSpectraVariables(tmp), + c("dataStorage", "scanIndex")) +}) + +test_that("precursorMz<-,MsbackendMzR works", { + a <- sciex_mzr[1:3] + precursorMz(a) <- c(12.2, 1.2, 1.4) + expect_equal(precursorMz(a), c(12.2, 1.2, 1.4)) +}) diff --git a/tests/testthat/test_Spectra-functions.R b/tests/testthat/test_Spectra-functions.R index ec73a72f..2dbcf372 100644 --- a/tests/testthat/test_Spectra-functions.R +++ b/tests/testthat/test_Spectra-functions.R @@ -352,32 +352,6 @@ test_that("dropNaSpectraVariables works", { function(z) !any(is.na(z))))) }) -test_that(".has_mz works", { - sps <- Spectra(sciex_mzr)[1:10] - sps <- setBackend(sps, MsBackendDataFrame()) - mzs <- mz(sps) - x <- c(mzs[[2]][5], mzs[[3]][8]) - - res <- .has_mz(sps, mz = x, ppm = 0) - expect_true(length(res) == length(sps)) - expect_true(is.logical(res)) - - spd <- DataFrame(msLevel = c(2L, 2L, 2L), rtime = c(1, 2, 3)) - spd$mz <- list(c(12, 14, 45, 56), c(14.1, 34, 56.1), c(12.1, 14.15, 34.1)) - spd$intensity <- list(c(10, 20, 30, 40), c(11, 21, 31), c(12, 22, 32)) - sps <- 
Spectra(spd) - - res <- .has_mz(sps, mz = c(14, 34)) - expect_equal(res, c(TRUE, TRUE, FALSE)) - res <- .has_mz(sps, mz = c(14, 34), tolerance = 0.15) - expect_equal(res, c(TRUE, TRUE, TRUE)) - - res <- .has_mz(sps, mz = c(14, 34), condFun = all) - expect_true(all(!res)) - res <- .has_mz(sps, mz = c(14, 34), condFun = all, tolerance = 0.15) - expect_equal(res, c(FALSE, TRUE, TRUE)) -}) - test_that(".has_mz_each works", { spd <- DataFrame(msLevel = c(2L, 2L, 2L), rtime = c(1, 2, 3)) spd$mz <- list(c(12, 14, 45, 56), c(14.1, 34, 56.1), c(12.1, 14.15, 34.1)) @@ -690,16 +664,6 @@ test_that(".estimate_precursor_intensity works", { expect_true(all(is.na(res))) }) -test_that("estimatePrecursorIntensity works", { - fls <- msdata::proteomics(full.names = TRUE)[c(5, 3)] - second <- Spectra(fls[2], backend = MsBackendMzR()) - both <- Spectra(fls, backend = MsBackendMzR()) - - res_second <- estimatePrecursorIntensity(second) - res_both <- estimatePrecursorIntensity(both) - expect_equal(res_second, res_both[510:length(res_both)]) -}) - test_that(".chunk_factor works", { res <- .chunk_factor(10, chunkSize = 3) expect_equal(res, as.factor(c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4))) @@ -861,3 +825,75 @@ test_that("processingChunkFactor works", { expect_error(processingChunkFactor("a"), "Spectra") }) + +test_that("filterPeaksRanges,Spectra works", { + df <- data.frame(rtime = 123.3, new_var = 4, msLevel = 2L) + df$mz <- list(c(100.1, 100.2, 100.3, 100.4, 200.1, 200.2, 200.3, + 300.1, 300.3, 300.4, 300.5)) + df$intensity <- list(1:11) + s <- Spectra(df) + ## Check errors + expect_error(filterPeaksRanges(3), "'Spectra' object") + expect_error(filterPeaksRanges(s, rtime = c(1, 2), not_exist = c(1, 2)), + "valid spectra variables") + expect_error(filterPeaksRanges(s, rtime = 2, mz = c(1, 2)), + "'numeric' of length 2") + expect_error(filterPeaksRanges( + s, rtime = rbind(c(1, 2), c(2, 3)), mz = c(1, 2)), + "Number of rows of the range matrices") + + ## Single range per variable + res <- 
filterPeaksRanges(s, rtime = c(100, 200), mz = cbind(200, 300)) + expect_true(inherits(res, "Spectra")) + expect_true(length(res@processingQueue) > 0L) + expect_equal(res@processingQueueVariables, c("rtime", "msLevel")) + expect_equal(length(res@processing), 1L) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], c(5:7)) + res <- filterPeaksRanges(s, rtime = c(100, 200), mz = cbind(200, 300), + keep = FALSE) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], c(1:4, 8:11)) + + ## Multiple ranges per variable + res <- filterPeaksRanges( + s, new_var = rbind(c(1, 8), c(1, 4), c(1, 5)), + rtime = rbind(c(100, 200), c(400, 500), c(100, 200)), + mz = rbind(c(100, 100.3), c(0, 500), c(300.3, 310))) + expect_true(inherits(res, "Spectra")) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], c(1:3, 9:11)) + res <- filterPeaksRanges( + s, new_var = rbind(c(1, 8), c(1, 4), c(1, 5)), + rtime = rbind(c(100, 200), c(400, 500), c(100, 200)), + mz = rbind(c(100, 100.3), c(0, 500), c(300.3, 310)), keep = FALSE) + expect_true(inherits(res, "Spectra")) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], c(4:8)) + + ## Filter also with msLevel; to have the same behaviour as with other + ## filters we would need to add a second filter for e.g. 
MS level 2 + s <- c(s, s) + s$msLevel <- c(1L, 2L) + res <- filterPeaksRanges(s, rtime = c(100, 200), msLevel = c(1, 1), + mz = c(100, 200)) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], 1:4) + a <- peaksData(res)[[2L]] + expect_true(nrow(a) == 0L) + res <- filterPeaksRanges(s, rtime = rbind(c(100, 200), c(100, 200)), + msLevel = rbind(c(1, 1), c(2, 2)), + mz = rbind(c(100, 200), c(0, 400))) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], 1:4) + a <- peaksData(res)[[2L]] + expect_equal(a[, 2L], 1:11) + res <- filterPeaksRanges(s, rtime = rbind(c(100, 200), c(100, 200)), + msLevel = rbind(c(1, 1), c(2, 2)), + mz = rbind(c(100, 200), c(0, 400)), + keep = FALSE) + a <- peaksData(res)[[1L]] + expect_equal(a[, 2L], 5:11) + a <- peaksData(res)[[2L]] + expect_true(nrow(a) == 0) +}) diff --git a/tests/testthat/test_Spectra.R b/tests/testthat/test_Spectra.R index 3f8090fc..4cc721d9 100644 --- a/tests/testthat/test_Spectra.R +++ b/tests/testthat/test_Spectra.R @@ -13,32 +13,48 @@ test_that("Spectra,ANY works", { df$polarity <- "NEG" expect_error(Spectra(df), "wrong data type: polarity") + + res <- Spectra(files = sciex_file, source = MsBackendMzR()) + expect_s4_class(res@backend, "MsBackendMzR") + expect_true(length(res) > 1) }) test_that("Spectra,missing works", { res <- Spectra() expect_true(length(res) == 0) + expect_s4_class(res@backend, "MsBackendMemory") + + res <- Spectra(backend = MsBackendDataFrame()) + expect_true(length(res) == 0) + expect_s4_class(res@backend, "MsBackendDataFrame") + + res <- Spectra(source = MsBackendDataFrame()) + expect_true(length(res) == 0) + expect_s4_class(res@backend, "MsBackendDataFrame") be <- backendInitialize(MsBackendDataFrame(), DataFrame(msLevel = c(1L, 2L), fromFile = 1L)) res <- Spectra(backend = be) + expect_s4_class(res@backend, "MsBackendDataFrame") expect_true(length(res) == 2) expect_identical(msLevel(res), c(1L, 2L)) }) test_that("Spectra,MsBackend works", { - res <- Spectra() - expect_true(length(res) == 0) - - 
be <- backendInitialize(MsBackendDataFrame(), DataFrame(msLevel = c(1L, 2L), - fromFile = 1L)) + be <- backendInitialize(MsBackendDataFrame(), + DataFrame(msLevel = c(1L, 2L), + fromFile = 1L)) res <- Spectra(be) expect_true(length(res) == 2) expect_identical(msLevel(res), c(1L, 2L)) }) test_that("Spectra,character works", { - res <- Spectra(sciex_file, backend = MsBackendMzR()) + res <- Spectra(sciex_file) + expect_true(is(res@backend, "MsBackendMzR")) + expect_true(length(res) > 0) + + res <- Spectra(sciex_file, source = MsBackendMzR()) expect_true(is(res@backend, "MsBackendMzR")) expect_equal(unique(res@backend$dataStorage), sciex_file) expect_identical(rtime(res), rtime(sciex_mzr)) @@ -51,7 +67,7 @@ test_that("Spectra,character works", { show(res) ## Empty character - res <- Spectra(character(), backend = MsBackendMzR()) + res <- Spectra(character()) expect_s4_class(res, "Spectra") expect_s4_class(res@backend, "MsBackendMzR") expect_true(length(res) == 0) @@ -62,6 +78,37 @@ test_that("Spectra,character works", { expect_true(length(res) == 0) }) +test_that(".create_spectra works, ", { + ## missing object + res <- .create_spectra() + expect_true(length(res) == 0) + expect_s4_class(res@backend, "MsBackendMemory") + expect_error(res <- .create_spectra(backend = MsBackendMzR()), "mandatory") + + ## object being a character, backend a MsBackendMemory -> error + res <- expect_error(.create_spectra(sciex_file), "DataFrame") + ## object being a character, backend a MsBackendMzR + res <- .create_spectra(sciex_file, backend = MsBackendMzR()) + expect_s4_class(res@backend, "MsBackendMzR") + dta <- spectraData(res@backend) + + ## object being a DataFrame, backend a MsBackendDataFrame + res <- .create_spectra(dta, backend = MsBackendDataFrame()) + expect_s4_class(res@backend, "MsBackendDataFrame") + expect_equal(res$msLevel, dta$msLevel) + + ## object missing but providing files + res <- .create_spectra(files = sciex_file, backend = MsBackendMzR()) + 
expect_s4_class(res@backend, "MsBackendMzR") + expect_equal(res$msLevel, dta$msLevel) + + ## object missing but providing data + res <- .create_spectra(data = dta, backend = MsBackendMemory()) + expect_s4_class(res@backend, "MsBackendMemory") + expect_equal(res$msLevel, dta$msLevel) + +}) + test_that("setBackend,Spectra works", { df <- DataFrame(rtime = as.numeric(1:9), fact = c(2L, 1L, 2L, 1L, 3L, 2L, 3L, 3L, 1L)) @@ -1891,4 +1938,57 @@ test_that("entropy,Spectra works", { expect_identical(res, vapply(df$intensity, MsCoreUtils::entropy, numeric(1))) }) +test_that("dataStorageBasePath,dataStorageBasePath<-,MsBackendMzR works", { + tmpd <- normalizePath(tempdir()) + file.copy(sciex_file, tmpd) + tmp <- Spectra(sciex_mzr) + expect_equal(dataStorageBasePath(tmp), + MsCoreUtils::common_path(sciex_file)) + tmp <- sciex_mzr + tmp <- Spectra(tmp) + dataStorageBasePath(tmp) <- tmpd + expect_true(validObject(tmp@backend)) + bp <- normalizePath(dataStorageBasePath(tmp)) + expect_equal(bp, tmpd) + + #' errors + expect_error(dataStorageBasePath(tmp) <- "some path", "Provided path") +}) + + +test_that("asDataFrame works", { + sciex_file <- normalizePath( + dir(system.file("sciex", package = "msdata"), full.names = TRUE)) + sp <- Spectra(sciex_file) + ## Full dataframe + df <- asDataFrame(sp) + expect_identical(nrow(df), sum(sapply(peaksData(sp), nrow))) + expect_identical(ncol(df), length(spectraVariables(sp)) + 2L) + expect_identical(names(df), c("mz", "intensity", spectraVariables(sp))) + ## Three first scans and 2 spectra variables + df <- asDataFrame(sp, i = 1:3, spectraVars = c("msLevel", "rtime")) + expect_identical(nrow(df), sum(sapply(peaksData(sp[1:3]), nrow))) + expect_identical(ncol(df), 2L + 2L) + ## Three first scans and no spectra variables + df <- asDataFrame(sp, i = 1:3, spectraVars = NULL) + expect_identical(nrow(df), sum(sapply(peaksData(sp[1:3]), nrow))) + expect_identical(ncol(df), 2L) + expect_identical(names(df), c("mz", "intensity")) +}) + 
+test_that("estimatePrecursorIntensity works", { + fls <- msdata::proteomics(full.names = TRUE)[c(5, 3)] + second <- Spectra(fls[2], backend = MsBackendMzR()) + both <- Spectra(fls, backend = MsBackendMzR()) + + res_second <- estimatePrecursorIntensity(second) + res_both <- estimatePrecursorIntensity(both) + expect_equal(res_second, res_both[510:length(res_both)]) +}) + +test_that("precursorMz<-,Spectra works", { + a <- sps_dda[1:3] + precursorMz(a) <- c(12.3, 1.1, 34.3) + expect_equal(precursorMz(a), c(12.3, 1.1, 34.3)) +}) diff --git a/tests/testthat/test_peaks-functions.R b/tests/testthat/test_peaks-functions.R index b7204b46..ef0978c8 100644 --- a/tests/testthat/test_peaks-functions.R +++ b/tests/testthat/test_peaks-functions.R @@ -490,3 +490,248 @@ test_that(".peaks_filter_precursor_keep_below works", { precursorMz = 14.2, tolerance = 0.1) expect_equal(unname(res[, "intensity"]), 1) }) + +test_that(".peaks_filter_ranges works", { + ## Testing all possible combinations, with/without spectra and/or peaks + ## variables, single/multiple variables, single/multiple rows, NA handling + x <- cbind(mz = c(100.1, 100.2, 100.3, 100.4, + 104.1, 104.2, + 200.3, 200.4, 200.5, + 300.1, 300.2), + intensity = 1:11) + ## res <- .peaks_filter_ranges(x, spectrumMsLevel = 1L, msLevel = 2L) + ## expect_equal(res, x) + + ## Single filters. + ranges <- list(rtime = cbind(1, 2), new_var = cbind(3, 4), + mz = cbind(200, 201), intensity = cbind(8, 9)) + + ## * No peaks variables. 
+ pvars <- character() + svars <- c("rtime", "new_var") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + + ## * No spectra variables. 
+ pvars <- c("mz", "intensity") + svars <- character() + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, "intensity"], 8:9) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, "intensity"], c(1:7, 10:11)) + ranges$mz <- cbind(100, 106) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + ranges$mz <- cbind(200, 201) + + ## * Spectra and peaks variables. + svars <- c("rtime") + pvars <- c("mz") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, "intensity"], 7:9) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + ranges = ranges, spectrumMsLevel = 1L, + keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, "intensity"], c(1:6, 10:11)) + + ## Multiple filters. 
+ ranges <- list(rtime = rbind(c(1, 2), c(0, 4), c(2, 3)), + new_var = rbind(c(3, 4), c(1, 9), c(3, 5)), + mz = rbind(c(200, 201), c(100, 101), c(200, 201)), + intensity = rbind(c(8, 9), c(1, 20), c(3, 12))) + + ## * No peaks variables. + svars <- c("rtime", "new_var") + pvars <- character() + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 1, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 20, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) == 0L) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 20, + new_var = 3, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x) + + ## * No spectra variables. 
+ svars <- character() + pvars <- c("mz", "intensity") + res <- .peaks_filter_ranges(x, pvars = pvars, ranges = ranges, + spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, 2L], sort(c(8, 9, 1, 2, 3, 4, 7))) + res <- .peaks_filter_ranges(x, pvars = pvars, ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_true(nrow(res) < nrow(x)) + expect_equal(res[, 2L], c(5:6, 10:11)) + res <- .peaks_filter_ranges(x, pvars = c("intensity"), ranges = ranges, + spectrumMsLevel = 1L) + expect_equal(res, x) + res <- .peaks_filter_ranges(x, pvars = c("intensity"), ranges = ranges, + spectrumMsLevel = 1L, keep = FALSE) + expect_equal(res, x[logical(), , drop = FALSE]) + + ## * Spectra and peaks variables. + svars <- c("rtime") + pvars <- c("mz") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 2, + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res[, 2L], c(1:4, 7:9)) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 2, + ranges = ranges, spectrumMsLevel = 1L, + keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res[, 2L], c(5:6, 10:11)) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res[, 2L], 1:4) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = 4, + ranges = ranges, spectrumMsLevel = 1L, + keep = FALSE) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res[, 2L], 5:11) + + ## Handling NA + ## * spectra variable value is NA + ranges <- lapply(ranges, function(z) z[1, , drop = FALSE]) + svars <- "rtime" + pvars <- 
c("mz", "intensity") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = NA, + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x[logical(), , drop = FALSE]) + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = NA, + ranges = ranges, spectrumMsLevel = 1L, + keep = FALSE) + expect_equal(res, x) + + svars <- c("rtime", "new_var") + res <- .peaks_filter_ranges(x, svars = svars, pvars = pvars, rtime = NA, + ranges = ranges, spectrumMsLevel = 1L, + new_var = 3) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x[logical(), , drop = FALSE]) + + ## * peaks variable value is NA + x[8, 2L] <- NA_real_ + res <- .peaks_filter_ranges(x, pvars = c("mz", "intensity"), + ranges = ranges, spectrumMsLevel = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(unname(res[, 2L]), 9L) + + ## * range value is NA + ranges$rtime <- cbind(NA, 2) + res <- .peaks_filter_ranges(x, svars = c("rtime", "new_var"), rtime = 2, + new_var = 3, spectrumMsLevel = 1L, + ranges = ranges) + expect_true(is.matrix(res)) + expect_equal(colnames(res), colnames(x)) + expect_equal(res, x[logical(), , drop = FALSE]) + res <- .peaks_filter_ranges(x, svars = c("rtime", "new_var"), rtime = 2, + new_var = 3, spectrumMsLevel = 1L, + ranges = ranges, keep = FALSE) + expect_equal(res, x) +}) + +test_that(".peaks_contain_mz works", { + pks <- cbind(mz = c(1.3, 1.5, 32.1, 45.6), c(1, 2, 3, 4)) + + expect_false(.peaks_contain_mz(pks)) + expect_true(.peaks_contain_mz(pks, 1.5)) + expect_false(.peaks_contain_mz(pks, c(1.5, 32.2), condFun = all)) + expect_true(.peaks_contain_mz(pks, c(1.5, 32.2), condFun = any)) + expect_true(.peaks_contain_mz(pks, c(1.5, 32.2), condFun = any, + tolerance = 0.1)) + expect_true(.peaks_contain_mz(pks, c(1.5, 32.2), condFun = all, + tolerance = 0.1)) +}) diff --git a/vignettes/MsBackend.Rmd 
b/vignettes/MsBackend.Rmd index a6423e63..c74f82e8 100644 --- a/vignettes/MsBackend.Rmd +++ b/vignettes/MsBackend.Rmd @@ -563,35 +563,39 @@ additionally available variables and the `columns` parameter of the (in addition to the required `"mz"` and `"intensity"` variables). -### `[` - -The `[` method allows to subset `MsBackend` objects. This operation is expected -to reduce a `MsBackend` object to the selected spectra. The method should -support to subset by indices or logical vectors and should also support -duplicating elements (i.e. when duplicated indices are used) as well as to -subset in arbitrary order. An error should be thrown if indices are out of -bounds, but the method should also support returning an empty backend with -`[integer()]`. Note that the `MsCoreUtils::i2index` function can be used to +### `extractByIndex()` and `[` + +The `extractByIndex()` and `[` methods allow to subset `MsBackend` objects. +This operation is expected to reduce a `MsBackend` object to the selected +spectra. These methods must also support duplication (e.g. `[c(1, 1, 1)]`) and +extraction in any arbitrary order (e.g. `[c(3, 1, 5, 3)]`). While both methods +subset the object, `extractByIndex()` only supports to subset with an `integer` +index, while `[`, to be compliant with the base R implementation, should support +to subset by indices or logical vectors. An error should be thrown if indices +are out of bounds, but the method should also support returning an empty backend +with `[integer()]`. Note that the `MsCoreUtils::i2index` function can be used to check for correct input (and convert the input to an `integer` index). -Below we implement a possible `[` for our test backend class. We ignore the -parameters `j` from the definition of the `[` generic, since we treat our data -to be one-dimensional (with each spectrum being one element).
+The `extractByIndex()` method is used by the data operation and analysis methods +on `Spectra` objects, while the `[` is intended to be used by the end user (if +needed). Below we implement `extractByIndex()` for our backend: ```{r} -setMethod("[", "MsBackendTest", function(x, i, j, ..., drop = FALSE) { - i <- MsCoreUtils::i2index(i, length = length(x)) - x@spectraVars <- x@spectraVars[i, ] - x@mz <- x@mz[i] - x@intensity <- x@intensity[i] - x +setMethod("extractByIndex", c("MsBackendTest", "ANY"), function(object, i) { + object@spectraVars <- object@spectraVars[i, ] + object@mz <- object@mz[i] + object@intensity <- object@intensity[i] + object }) ``` +The `[` does not need to be defined because a default implementation for +the base `MsBackend` exists. + We can now subset our backend to the last two spectra. ```{r} -a <- be[2:3] +a <- extractByIndex(be, 2:3) spectraData(a) ``` @@ -1586,6 +1590,23 @@ setMethod("backendParallelFactor", "MsBackend", function(object, ...) { ``` +### `backendRequiredSpectraVariables()` + +The `backendRequiredSpectraVariables()` method can be implemented if a backend +needs specific spectra variables to work. The default implementation is: + +```{r} +setMethod("backendRequiredSpectraVariables", "MsBackend", + function(object, ...) { + character() + }) +``` + +The implementation for `MsBackendMzR` returns `c("dataStorage", "scanIndex")` as +the backend needs these two spectra variables to load the MS data on-the-fly +from the original data files. + + ### `dropNaSpectraVariables()` The `dropNaSpectraVariables()` is supposed to allow removing all spectra @@ -1656,6 +1677,21 @@ This method thus retrieves first the MS levels of all spectra and then calls operation by selecting the unique MS levels directly using an SQL call. +### `precursorMz<-` + +Replace the values for the *precursor m/z* spectra +variable. Parameter `value` has to be of type `numeric` (`NA_real_` missing +values are supported, e.g. for MS1 spectra). 
The default implementation uses the +`$<-` method: + +```{r} +setReplaceMethod("precursorMz", "MsBackend", function(object, ..., value) { + object$precursorMz <- value + object +}) +``` + + ### `ionCount()` The `ionCount()` method should return a `numeric` (length equal to the number of diff --git a/vignettes/Spectra.Rmd b/vignettes/Spectra.Rmd index 74bcb1fb..35e0dfbb 100644 --- a/vignettes/Spectra.Rmd +++ b/vignettes/Spectra.Rmd @@ -360,25 +360,39 @@ Similar to spectra variables it is also possible to replace values for **existing** peaks variables using the `$<-` function. -## Filtering, subsetting and merging +## Filtering, aggregating and merging spectra data -Apart from *classical* subsetting operations such as `[` and `split()`, a set of -filter functions are defined for `Spectra` objects (for detailed help please see -the `?Spectra` help). Filter and subset functions either reduce the number of -spectra within a `Spectra` object, or affect the number of -peaks (by either aggregating or subset) within each spectrum. Filter functions -affecting the total number of spectra are (in alphabetic order): +Various functions are available to filter, subset and merge `Spectra` +objects. These can be generally subdivided into functions that subset or filter +*spectra data* and operations that filter *mass peak data*. A third category of +function allows to aggregate data within a `Spectra` or to merge and combine +multiple `Spectra` objects into one. Functions of the various categories are +described in the following subsections. Please refer to the function's +documentation for more details and information. + +### Filter spectra data + +These functions comprise subset operations that reduce the total number of +spectra in a `Spectra` object as well as filter functions that reduce the +content of the `Spectra`'s spectra data (i.e. the content of its +`spectraVariables()`). 
These functions thus don't change or affect the mass +peaks data of the `Spectra`'s individual spectra. + +- `[`: operation to reduce a `Spectra` object to selected elements. +- `dropNaSpectraVariables()`: drops `spectraVariables()` that contain only + missing values. The function returns a `Spectra` object with the same number + of elements, but with eventually fewer spectra variables. - `filterAcquisitionNum()`: retains spectra with certain acquisition numbers. - `filterDataOrigin()`: subsets to spectra from specific origins. - `filterDataStorage()`: subsets to spectra from certain data storage files. - `filterEmptySpectra()`: removes spectra without mass peaks. -- `filterMzRange()`: subsets spectra keeping only peaks with an m/z within the - provided m/z range. - `filterIsolationWindow()`: keeps spectra with the provided `mz` in their isolation window (m/z range). - `filterMsLevel()`: filters by MS level. - `filterPolarity()`: filters by polarity. +- `filterPrecursorCharge()`: retains (MSn) spectra with specified + precursor charge(s). - `filterPrecursorIsotopes()`: identifies precursor ions (from fragment spectra) that could represent isotopes of the same molecule. For each of these spectra groups only the spectrum of the monoisotopic precursor ion is returned. MS1 @@ -390,50 +404,59 @@ affecting the total number of spectra are (in alphabetic order): the provided m/z range. - `filterPrecursorMzValues(()`: retains (MSn) spectra with precursor m/z value matching the provided value(s) considering also a `tolerance` and `ppm`. -- `filterPrecursorCharge()`: retains (MSn) spectra with specified - precursor charge(s). - `filterPrecursorScan()`: retains (parent and children) scans of an acquisition number. -- `filterRanges()`: allows filtering of the `Spectra` object based on user - defined *numeric* ranges (parameter `ranges`) for one or more available - spectra variables in object (spectra variable names can be specified with - parameter `spectraVariables`). 
Spectra for which the value of a spectra - variable is within it's defined range are retained. If multiple - ranges/spectra variables are defined, the `match` parameter can be used - to specify whether all conditions (`match = "all"`; the default) or if - any of the conditions must match (`match = "any"`; all spectra for which - values are within any of the provided ranges are retained). +- `filterRanges()`: filters a `Spectra` object based on (multiple) user + defined *numeric* ranges for one or more available (numeric) spectra + variables. - `filterRt()`: filters based on retention time range. -- `filterValues()`: allows filtering of the `Spectra` object based on - similarities of *numeric* values of one or more `spectraVariables(object)` - (parameter `spectraVariables`) to provided values (parameter `values`) - given acceptable differences (parameters tolerance and ppm). If multiple - values/spectra variables are defined, the `match` parameter can be used - to specify whether all conditions (`match = "all"`; the default) or if - any of the conditions must match (`match = "any"`; all spectra for which - values are within any of the provided ranges are retained). - -Filter functions that return the same number of spectra, but affect/subset the -peaks data (m/z and intensity values) within each spectrum are: - -- `combinePeaks()`: groups peaks within each spectrum based on similarity of +- `filterValues()`: filters a `Spectra` object based on similarities of + *numeric* values of one or more available spectra variables. +- `selectSpectraVariables()`: reduces the (spectra) data within the object to + the selected spectra variables. + + +### Filter or aggregate mass peak data + +These functions filter or aggregate the mass peak data (`peaksData()`) of each +spectrum in a `Spectra` without changing the total number of spectra.
+ +- `combinePeaks()`: groups peaks **within each spectrum** based on similarity of their m/z values and combines these into a single peak per peak group. - `deisotopeSpectra()`: deisotopes each individual spectrum keeping only the monoisotopic peak for peaks groups of potential isotopologues. +- `filterFourierTransformArtefacts()`: removes (Orbitrap) fast fourier transform + artifact peaks from spectra. - `filterIntensity()`: filter each spectrum keeping only peaks with intensities meeting certain criteria. -- `filterMzRange()`: subsets peaks data within each spectrum keeping only peaks - with their m/z values within the specified m/z range. +- `filterMzRange()`: filters mass peaks keeping (or removing) those with an + m/z within the provided m/z range. +- `filterMzValues()`: filters mass peaks within each spectrum keeping (or + removing) those with an m/z matching the provided value(s). +- `filterPeaksRanges()`: filters mass peaks using any set of range-based filters + on numeric spectra or peaks variables. - `filterPrecursorPeaks()`: removes peaks with either an m/z value matching the precursor m/z of the respective spectrum (with parameter `mz = "=="`) or peaks with an m/z value larger or equal to the precursor m/z (with parameter `mz = ">="`). -- `filterMzValues()`: subsets peaks within each spectrum keeping or removing - (all) peaks matching provided m/z value(s) (given parameters `ppm` and - `tolerance`). - `reduceSpectra()`: filters individual spectra keeping only the largest peak for groups of peaks with similar m/z values. + +### Merging, aggregating and splitting + + +- `c()`: combine several `Spectra` into a single `Spectra` object. +- `combineSpectra()`: allows to combine the MS data from sets of spectra into a + single spectrum per set. Thus, instead of filtering the data, this function + aggregates it. +- `joinSpectraData()`: merge a `DataFrame` to the existing spectra data. 
+- `split()`: splits the `Spectra` object based on a provided grouping factor. + + + +### Examples and use cases for filter operations + In this example, we use the `filterValues()` function to retain spectra with a base peak m/z close to 100 (+/- 30 ppm) and a retention time around 230 (+/- 5 s). @@ -871,18 +894,26 @@ See also `?plotSpectra` for more plotting options and examples. The `Spectra` package provides the `combineSpectra()` function that allows to *aggregate* multiple spectra into a single one. The main parameters of this -function are `f`, which defines the grouping of the spectra, and `FUN` which -allows to define the function that performs the actual aggregation. The default -aggregation function is `combinePeaksData()` (see `?combinePeaksData` for -details) that combines multiple spectra into a single spectrum with all peaks -from all input spectra (with additional paramter `peaks = "union"`), or peaks -that are present in a certain proportion of input spectra (with parameter -`peaks = "intersect"`; parameter `minProp` allows to define the minimum -required proportion of spectra in which a peak needs to be present. Below we -use this function to combine the spectra for 1-methylhistidine and caffeine -into a single spectrum for each compound. We use the spectra variable `$name`, -that contains the names of the compounds, to define which spectra should be -grouped together. +function are `f`, which defines the sets of spectra that should be combined, and +`FUN`, which allows to define the function that performs the actual +aggregation. 
The default aggregation function is `combinePeaksData()` (see +`?combinePeaksData` for details) that combines multiple spectra into a single +spectrum with all peaks from all input spectra (with additional parameter `peaks += "union"`), or peaks that are present in a certain proportion of input spectra +(with parameter `peaks = "intersect"`; parameter `minProp` allows to define the +minimum required proportion of spectra in which a peak needs to be present). It +is important to mention that, by default, the function combines all mass peaks +from all spectra with a similar m/z value into a single, representative mass +peak aggregating all their intensities into one. To avoid the resulting +intensity to be affected by potential noise peaks it might be advised to first +*clean* the individual mass spectra using e.g. the `combinePeaks()` or +`reduceSpectra()` functions that first aggregate mass peaks **within** each +individual spectrum. + +In this example below we use `combineSpectra()` to combine the spectra for +1-methylhistidine and caffeine into a single spectrum for each compound. We use +the spectra variable `$name`, that contains the names of the compounds, to +define which spectra should be grouped together. ```{r} sps_agg <- combineSpectra(sps, f = sps$name) @@ -1213,38 +1244,51 @@ head(basename(dataStorage(sps_tmt))) A (possibly incomplete) list of R packages providing additional backends that add support for additional data types or storage options is provided below: -- `r BiocStyle::Biocpkg("MsBackendMgf")`: support for import/export of mass - spectrometry files in mascot generic format (MGF). -- `r BiocStyle::Biocpkg("MsBackendMsp")`: allows to import/export data in NIST - MSP format. Extends the `MsBackendDataFrame` and keeps thus all data, after - import, in memory. -- `MsBackendMassbank` (package `r BiocStyle::Biocpkg("MsBackendMassbank")`): - allows to import/export data in MassBank text file format.
Extends the - `MsBackendDataFrame` and keeps thus all data, after import, in memory. -- `MsBackendMassbankSql` (package `r BiocStyle::Biocpkg("MsBackendMassbank")`): - allows to directly connect to a MassBank SQL database to retrieve all MS data - and variables. Has a minimal memory footprint because all data is retrieved - on-the-fly from the SQL database. -- `r BiocStyle::Biocpkg("MsBackendSql")`: stores all MS data in a SQL database - and has thus a minimal memory footprint. - `MsBackendCompDb` (package `r BiocStyle::Biocpkg("CompoundDb")`): provides access to spectra data (spectra and peaks variables) from a *CompDb* database. Has a small memory footprint because all data (except precursor m/z values) are retrieved on-the-fly from the database. -- `r Biocpkg("MsBackendRawFileReader")`: implements a backend for reading MS - data from Thermo Fisher Scientific's raw data files using the manufacturer's - NewRawFileReader .Net libraries. The package generalizes the functionality - introduced by the `r Biocpkg("rawrr")` package, see also - [@kockmann_rawrr_2021]. + - `MsBackendHmdbXml` (package [`MsbackendHmdb`](https://github.com/rformassspectrometry/MsBackendHmdb)): allows import of MS data from xml files of the Human Metabolome Database (HMDB). Extends the `MsBackendDataFrame` and keeps thus all data, after import, in memory. + +- `MsBackendMassbank` (package `r BiocStyle::Biocpkg("MsBackendMassbank")`): + allows to import/export data in MassBank text file format. Extends the + `MsBackendDataFrame` and keeps thus all data, after import, in memory. + +- `MsBackendMassbankSql` (package `r BiocStyle::Biocpkg("MsBackendMassbank")`): + allows to directly connect to a MassBank SQL database to retrieve all MS data + and variables. Has a minimal memory footprint because all data is retrieved + on-the-fly from the SQL database. 
+ +- `MsBackendMetaboLights` (package `r + BiocStyle::Biocpkg("MsBackendMetaboLights")`): retrieves and caches MS data + files from the MetaboLights repository. + +- `MsBackendMgf`: (package `r BiocStyle::Biocpkg("MsBackendMgf")`): support for + import/export of mass spectrometry files in mascot generic format (MGF). + +- `MsBackendMsp`: (package `r BiocStyle::Biocpkg("MsBackendMsp")`): allows to + import/export data in NIST MSP format. Extends the `MsBackendDataFrame` and + keeps thus all data, after import, in memory. + +- `MsBackendRawFileReader` (package `r Biocpkg("MsBackendRawFileReader")`): + implements a backend for reading MS data from Thermo Fisher Scientific's raw + data files using the manufacturer's NewRawFileReader .Net libraries. The + package generalizes the functionality introduced by the `r Biocpkg("rawrr")` + package, see also [@kockmann_rawrr_2021]. + +- `MsBackendSql` (package `r BiocStyle::Biocpkg("MsBackendSql")`): stores all MS + data in a SQL database and has thus a minimal memory footprint. + - `MsBackendTimsTof` (package [`MsBackendTimsTof`](https://github.com/rformassspectrometry/MsBackendTimsTof): allows import of data from Bruker TimsTOF raw data files (using the `opentimsr` R package). + - `MsBackendWeizMass` (package [`MsBackendWeizMass`](https://github.com/rformassspectrometry/MsBackendWeizMass): allows to access MS data from WeizMass MS/MS spectral databases. @@ -1276,6 +1320,60 @@ a `lengths(sps)` call, the number of peaks per spectra could also be determined 5000L)`. In that way only peak data of 5000 spectra at a time will be loaded into memory. + +# Serializing (saving), moving and loading serialized `Spectra` objects + +Serializing and re-loading variables/objects during an analysis using e.g. the +`save()` and `load()` functions are common in many workflows, especially if some +of the tasks are computationally intensive and take long time. 
Sometimes such +serialized objects might even be moved from one computer (or file system) to +another. These operations are unproblematic for `Spectra` objects with +*in-memory* backends such as the `MsBackendMemory` or `MsBackendDataFrame`, that +keep all data in memory, would however break for *on-disk* backends such as the +`MsBackendMzR` if the file path to the original data files is not identical. It +is thus suggested (if the size of the MS data respectively the available system +memory allows it) to change the backend for such `Spectra` objects to a +`MsBackendMemory` before serializing the object with `save()`. For `Spectra` +objects with a `MsBackendMzR` an alternative option would be to eventually +update/adapt the path to the directory containing the raw (e.g. mzML) data +files: assuming these data files are available on both computers, the path to +the directory containing these can be updated with the `dataStorageBasePath<-` +function allowing thus to move/copy serialized `Spectra` objects between +computers or file systems. + +An example workflow could be: + +files *a.mzML*, *b.mzML* are stored in a directory */data/mzML/* on one +computer. These get loaded as a `Spectra` object with `MsBackendMzR` and then +serialized to a file *A.RData*. + +```{r, eval = FALSE} +A <- Spectra(c("/data/mzML/a.mzML", "/data/mzML/b.mzML")) +save(A, file = "A.RData") +``` + +Assuming this file gets now copied to another computer (where the data is not +available in a folder */data/mzML/*) and loaded with `load()`. + +```{r, eval = FALSE} +load("A.RData") +``` + +This `Spectra` object would not be valid because its `MsBackendMzR` can no +longer access the MS data in the original data files. 
Assuming the user also +copied the data files *a.mzML* and *b.mzML*, but to a folder +*/some_other_folder/*, the base storage path of the object would need to be +adapted to match the directory where the data files are available on the second +computer: + +```{r, eval = FALSE} +dataStorageBasePath(A) <- "/some_other_folder" +``` + +By pointing now the storage path to the new storage location of the data files, +the `Spectra` object `A` would also be usable on the second computer. + + # Session information ```{r si}