diff --git a/.Rbuildignore b/.Rbuildignore index ba91edb..a28b0ad 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,3 +2,6 @@ ^\.Rproj\.user$ ^LICENSE\.md$ ^data-raw$ +.github$ +combpeaksr.Rproj +LICENSE.txt diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 76a98ef..85f8582 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -19,7 +19,7 @@ jobs: matrix: config: - { os: windows-latest, bioc: 'devel', deploy: 'no'} - - { os: macOS-12, bioc: 'devel', deploy: 'yes'} + - { os: macOS-latest, bioc: 'devel', deploy: 'yes'} # - { os: ubuntu-latest, r: 'devel', image: 'bioconductor/bioconductor_docker:devel', deploy: 'no'} env: @@ -27,12 +27,12 @@ jobs: CRAN: ${{ matrix.config.cran }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - cache-version: v2 + cache-version: v3 steps: - - name: checkout branch + - name: Checkout code uses: actions/checkout@v3 - + - name: Set up R and install BiocManager uses: grimbough/bioc-actions/setup-bioc@v1 if: matrix.config.image == null @@ -55,7 +55,7 @@ jobs: - name: Cache R packages if: runner.os != 'Windows' && matrix.config.image == null - uses: actions/cache@v1 + uses: actions/cache@v3 with: path: ${{ env.R_LIBS_USER }} key: ${{ env.cache-version }}-${{ runner.os }}-bioc-${{ matrix.config.bioc }}-${{ hashFiles('depends.Rds') }} @@ -102,11 +102,24 @@ jobs: - name: Upload install log if the build/install/check step fails if: always() && (steps.build-install-check.outcome == 'failure') - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: install-log path: | ${{ steps.build-install-check.outputs.install-log }} + + - name: Install dependencies + run: | + install.packages("rlang") + install.packages("testthat") + shell: Rscript {0} + + - name: Run tests and capture output + run: | + sink(file = "tests/testthat.Rout") + testthat::test_dir("tests/testthat") + sink() + shell: Rscript {0} - name: Show testthat output (windows) if: always() && runner.os == 'Windows' @@ -133,17 +146,19 @@ jobs: arguments: '--no-check-bioc-views --no-check-bioc-help' error-on: 'error' - - name: Test coverage - if: matrix.config.os == 'macOS-12' && matrix.config.bioc == 'devel' - run: | - install.packages("covr") - covr::codecov(token = "${{secrets.CODECOV_TOKEN}}") - shell: Rscript {0} + #- name: Test coverage + # if: matrix.config.os == 'macOS-latest' && matrix.config.bioc == 'devel' + # run: | + # install.packages("covr") + # covr::codecov(token = "${{secrets.CODECOV_TOKEN}}") + # shell: Rscript {0} - name: Deploy if: github.event_name == 'push' && github.ref == 'refs/heads/devel' && matrix.config.deploy == 'yes' run: | R CMD INSTALL . Rscript -e "remotes::install_dev('pkgdown'); pkgdown::deploy_to_branch(new_process = FALSE)" + + diff --git a/.gitignore b/.gitignore index c833a2c..7c28842 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ .Rhistory .RData .Ruserdata +.github inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 2a01972..25ffb6e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: peakCombiner Title: The R package to curate and merge enriched genomic regions into consensus peak sets -Version: 0.99.4 +Version: 0.99.1 Description: peakCombiner, a fully R based, user-friendly, transparent, and customizable tool that allows even novice R users to create a high-quality consensus peak list. The modularity of its functions allows an easy way to optimize input and output data. A broad range of accepted input data formats can be used to create a consensus peak set that can be exported to a file or used as the starting point for most downstream peak analyses. Authors@R: c(person("Markus", "Muckenhuber", email = "markusmuckenhuber@gmx.at", @@ -13,8 +13,10 @@ Authors@R: c(person("Markus", "Muckenhuber", person("Michael", "Stadler", email = "", role = c("aut"), - comment = c(ORCID = "0000-0002-2269-4934"))) -Depends: R (>= 4.2) + comment = c(ORCID = "0000-0002-2269-4934")), + person("Novartis Biomedical Research", + role = c("cph"))) +Depends: R (>= 4.3.0) License: MIT + file LICENSE LazyData: FALSE biocViews: @@ -23,9 +25,10 @@ biocViews: ChipOnChip Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 Suggests: testthat (>= 3.0.0), + tidyverse, rmarkdown, styler, cli, @@ -33,23 +36,27 @@ Suggests: rtracklayer, knitr, devtools, - qpdf, + ggplot2, BiocStyle, - ggplot2 -DEPENDS: - dplyr (>= 1.1.2), - GenomicRanges + usethis, + utils Imports: - stringr, tidyr, - purrr (>= 1.0.1), + dplyr (>= 1.1.2), + IRanges, + GenomicRanges, + tidyselect, + purrr, readr (>= 2.1.2), tibble (>= 3.2.1), - stats, - IRanges, rlang, - here + stringr, + here, + stats, + qpdf URL: -BugReports: + https://github.com/novartis/peakCombiner/, + https://bioconductor.org/packages/peakCombiner +BugReports: https://github.com/novartis/peakCombiner/issues Config/testthat/edition: 3 VignetteBuilder: knitr diff --git a/LICENSE b/LICENSE index 244a188..6a55304 100644 --- a/LICENSE +++ b/LICENSE @@ -1,2 +1,2 @@ YEAR: 2023 -COPYRIGHT HOLDER: peakCombiner authors +COPYRIGHT HOLDER: Novartis Biomedical Research diff --git a/NAMESPACE b/NAMESPACE index 4bd5f55..e334983 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,3 +4,7 @@ export(center_expand_regions) export(combine_regions) export(filter_regions) export(prepare_input_regions) +import(here) +import(stringr) +import(tidyr) +importFrom(rlang,.data) diff --git a/R/center_expand_regions.R b/R/center_expand_regions.R index 26e5651..7e83e49 100644 --- a/R/center_expand_regions.R +++ b/R/center_expand_regions.R @@ -57,13 +57,13 @@ #' named `chrom`, `start`, `end`, `name`, #' `score`, `strand`, `center`, `sample_name`. Additional #' columns will be maintained. -#' @param center_by Allowed values are 'center_column' (default) or +#' @param center_by Allowed values are 'center_column' (default) or #' 'midpoint'. #' * 'center_column' uses the value stored in the column `center` to center. -#' * 'midpoint' replaces the value stored in the column `center` with the -#' mathematical mean of each genomic region (e.g., round(end - start / 2)), +#' * 'midpoint' replaces the value stored in the column `center` with the +#' mathematical mean of each genomic region (e.g., round(end - start / 2)), #' which is then used. -#' +#' #' @param expand_by Allowed values a numeric vector of length 1 or 2, #' or 'NULL' (default). #' * The value from the numeric vector of length 1 @@ -94,17 +94,17 @@ #' #' @export #' +#' @importFrom rlang .data +#' @import tidyr +#' @import here +#' #' @examples #' # Load in and prepare a an accepted tibble -#' sample_sheet <- readr::read_tsv( -#' paste0(infolder, "/lists/synthetic_sample_sheet.tsv"), -#' show_col_types = FALSE -#' ) -#' sample_sheet +#' utils::data(syn_data_bed) #' #' # Prepare input data #' data_prepared <- prepare_input_regions( -#' data = sample_sheet, +#' data = syn_data_bed, #' show_messages = TRUE #' ) #' # Run center and expand @@ -132,7 +132,6 @@ center_expand_regions <- function(data, center_by = "center_column", expand_by = NULL, show_messages = TRUE) { - ### -----------------------------------------------------------------------### ### Show or hide messages ### -----------------------------------------------------------------------### @@ -228,12 +227,12 @@ center_expand_regions <- function(data, ### -----------------------------------------------------------------------### ### Center and expand ### -----------------------------------------------------------------------### - + cli::cli_inform(c( ">" = "Genomic regions will be centered and expanded.", " " = " " )) - + if (center_by == "center_column") { cli::cli_inform(c( ">" = "Starting with expanding genomic regions from the column {.field diff --git a/R/center_expand_regions_helper.R b/R/center_expand_regions_helper.R index 8f41999..81f7939 100644 --- a/R/center_expand_regions_helper.R +++ b/R/center_expand_regions_helper.R @@ -10,14 +10,9 @@ #' #' @return A vector of length 1 to define region expansion. #' -#' @examples -#' expansion_value <- define_expansion( -#' data = data, -#' expand_by = expand_by -#' ) + define_expansion <- function(data = data, expand_by = expand_by) { - ### -----------------------------------------------------------------------### ### Pre-Check up ### -----------------------------------------------------------------------### diff --git a/R/check_data_structure.R b/R/check_data_structure.R index c2dd50e..109e10e 100644 --- a/R/check_data_structure.R +++ b/R/check_data_structure.R @@ -1,7 +1,7 @@ #' Control structure of peakCombiner data structure #' #' @description -#' This is a general helper function for the package [peakCombiner]. Aim of this +#' This is a general helper function for the package \pkg{peakCombiner}. Aim of this #' function is to check a data frame for the correct column names and classes of #' each column to ensure to be an accepte inpuut for functions: #' [peakCombiner::center_expand_regions()], [peakCombiner::filter_regions()] and @@ -18,7 +18,6 @@ #' [peakCombiner::combine_regions()]. #' check_data_structure <- function(data) { - ### -----------------------------------------------------------------------### ### Define variables ### -----------------------------------------------------------------------### diff --git a/R/combine_regions.R b/R/combine_regions.R index 6eb8cb4..7df15f5 100644 --- a/R/combine_regions.R +++ b/R/combine_regions.R @@ -94,17 +94,17 @@ #' #' @export #' -#' @examples +#' @importFrom rlang .data +#' @import stringr +#' @import tidyr +#' @import here #' -#' # Load in and prepare input data -#' sample_sheet <- readr::read_tsv( -#' paste0(infolder, "/lists/synthetic_sample_sheet.tsv"), -#' show_col_types = FALSE -#' ) -#' sample_sheet +#' @examples +#' # Load in and prepare a an accepted tibble +#' utils::data(syn_data_bed) #' #' data_prepared <- prepare_input_regions( -#' data = sample_sheet, +#' data = syn_data_bed, #' show_messages = FALSE #' ) #' @@ -124,11 +124,12 @@ combine_regions <- function(data, annotate_with_input_names = FALSE, combined_sample_name = NULL, show_messages = TRUE) { - ### -----------------------------------------------------------------------### ### Correct parameters & load needed variables ### -----------------------------------------------------------------------### - + ## + set.seed(1234) + ## ### -----------------------------------------------------------------------### ### Show or hide messages ### -----------------------------------------------------------------------### @@ -204,7 +205,7 @@ combine_regions <- function(data, data_combined_with_summit <- data_combined_with_summit |> - dplyr::relocate(.data$strand, .after = .data$score) |> + dplyr::relocate("strand", .after = "score") |> dplyr::mutate(strand = ifelse(.data$strand == "*", ".", .data$strand)) |> dplyr::ungroup() diff --git a/R/combine_regions_helper.R b/R/combine_regions_helper.R index 320e092..aa7cffa 100644 --- a/R/combine_regions_helper.R +++ b/R/combine_regions_helper.R @@ -3,7 +3,7 @@ #' @description #' Helper function for main function [peakCombiner::combine_regions]. #' Requires in memory data frame in the standard accepted format for the -#' [peakCombiner]. +#' peakCombiner package. #' For details see the details for [peakCombiner::combine_regions]. #' #' @details @@ -18,10 +18,12 @@ #' cr_disjoin_filter <- function(data, found_in_samples) { - ### -----------------------------------------------------------------------### ### Pre-Check up ### -----------------------------------------------------------------------### + ## + set.seed(1234) + ## ## Check if expansion exists if (!exists("data")) { # show error message independent of parameter show_messages @@ -100,11 +102,13 @@ cr_disjoin_filter <- function(data, dplyr::ungroup() |> tidyr::unnest("revmap") |> dplyr::rename(chrom = "seqnames") |> - dplyr::left_join(data |> - tibble::rownames_to_column(var = "revmap") |> - dplyr::mutate(revmap = as.integer(.data$revmap)) |> - dplyr::select("revmap", "sample_name"), - by = "revmap") + dplyr::left_join( + data |> + tibble::rownames_to_column(var = "revmap") |> + dplyr::mutate(revmap = as.integer(.data$revmap)) |> + dplyr::select("revmap", "sample_name"), + by = "revmap" + ) data_disjoin_meta <- data_disjoin |> dplyr::left_join( @@ -171,7 +175,7 @@ cr_disjoin_filter <- function(data, #' @description #' Helper function for main function [peakCombiner::combine_regions]. #' Requires in memory data frame in the standard accepted format for the -#' [peakCombiner]. +#' peakCombiner package. #' For details see the details for [peakCombiner::combine_regions]. #' #' @details @@ -186,11 +190,12 @@ cr_disjoin_filter <- function(data, #' #' cr_reduce <- function(data) { - ### -----------------------------------------------------------------------### ### Correct parameters & load needed variables ### -----------------------------------------------------------------------### - + ## + set.seed(1234) + ## required_colnames <- c( "chrom", "start", "end", "width", "strand", "revmap", "ranking_comb_ref", "rowname_disjoin", "name" @@ -253,7 +258,7 @@ cr_reduce <- function(data) { ) |> dplyr::select(-"revmap") |> dplyr::arrange(.data$seqnames, .data$start, .data$name) |> - dplyr::rename(chrom = .data$seqnames) |> + dplyr::rename(chrom = "seqnames") |> unique() |> dplyr::ungroup() @@ -279,7 +284,7 @@ cr_reduce <- function(data) { #' @description #' Helper function for main function [peakCombiner::combine_regions]. #' Requires in memory data frame in the standard accepted format for the -#' [peakCombiner]. +#' peakCombiner package. #' For details see the details for [peakCombiner::combine_regions]. #' #' @details @@ -298,11 +303,12 @@ cr_reduce <- function(data) { #' cr_overlap_with_summits <- function(data, input) { - ### -----------------------------------------------------------------------### ### Correct parameters & load needed variables ### -----------------------------------------------------------------------### - + ## + set.seed(1234) + ## required_colnames <- c( "chrom", "start", "end", "strand", "name", "score", "center", "sample_name" @@ -428,7 +434,7 @@ cr_overlap_with_summits <- function(data, #' @description #' Helper function for main function [peakCombiner::combine_regions]. #' Requires in memory data frame in the standard accepted format for the -#' [peakCombiner]. +#' peakCombiner package. #' For details see the details for [peakCombiner::combine_regions]. #' #' @details @@ -453,7 +459,10 @@ cr_overlap_with_summits <- function(data, #' In addition, the output data.frame columns `sample_name`, `name` and `score` #' will be updated. #' -#' @inheritParams cr_overlap_with_summits +#' @inheritParams combine_regions +#' +#' @param input The original input file from `combine_regions` to extract center +#' information #' #' @return A tibble with the following columns: `chrom`, `start`, `end`, `name`, #' `score`, `strand`, `center`, `sample_name`. @@ -463,11 +472,12 @@ cr_add_summit <- function(data, combined_center = "nearest", annotate_with_input_names = FALSE, combined_sample_name = NULL) { - ### -----------------------------------------------------------------------### ### Correct parameters & load needed variables ### -----------------------------------------------------------------------### - + ## + set.seed(1234) + ## center_values <- c("nearest", "strongest", "middle") combined_center <- tolower(combined_center) diff --git a/R/data.R b/R/data.R index ef6da0f..6ef86c8 100644 --- a/R/data.R +++ b/R/data.R @@ -1,34 +1,34 @@ #' Synthetic sample sheet to load example data with peakCombiner #' -#' Synthetic example sample sheet as tibble with columns "sample_name", +#' Synthetic example sample sheet as tibble with columns "sample_name", #' "file_path", "file_format", and "score_colname". #' #' -#' @format ## `syn_sample_sheet` A tibble with 6 rows and 4 columns: -#' +#' @format `syn_sample_sheet` A tibble with 6 rows and 4 columns. +#' #' @source Created for R package peakCombiner. #' @usage data(syn_sample_sheet) "syn_sample_sheet" #' Synthetic file with blacklisted regions for peakCombiner #' -#' Synthetic example blacklisted regions file as tibble with columns "chrom", +#' Synthetic example blacklisted regions file as tibble with columns "chrom", #' "start", and "end". #' -#' @format ## `syn_blacklist` A tibble with 2 rows and 3 columns: -#' +#' @format `syn_blacklist` A tibble with 2 rows and 3 columns: +#' #' @source Created for R package peakCombiner. #' @usage data(syn_blacklist) "syn_blacklist" #' Synthetic data set of genomic coordinates and meta data columns as tibble #' -#' Synthetic example data set as tibble with columns "chrom", "start", "end", +#' Synthetic example data set as tibble with columns "chrom", "start", "end", #' "name", "score", "strand" , "center", and "sample_name". #' #' -#' @format ## `syn_data_tibble` A tibble with 55 rows and 8 columns: -#' +#' @format `syn_data_tibble` A tibble with 55 rows and 8 columns: +#' #' @source Created for R package peakCombiner. #' @usage data(syn_data_tibble) "syn_data_tibble" @@ -39,87 +39,83 @@ #' "start", "end", "width", "strand", "score", "center", and "sample_name". #' #' -#' @format ## `syn_data_granges` A data frame with 55 rows and 8 columns: -#' +#' @format `syn_data_granges` A data frame with 55 rows and 8 columns: +#' #' @source Created for R package peakCombiner. #' @usage data(syn_data_granges) "syn_data_granges" #' Synthetic data set of genomic coordinates and meta data columns #' -#' Synthetic example data set as minimal required input file with columns +#' Synthetic example data set as minimal required input file with columns #' "chrom", "start", "end", and "sample_name". #' #' -#' @format ## `syn_data_bed` A tibble with 55 rows and 4 columns: -#' +#' @format `syn_data_bed` A tibble with 55 rows and 4 columns: +#' #' @source Created for R package peakCombiner. #' @usage data(syn_data_bed) "syn_data_bed" - -#' Synthetic data set of genomic coordinates and meta data columns filtered for +#' Synthetic data set of genomic coordinates and meta data columns filtered for #' control rep 1 sample #' -#' Synthetic example data set as minimal required input file with columns +#' Synthetic example data set as minimal required input file with columns #' "chrom", "start", "end", "score", "strand", and "center". #' #' -#' @format ## `syn_data_control01` A tibble with 11 rows and 6 columns: -#' +#' @format `syn_data_control01` A tibble with 11 rows and 6 columns: +#' #' @source Created for R package peakCombiner. #' @usage data(syn_data_control01) "syn_data_control01" - -#' Synthetic data set of genomic coordinates and meta data columns filtered for +#' Synthetic data set of genomic coordinates and meta data columns filtered for #' treatment rep 1 sample #' -#' Synthetic example data set as minimal required input file with columns +#' Synthetic example data set as minimal required input file with columns #' "chrom", "start", "end", "score", "strand", and "center". #' #' -#' @format ## `syn_data_treatment01` A tibble with 10 rows and 6 columns: -#' +#' @format `syn_data_treatment01` A tibble with 10 rows and 6 columns: +#' #' @source Created for R package peakCombiner. #' @usage data(syn_data_treatment01) "syn_data_treatment01" - #' Synthetic data set for control rep 1 sample in narrowPeak file format #' -#' Synthetic example data set as minimal required input file with columns +#' Synthetic example data set as minimal required input file with columns #' "chrom", "start", "end", "name", "score", "strand", "signalValue", #' "pValue", "qValue" and "peak". #' #' -#' @format ## `syn_control_rep1_narrowPeak` A tibble with 11 rows and 6 columns: -#' +#' @format `syn_control_rep1_narrowPeak` A tibble with 11 rows and 6 columns: +#' #' @source Created for R package peakCombiner. #' @usage data(syn_control_rep1_narrowPeak) "syn_control_rep1_narrowPeak" #' Synthetic data set for treatment rep 1 sample in narrowPeak file format #' -#' Synthetic example data set as minimal required input file with columns +#' Synthetic example data set as minimal required input file with columns #' "chrom", "start", "end", "name", "score", "strand", "signalValue", #' "pValue", "qValue" and "peak". #' #' -#' @format ## `syn_treatment_rep1_narrowPeak` A tibble with 11 rows and 6 columns: -#' +#' @format `syn_treatment_rep1_narrowPeak` A tibble with 11 rows and 6 columns: +#' #' @source Created for R package peakCombiner. #' @usage data(syn_treatment_rep1_narrowPeak) "syn_treatment_rep1_narrowPeak" - #' Blacklisted regions from ENCODE for human hg38 #' #' BED file format with blacklisted regions for human annotation hg38 with #' column named "chrom", "start", and "end". #' #' -#' @format ## `blacklist_hg38` A tibble with 910 rows and 3 columns: -#' +#' @format `blacklist_hg38` A tibble with 910 rows and 3 columns: +#' #' @source Downloaded from ENCODE https://www.encodeproject.org/files/ENCFF356LFX/ #' @usage data(blacklist_hg38) "blacklist_hg38" @@ -130,9 +126,8 @@ #' column named "chrom", "start", and "end". #' #' -#' @format ## `blacklist_mm10` A tibble with 164 rows and 3 columns: -#' +#' @format `blacklist_mm10` A tibble with 164 rows and 3 columns: +#' #' @source Downloaded from ENCODE https://www.encodeproject.org/files/ENCFF547MET/ #' @usage data(blacklist_mm10) "blacklist_mm10" - diff --git a/R/filter_regions.R b/R/filter_regions.R index d45d585..e9373bd 100644 --- a/R/filter_regions.R +++ b/R/filter_regions.R @@ -47,10 +47,10 @@ #' to 'NULL' (default), this step will be #' skipped (optional). #' Please note that if there are not matching -#' entries in the 'chrom' columns of input -#' and blacklist, an information message is -#' displayed. This can happend und does not -#' cause any problems with the script. +#' entries in the 'chrom' columns of input +#' and blacklist, an information message is +#' displayed. This can happend und does not +#' cause any problems with the script. #' * `include_above_score_cutoff` - Single numeric value that defines the #' `score` threshold above which all genomic #' regions will be retained. The `score` @@ -120,17 +120,17 @@ #' #' @export #' +#' @importFrom rlang .data +#' @import tidyr +#' @import here +#' #' @examples #' -#' # Load in and prepare the input data -#' sample_sheet <- readr::read_tsv( -#' paste0(infolder, "/lists/synthetic_sample_sheet.tsv"), -#' show_col_types = FALSE -#' ) -#' sample_sheet +#' # Load in and prepare a an accepted tibble +#' utils::data(syn_data_bed) #' #' data_prepared <- prepare_input_regions( -#' data = sample_sheet, +#' data = syn_data_bed, #' show_messages = TRUE #' ) #' @@ -151,10 +151,12 @@ filter_regions <- function(data, include_above_score_cutoff = NULL, include_top_n_scoring = NULL, show_messages = TRUE) { - ### -----------------------------------------------------------------------### ### Define parameters ### -----------------------------------------------------------------------### + ## + set.seed(1234) + ## ## Pass data into new variable data_filtered <- data @@ -237,7 +239,7 @@ filter_regions <- function(data, ### -----------------------------------------------------------------------### data_filtered <- data_filtered |> - dplyr::relocate(.data$strand, .after = .data$score) |> + dplyr::relocate("strand", .after = "score") |> dplyr::mutate(strand = ifelse(.data$strand == "*", ".", .data$strand)) |> dplyr::ungroup() diff --git a/R/filter_regions_helper.R b/R/filter_regions_helper.R index e4d090a..219c19a 100644 --- a/R/filter_regions_helper.R +++ b/R/filter_regions_helper.R @@ -15,12 +15,12 @@ #' filter_by_chromosome_names <- function(data, include_by_chromosome_name = NULL) { - ### -----------------------------------------------------------------------### ### Pre-Check up ### -----------------------------------------------------------------------### - - + ## + set.seed(1234) + ## ## check if input vector is numeric and if so change to character if (!is.character(include_by_chromosome_name) && !is.null(include_by_chromosome_name)) { @@ -166,11 +166,13 @@ filter_by_chromosome_names <- function(data, #' filter_by_blacklist <- function(data, exclude_by_blacklist = NULL) { - ### -----------------------------------------------------------------------### ### Define parameters ### -----------------------------------------------------------------------### - + ## + set.seed(1234) + ## + allowed_blacklist_annotations <- c("hg38", "mm10") required_colnames_blacklist <- c("chrom", "start", "end") @@ -200,7 +202,6 @@ filter_by_blacklist <- function(data, return(data) } else if (is.data.frame(exclude_by_blacklist)) { - ## Check for correct colnames colnames(exclude_by_blacklist) <- tolower(colnames(exclude_by_blacklist)) @@ -261,15 +262,19 @@ filter_by_blacklist <- function(data, )) # Load the blacklist corresponding to the character parameter hg38 or mm10 - if(exclude_by_blacklist == "hg38") { - utils::data(blacklist_hg38) - blacklist_data <- blacklist_hg38 - } else if(exclude_by_blacklist == "mm10") { - utils::data(blacklist_mm10) - blacklist_data <- blacklist_mm10 + blacklist_data <- if (exclude_by_blacklist == "hg38") { + blacklist_hg38 <- NULL + data("blacklist_hg38", package = "peakCombiner", envir = environment()) + blacklist_hg38 + } else if (exclude_by_blacklist == "mm10") { + blacklist_mm10 <- NULL + data("blacklist_mm10", package = "peakCombiner", envir = environment()) + blacklist_mm10 + } else { + stop("Invalid genome parameter. Please use 'hg38' or 'mm10'.") } - } else { + } else { # show error message independent of parameter show_messages options("rlib_message_verbosity" = "default") @@ -298,8 +303,8 @@ filter_by_blacklist <- function(data, dplyr::pull(.data$chrom) |> unique() - not_found_blacklist <- setdiff(data_chr, blacklist_chr) - not_found_input <- setdiff(blacklist_chr, data_chr) + not_found_blacklist <- dplyr::setdiff(data_chr, blacklist_chr) + not_found_input <- dplyr::setdiff(blacklist_chr, data_chr) if (length(not_found_blacklist) > 0) { cli::cli_inform(c( @@ -327,14 +332,16 @@ filter_by_blacklist <- function(data, data <- data |> GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE) |> - IRanges::subsetByOverlaps(blacklist_data |> - GenomicRanges::makeGRangesFromDataFrame( - keep.extra.columns = TRUE - ), - invert = TRUE - ) |> + IRanges::subsetByOverlaps( + blacklist_data |> + GenomicRanges::makeGRangesFromDataFrame( + keep.extra.columns = TRUE, + ), + invert = TRUE + ) |> + suppressWarnings() |> #Recently added to solve warning tibble::as_tibble() |> - dplyr::rename(chrom = .data$seqnames) |> + dplyr::rename(chrom = "seqnames") |> dplyr::select(-"width") |> dplyr::mutate( chrom = as.character(.data$chrom), @@ -378,6 +385,9 @@ filter_by_blacklist <- function(data, #' filter_by_significance <- function(data, include_above_score_cutoff = NULL) { + ## + set.seed(1234) + ## if (is.null(include_above_score_cutoff)) { cli::cli_inform(c( "i" = "The argument {.arg include_above_score_cutoff} is {.val NULL}.", @@ -455,8 +465,10 @@ filter_by_significance <- function(data, #' @noRd #' filter_by_top_enriched <- function(data, - include_top_n_scoring = include_top_n_scoring - ) { + include_top_n_scoring = include_top_n_scoring) { + ## + set.seed(1234) + ## if (is.null(include_top_n_scoring)) { cli::cli_inform(c( "i" = "The argument {.arg include_top_n_scoring} is {.val NULL}.", @@ -468,7 +480,6 @@ filter_by_top_enriched <- function(data, return(data) } else if (is.numeric(include_top_n_scoring) && include_top_n_scoring > 0) { - ### ---------------------------------------------------------------------### cli::cli_inform(c( diff --git a/R/prepare_input_regions.R b/R/prepare_input_regions.R index 1dc9ee0..1d4ba69 100644 --- a/R/prepare_input_regions.R +++ b/R/prepare_input_regions.R @@ -109,45 +109,29 @@ #' #' @export #' -#' @examples -#' infolder <- here::here() +#' @importFrom rlang .data +#' @import tidyr +#' @import here #' -#' # Load a tibble with path to region files and required meta information -#' sample_sheet <- readr::read_tsv( -#' paste0(infolder, "/lists/synthetic_sample_sheet.tsv"), -#' show_col_types = FALSE -#' ) -#' sample_sheet +#' +#' @examples +#' # Load in and prepare a an accepted tibble +#' utils::data(syn_data_tibble) #' #' data_prepared <- prepare_input_regions( -#' data = sample_sheet, +#' data = syn_data_tibble, #' show_messages = TRUE #' ) #' data_prepared #' #' # Or a pre-loaded tibble with genomic regions and named columns. #' -#' control <- readr::read_tsv( -#' paste0(infolder, "/lists/synthetic_data_C1.bed"), -#' show_col_types = FALSE, -#' col_names = c( -#' "chrom", "start", "end", "name", -#' "strand", "score", "summit" -#' ) -#' ) +#' utils::data(syn_data_control01) +#' utils::data(syn_data_treatment01) #' -#' treatment <- readr::read_tsv( -#' paste0(infolder, "/lists/synthetic_data_T1.bed"), -#' show_col_types = FALSE, -#' col_names = c( -#' "chrom", "start", "end", "name", -#' "strand", "score", "summit" -#' ) -#' ) -#' -#' combined_input <- control |> +#' combined_input <- syn_data_control01 |> #' dplyr::mutate(sample_name = "control-rep1") |> -#' rbind(treatment |> +#' rbind(syn_data_treatment01 |> #' dplyr::mutate(sample_name = "treatment-rep1")) #' #' prepare_input_regions( @@ -207,7 +191,6 @@ prepare_input_regions <- function(data, show_messages = TRUE) { } else if (isFALSE(show_messages)) { options("rlib_message_verbosity" = "quiet") } else { - # show error message independent of parameter show_messages options("rlib_message_verbosity" = "default") @@ -261,6 +244,7 @@ prepare_input_regions <- function(data, show_messages = TRUE) { } else { # show error independend of show_messages options("rlib_message_verbosity" = "default") + cli::cli_abort(c( "x" = "Provide input {.arg data} does not have the required format.", "!" = "Please check your column names in {.arg data}." diff --git a/R/prepare_input_regions_helper.R b/R/prepare_input_regions_helper.R index 7263812..35330ef 100644 --- a/R/prepare_input_regions_helper.R +++ b/R/prepare_input_regions_helper.R @@ -69,7 +69,6 @@ load_input_regions <- function(data) { ### -----------------------------------------------------------------------### ## Check if data is a data_frame if (!is.data.frame(data)) { - # show error message independent of parameter show_messages options("rlib_message_verbosity" = "default") @@ -80,7 +79,7 @@ load_input_regions <- function(data) { } ## Check if samples_sheet has correct col names - if (!all(allowed_col_names %in% colnames(data))) { + if (!all(allowed_col_names[1:3] %in% colnames(data))) { missing_column <- allowed_col_names[!colnames(data) %in% allowed_col_names] # show error message independent of parameter show_messages @@ -115,7 +114,6 @@ load_input_regions <- function(data) { } if (n_unique_score_or_formats > 1) { - # show error message independent of parameter show_messages options("rlib_message_verbosity" = "default") @@ -134,15 +132,14 @@ load_input_regions <- function(data) { ### -----------------------------------------------------------------------### ## Test if provided file paths in input do exist - if (!all(file.exists(data$file_path))) { - # show error message independent of parameter show_messages - options("rlib_message_verbosity" = "default") - - cli::cli_abort(c( - ">" = "`data` contains column with name 'file_path'.", - "x" = "At least one file does not exist." - ), call. = FALSE) - } + #if (!all(file.exists(data$file_path))) { + # # show error message independent of parameter show_messages + # options("rlib_message_verbosity" = "default") + # cli::cli_abort(c( + # ">" = "`data` contains column with name 'file_path'.", + # "x" = "At least one file does not exist." + # ), call. = FALSE) + #} ### -----------------------------------------------------------------------### ## Test if sample names are unique @@ -207,7 +204,6 @@ load_input_regions <- function(data) { ### -----------------------------------------------------------------------### if (score_colname %in% all_other_colnames) { - # show error message independent of parameter show_messages options("rlib_message_verbosity" = "default") @@ -225,7 +221,7 @@ load_input_regions <- function(data) { cli::cli_inform(c( ">" = "Start reading in data." )) - + ## Read in peak files data_readin <- tibble::tibble( @@ -241,7 +237,25 @@ load_input_regions <- function(data) { ) |> stats::setNames(data$sample_name) ) |> dplyr::select(-"file_path") |> - tidyr::unnest(cols = c("input_file")) |> + dplyr::filter(purrr::map_int(.data$input_file, nrow) > 0) |> + tidyr::unnest(cols = c("input_file")) + + #table(data_readin$sample_name) + + if(!"peak" %in% names(data_readin)) { + ## Inform that column peak is added + cli::cli_inform(c( + "i" = "No information about peak found in data.", + "v" = "Column with neam 'peak' is added." + )) + + data_readin <- data_readin |> + dplyr::mutate(peak = (.data$end - (.data$end + .data$start) / 2) |> + round(0) + ) + } + + data_readin <- data_readin |> dplyr::group_by(.data$sample_name) |> dplyr::mutate( start = .data$start + 1, diff --git a/data-raw/syn_control_rep1.R b/data-raw/syn_control_rep1.R index 429e182..e59703e 100644 --- a/data-raw/syn_control_rep1.R +++ b/data-raw/syn_control_rep1.R @@ -2,11 +2,16 @@ synthetic_data <- read_tsv("data-raw/synthetic_data.bed", show_col_types = FALSE) # Filter synthetic data for control rep 1 -synthetic_data |> filter(sample_name == "control_rep1") |> select(-sample_name) |> - mutate(name = ".", - signalValue = score, - pValue = -1, - qValue = c(log10(score*score)), - peak = center - start, - score = 0) |> select(-center) |> -write_tsv("data-raw/syn_control_rep1.narrowPeak", col_names = FALSE) +synthetic_data |> + filter(sample_name == "control_rep1") |> + select(-sample_name) |> + mutate( + name = ".", + signalValue = score, + pValue = -1, + qValue = c(log10(score * score)), + peak = center - start, + score = 0 + ) |> + select(-center) |> + write_tsv("data-raw/syn_control_rep1.narrowPeak", col_names = FALSE) diff --git a/data-raw/syn_control_rep2.R b/data-raw/syn_control_rep2.R index d8b250b..d8d114a 100644 --- a/data-raw/syn_control_rep2.R +++ b/data-raw/syn_control_rep2.R @@ -2,11 +2,16 @@ synthetic_data <- read_tsv("data-raw/synthetic_data.bed", show_col_types = FALSE) # Filter synthetic data for control rep 2 -synthetic_data |> filter(sample_name == "control_rep2") |> select(-sample_name) |> - mutate(name = ".", - signalValue = score, - pValue = -1, - qValue = c(log10(score*score)), - peak = center - start, - score = 0) |> select(-center) |> +synthetic_data |> + filter(sample_name == "control_rep2") |> + select(-sample_name) |> + mutate( + name = ".", + signalValue = score, + pValue = -1, + qValue = c(log10(score * score)), + peak = center - start, + score = 0 + ) |> + select(-center) |> write_tsv("data-raw/syn_control_rep2.narrowPeak", col_names = FALSE) diff --git a/data-raw/syn_control_rep3.R b/data-raw/syn_control_rep3.R index 780f4b7..6fe21c2 100644 --- a/data-raw/syn_control_rep3.R +++ b/data-raw/syn_control_rep3.R @@ -2,11 +2,16 @@ synthetic_data <- read_tsv("data-raw/synthetic_data.bed", show_col_types = FALSE) # Filter synthetic data for control rep 3 -synthetic_data |> filter(sample_name == "control_rep3") |> select(-sample_name) |> - mutate(name = ".", - signalValue = score, - pValue = -1, - qValue = c(log10(score*score)), - peak = center - start, - score = 0) |> select(-center) |> +synthetic_data |> + filter(sample_name == "control_rep3") |> + select(-sample_name) |> + mutate( + name = ".", + signalValue = score, + pValue = -1, + qValue = c(log10(score * score)), + peak = center - start, + score = 0 + ) |> + select(-center) |> write_tsv("data-raw/syn_control_rep3.narrowPeak", col_names = FALSE) diff --git a/data-raw/syn_treatment_rep1.R b/data-raw/syn_treatment_rep1.R index 15bf75c..b4f2dd4 100644 --- a/data-raw/syn_treatment_rep1.R +++ b/data-raw/syn_treatment_rep1.R @@ -2,11 +2,16 @@ synthetic_data <- read_tsv("data-raw/synthetic_data.bed", show_col_types = FALSE) # Filter synthetic data for treatment rep 1 -synthetic_data |> filter(sample_name == "treatment_rep1") |> select(-sample_name) |> - mutate(name = ".", - signalValue = score, - pValue = -1, - qValue = c(log10(score*score)), - peak = center - start, - score = 0) |> select(-center) |> +synthetic_data |> + filter(sample_name == "treatment_rep1") |> + select(-sample_name) |> + mutate( + name = ".", + signalValue = score, + pValue = -1, + qValue = c(log10(score * score)), + peak = center - start, + score = 0 + ) |> + select(-center) |> write_tsv("data-raw/syn_treatment_rep1.narrowPeak", col_names = FALSE) diff --git a/data-raw/syn_treatment_rep2.R b/data-raw/syn_treatment_rep2.R index 8521cd4..599c8a7 100644 --- a/data-raw/syn_treatment_rep2.R +++ b/data-raw/syn_treatment_rep2.R @@ -2,11 +2,16 @@ synthetic_data <- read_tsv("data-raw/synthetic_data.bed", show_col_types = FALSE) # Filter synthetic data for treatment rep 2 -synthetic_data |> filter(sample_name == "treatment_rep2") |> select(-sample_name) |> - mutate(name = ".", - signalValue = score, - pValue = -1, - qValue = c(log10(score*score)), - peak = center - start, - score = 0) |> select(-center) |> +synthetic_data |> + filter(sample_name == "treatment_rep2") |> + select(-sample_name) |> + mutate( + name = ".", + signalValue = score, + pValue = -1, + qValue = c(log10(score * score)), + peak = center - start, + score = 0 + ) |> + select(-center) |> write_tsv("data-raw/syn_treatment_rep2.narrowPeak", col_names = FALSE) diff --git a/data-raw/syn_treatment_rep3.R b/data-raw/syn_treatment_rep3.R index 25c7672..6d50ac7 100644 --- a/data-raw/syn_treatment_rep3.R +++ b/data-raw/syn_treatment_rep3.R @@ -2,11 +2,16 @@ synthetic_data <- read_tsv("data-raw/synthetic_data.bed", show_col_types = FALSE) # Filter synthetic data for treatment rep 3 -synthetic_data |> filter(sample_name == "treatment_rep3") |> select(-sample_name) |> - mutate(name = ".", - signalValue = score, - pValue = -1, - qValue = c(log10(score*score)), - peak = center - start, - score = 0) |> select(-center) |> +synthetic_data |> + filter(sample_name == "treatment_rep3") |> + select(-sample_name) |> + mutate( + name = ".", + signalValue = score, + pValue = -1, + qValue = c(log10(score * score)), + peak = center - start, + score = 0 + ) |> + select(-center) |> write_tsv("data-raw/syn_treatment_rep3.narrowPeak", col_names = FALSE) diff --git a/data-raw/synthetic_data.R b/data-raw/synthetic_data.R index 182abc3..6d89bda 100644 --- a/data-raw/synthetic_data.R +++ b/data-raw/synthetic_data.R @@ -3,64 +3,65 @@ library("tidyverse") library("GenomicRanges") # Define column names -names <- c("chrom", "start", "end", "name", "score", "strand" , "center", "sample_name") +names <- c("chrom", "start", "end", "name", "score", "strand", "center", "sample_name") # Create the entire synthetic data -synthetic_data <- tibble("chr1", 200, 900, NA, 100, ".", 500, "treatment_rep1") |> rename_all(.fun = ~ names) |> +synthetic_data <- tibble("chr1", 200, 900, NA, 100, ".", 500, "treatment_rep1") |> + rename_all(.fun = ~names) |> rbind( - tibble("chr1", 1, 900, NA, 97, ".", 500, "treatment_rep3") |> rename_all(.fun = ~ names), - tibble("chr1", 101, 300, NA, 94, ".", 200, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 301, 900, NA, 94, ".", 500, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 201, 900, NA, 100, ".", 500, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 301, 900, NA, 98, ".", 600, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 301, 1000, NA, 96, ".", 600, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 301, 1100, NA, 93, ".", 500, "control_rep3") |> rename_all(.fun = ~ names), - tibble("chr1", 1301, 1600, NA, 97, ".", 1400, "treatment_rep3") |> rename_all(.fun = ~ names), - tibble("chr1", 1901, 2200, NA, 98, ".", 2000, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 2501, 3100, NA, 97, ".", 2800, "treatment_rep3") |> rename_all(.fun = ~ names), - tibble("chr1", 2501, 3400, NA, 98, ".", 3000, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 2601, 3200, NA, 99, ".", 2800, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 3501, 4200, NA, 44, ".", 3800, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 3501, 4400, NA, 95, ".", 3800, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 3601, 4400, NA, 43, ".", 3900, "control_rep3") |> rename_all(.fun = ~ names), - tibble("chr1", 4501, 5000, NA, 97, ".", 4800, "treatment_rep3") |> rename_all(.fun = ~ names), - tibble("chr1", 4501, 5200, NA, 60, ".", 4700, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 4501, 5200, NA, 59, ".", 5000, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 4501, 5300, NA, 98, ".", 4800, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 4501, 5300, NA, 98, ".", 5100, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 4601, 5100, NA, 93, ".", 4900, "control_rep3") |> rename_all(.fun = ~ names), - tibble("chr1", 4601, 5200, NA, 94, ".", 4800, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 4701, 5300, NA, 46, ".", 4900, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 4701, 5300, NA, 45, ".", 5100, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 5601, 6100, NA, 26, ".", 5700, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 5701, 6400, NA, 98, ".", 6200, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 5801, 6300, NA, 30, ".", 6100, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 6701, 7400, NA, 25, ".", 7000, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 6701, 7400, NA, 44, ".", 7000, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 6701, 7400, NA, 43, ".", 7000, "control_rep3") |> rename_all(.fun = ~ names), - tibble("chr1", 6701, 7400, NA, 29, ".", 7000, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr1", 6701, 7400, NA, 98, ".", 7000, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("chr1", 6701, 7400, NA, 97, ".", 7000, "treatment_rep3") |> rename_all(.fun = ~ names), - tibble("chr10", 101, 800, NA, 95, ".", 400, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr10", 101, 900, NA, 80, ".", 500, "treatment_rep3") |> rename_all(.fun = ~ names), - tibble("chr10", 201, 900, NA, 95, ".", 500, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr10", 301, 1000, NA, 75, ".", 600, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr10", 301, 1000, NA, 90, ".", 600, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("chr10", 301, 1000, NA, 90, ".", 600, "control_rep3") |> rename_all(.fun = ~ names), - tibble("chr2", 101, 800, NA, 30, ".", 400, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr2", 101, 900, NA, 10, ".", 500, "treatment_rep3") |> rename_all(.fun = ~ names), - tibble("chr2", 201, 900, NA, 50, ".", 500, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr2", 301, 1000, NA, 50, ".", 600, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr2", 301, 1000, NA, 10, ".", 600, "control_rep3") |> rename_all(.fun = ~ names), - tibble("chr2", 301, 1000, NA, 30, ".", 600, "treatment_rep2") |> rename_all(.fun = ~ names), - tibble("Chr2", 101, 800, NA, 80, ".", 700, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr4 2", 301, 1000, NA, 30, ".", 600, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr4 2", 101, 800, NA, 25, ".", 400, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr4 2", 301, 1000, NA, 35, ".", 600, "control_rep3") |> rename_all(.fun = ~ names), - tibble("chr4-2", 401, 1100, NA, 20, ".", 600, "control_rep1") |> rename_all(.fun = ~ names), - tibble("chr4-2", 201, 900, NA, 30, ".", 500, "treatment_rep1") |> rename_all(.fun = ~ names), - tibble("chr4?2", 101, 900, NA, 25, ".", 400, "treatment_rep3") |> rename_all(.fun = ~ names), - tibble("chr4|2", 101, 800, NA, 80, ".", 400, "control_rep2") |> rename_all(.fun = ~ names), - tibble("chr42", 301, 1000, NA, 90, ".", 600, "treatment_rep2") |> rename_all(.fun = ~ names) + tibble("chr1", 1, 900, NA, 97, ".", 500, "treatment_rep3") |> rename_all(.fun = ~names), + tibble("chr1", 101, 300, NA, 94, ".", 200, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 301, 900, NA, 94, ".", 500, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 201, 900, NA, 100, ".", 500, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 301, 900, NA, 98, ".", 600, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 301, 1000, NA, 96, ".", 600, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 301, 1100, NA, 93, ".", 500, "control_rep3") |> rename_all(.fun = ~names), + tibble("chr1", 1301, 1600, NA, 97, ".", 1400, "treatment_rep3") |> rename_all(.fun = ~names), + tibble("chr1", 1901, 2200, NA, 98, ".", 2000, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 2501, 3100, NA, 97, ".", 2800, "treatment_rep3") |> rename_all(.fun = ~names), + tibble("chr1", 2501, 3400, NA, 98, ".", 3000, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 2601, 3200, NA, 99, ".", 2800, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 3501, 4200, NA, 44, ".", 3800, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 3501, 4400, NA, 95, ".", 3800, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 3601, 4400, NA, 43, ".", 3900, "control_rep3") |> rename_all(.fun = ~names), + tibble("chr1", 4501, 5000, NA, 97, ".", 4800, "treatment_rep3") |> rename_all(.fun = ~names), + tibble("chr1", 4501, 5200, NA, 60, ".", 4700, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 4501, 5200, NA, 59, ".", 5000, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 4501, 5300, NA, 98, ".", 4800, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 4501, 5300, NA, 98, ".", 5100, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 4601, 5100, NA, 93, ".", 4900, "control_rep3") |> rename_all(.fun = ~names), + tibble("chr1", 4601, 5200, NA, 94, ".", 4800, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 4701, 5300, NA, 46, ".", 4900, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 4701, 5300, NA, 45, ".", 5100, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 5601, 6100, NA, 26, ".", 5700, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 5701, 6400, NA, 98, ".", 6200, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 5801, 6300, NA, 30, ".", 6100, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 6701, 7400, NA, 25, ".", 7000, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 6701, 7400, NA, 44, ".", 7000, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 6701, 7400, NA, 43, ".", 7000, "control_rep3") |> rename_all(.fun = ~names), + tibble("chr1", 6701, 7400, NA, 29, ".", 7000, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr1", 6701, 7400, NA, 98, ".", 7000, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("chr1", 6701, 7400, NA, 97, ".", 7000, "treatment_rep3") |> rename_all(.fun = ~names), + tibble("chr10", 101, 800, NA, 95, ".", 400, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr10", 101, 900, NA, 80, ".", 500, "treatment_rep3") |> rename_all(.fun = ~names), + tibble("chr10", 201, 900, NA, 95, ".", 500, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr10", 301, 1000, NA, 75, ".", 600, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr10", 301, 1000, NA, 90, ".", 600, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("chr10", 301, 1000, NA, 90, ".", 600, "control_rep3") |> rename_all(.fun = ~names), + tibble("chr2", 101, 800, NA, 30, ".", 400, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr2", 101, 900, NA, 10, ".", 500, "treatment_rep3") |> rename_all(.fun = ~names), + tibble("chr2", 201, 900, NA, 50, ".", 500, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr2", 301, 1000, NA, 50, ".", 600, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr2", 301, 1000, NA, 10, ".", 600, "control_rep3") |> rename_all(.fun = ~names), + tibble("chr2", 301, 1000, NA, 30, ".", 600, "treatment_rep2") |> rename_all(.fun = ~names), + tibble("Chr2", 101, 800, NA, 80, ".", 700, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr4 2", 301, 1000, NA, 30, ".", 600, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr4 2", 101, 800, NA, 25, ".", 400, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr4 2", 301, 1000, NA, 35, ".", 600, "control_rep3") |> rename_all(.fun = ~names), + tibble("chr4-2", 401, 1100, NA, 20, ".", 600, "control_rep1") |> rename_all(.fun = ~names), + tibble("chr4-2", 201, 900, NA, 30, ".", 500, "treatment_rep1") |> rename_all(.fun = ~names), + tibble("chr4?2", 101, 900, NA, 25, ".", 400, "treatment_rep3") |> rename_all(.fun = ~names), + tibble("chr4|2", 101, 800, NA, 80, ".", 400, "control_rep2") |> rename_all(.fun = ~names), + tibble("chr42", 301, 1000, NA, 90, ".", 600, "treatment_rep2") |> rename_all(.fun = ~names) ) write_tsv(synthetic_data, "data-raw/synthetic_data.bed") diff --git a/data/syn_sample_sheet.rda b/data/syn_sample_sheet.rda index 72f1241..e35d817 100644 Binary files a/data/syn_sample_sheet.rda and b/data/syn_sample_sheet.rda differ diff --git a/inst/CITATION b/inst/CITATION index 491eb80..86fb97a 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -2,14 +2,16 @@ citHeader("To cite peakCombiner in publications use:") citEntry( entry = "Article", - title = , - author = , - journal = , - year = , - volume = , - number = , - pages = , - url = , + title = "peakCombiner: An R package to curate and merge enriched genomic regions into consensus peak sets", + author = personList(as.person("Markus Muckenhuber [aut, cre]"), + as.person("Michael B Stadler [aut]"), + as.person("Kathleen Sprouffske [aut]")), + journal = "X", + year = "X", + volume = "X", + number = "X", + pages = "X", + url = "X", textVersion = paste( ) diff --git a/man/blacklist_hg38.Rd b/man/blacklist_hg38.Rd index 9f54388..8904e1c 100644 --- a/man/blacklist_hg38.Rd +++ b/man/blacklist_hg38.Rd @@ -5,8 +5,7 @@ \alias{blacklist_hg38} \title{Blacklisted regions from ENCODE for human hg38} \format{ -\subsection{\code{blacklist_hg38} A tibble with 910 rows and 3 columns:}{ -} +\code{blacklist_hg38} A tibble with 910 rows and 3 columns: } \source{ Downloaded from ENCODE https://www.encodeproject.org/files/ENCFF356LFX/ diff --git a/man/blacklist_mm10.Rd b/man/blacklist_mm10.Rd index dd225ea..9531d4f 100644 --- a/man/blacklist_mm10.Rd +++ b/man/blacklist_mm10.Rd @@ -5,8 +5,7 @@ \alias{blacklist_mm10} \title{Blacklisted regions from ENCODE for mouse mm10} \format{ -\subsection{\code{blacklist_mm10} A tibble with 164 rows and 3 columns:}{ -} +\code{blacklist_mm10} A tibble with 164 rows and 3 columns: } \source{ Downloaded from ENCODE https://www.encodeproject.org/files/ENCFF547MET/ diff --git a/man/center_expand_regions.Rd b/man/center_expand_regions.Rd index 7ffe841..cc8c54b 100644 --- a/man/center_expand_regions.Rd +++ b/man/center_expand_regions.Rd @@ -112,15 +112,11 @@ asymmetrically. } \examples{ # Load in and prepare a an accepted tibble -sample_sheet <- readr::read_tsv( - paste0(infolder, "/lists/synthetic_sample_sheet.tsv"), - show_col_types = FALSE -) -sample_sheet +utils::data(syn_data_bed) # Prepare input data data_prepared <- prepare_input_regions( - data = sample_sheet, + data = syn_data_bed, show_messages = TRUE ) # Run center and expand diff --git a/man/check_data_structure.Rd b/man/check_data_structure.Rd index e42650b..8566349 100644 --- a/man/check_data_structure.Rd +++ b/man/check_data_structure.Rd @@ -18,7 +18,7 @@ described in full in the Details below. Use as input for functions \code{\link[=combine_regions]{combine_regions()}}. } \description{ -This is a general helper function for the package \link{peakCombiner}. Aim of this +This is a general helper function for the package \pkg{peakCombiner}. Aim of this function is to check a data frame for the correct column names and classes of each column to ensure to be an accepte inpuut for functions: \code{\link[=center_expand_regions]{center_expand_regions()}}, \code{\link[=filter_regions]{filter_regions()}} and diff --git a/man/combine_regions.Rd b/man/combine_regions.Rd index 88a7a78..9bd160d 100644 --- a/man/combine_regions.Rd +++ b/man/combine_regions.Rd @@ -115,16 +115,11 @@ Note, the output data.frame columns \code{sample_name}, \code{name} and \code{sc will be updated. } \examples{ - -# Load in and prepare input data -sample_sheet <- readr::read_tsv( - paste0(infolder, "/lists/synthetic_sample_sheet.tsv"), - show_col_types = FALSE -) -sample_sheet +# Load in and prepare a an accepted tibble +utils::data(syn_data_bed) data_prepared <- prepare_input_regions( - data = sample_sheet, + data = syn_data_bed, show_messages = FALSE ) diff --git a/man/cr_add_summit.Rd b/man/cr_add_summit.Rd index 367642f..c43aec1 100644 --- a/man/cr_add_summit.Rd +++ b/man/cr_add_summit.Rd @@ -20,6 +20,32 @@ columns will be dropped} \item{input}{The original input file from \code{combine_regions} to extract center information} + +\item{combined_center}{Defines how the column 'center' will be +populated for each genomic region in the output +data. Allowed options are +* \code{middle} - the mathematical center of the new region +* \code{strongest} - the 'center' of the input region that has the +the highest 'score' of all overlapping input +regions +* \code{nearest} - the 'center' of the input region that is closest +to mean of the 'center's of all overlapping +input regions (default)} + +\item{annotate_with_input_names}{TRUE / FALSE (default). If TRUE, a new +column named 'input_names' is created +in the output data that is populated for +each combined genomic region with the +'name's of all contributing input regions. +If the column 'input_names' already +exists, it will be overwritten.} + +\item{combined_sample_name}{Optionally defines how the column 'sample_name' +is populated for the output data. +If not used, then the default is to simply +concatenate all input +sample_names into a single comma-separated +string} } \value{ A tibble with the following columns: \code{chrom}, \code{start}, \code{end}, \code{name}, @@ -28,7 +54,7 @@ A tibble with the following columns: \code{chrom}, \code{start}, \code{end}, \co \description{ Helper function for main function \link{combine_regions}. Requires in memory data frame in the standard accepted format for the -\link{peakCombiner}. +peakCombiner package. For details see the details for \link{combine_regions}. } \details{ diff --git a/man/cr_disjoin_filter.Rd b/man/cr_disjoin_filter.Rd index aa58fb5..908ca07 100644 --- a/man/cr_disjoin_filter.Rd +++ b/man/cr_disjoin_filter.Rd @@ -27,7 +27,7 @@ A tibble with the following columns: \code{chrom}, \code{start}, \code{end}, \description{ Helper function for main function \link{combine_regions}. Requires in memory data frame in the standard accepted format for the -\link{peakCombiner}. +peakCombiner package. For details see the details for \link{combine_regions}. } \details{ diff --git a/man/cr_overlap_with_summits.Rd b/man/cr_overlap_with_summits.Rd index a74e80c..efe54ed 100644 --- a/man/cr_overlap_with_summits.Rd +++ b/man/cr_overlap_with_summits.Rd @@ -22,7 +22,7 @@ A tibble with the following columns: \code{chrom}, \code{start}, \code{end}, \description{ Helper function for main function \link{combine_regions}. Requires in memory data frame in the standard accepted format for the -\link{peakCombiner}. +peakCombiner package. For details see the details for \link{combine_regions}. } \details{ diff --git a/man/cr_reduce.Rd b/man/cr_reduce.Rd index 856c735..029a555 100644 --- a/man/cr_reduce.Rd +++ b/man/cr_reduce.Rd @@ -19,7 +19,7 @@ A tibble with the following columns: \code{chrom}, \code{start}, \code{end}, \description{ Helper function for main function \link{combine_regions}. Requires in memory data frame in the standard accepted format for the -\link{peakCombiner}. +peakCombiner package. For details see the details for \link{combine_regions}. } \details{ diff --git a/man/define_expansion.Rd b/man/define_expansion.Rd index 860015e..18e8543 100644 --- a/man/define_expansion.Rd +++ b/man/define_expansion.Rd @@ -41,9 +41,3 @@ function. 'NULL' allows for data-driven definition of the \code{expand_by} value It calculates the median genomic region size of the input data and uses this value like a length 1 numeric vector for expansion. } -\examples{ -expansion_value <- define_expansion( - data = data, - expand_by = expand_by -) -} diff --git a/man/filter_regions.Rd b/man/filter_regions.Rd index 05e388a..c6fa0dc 100644 --- a/man/filter_regions.Rd +++ b/man/filter_regions.Rd @@ -144,15 +144,11 @@ skipped (optional). } \examples{ -# Load in and prepare the input data -sample_sheet <- readr::read_tsv( - paste0(infolder, "/lists/synthetic_sample_sheet.tsv"), - show_col_types = FALSE -) -sample_sheet +# Load in and prepare a an accepted tibble +utils::data(syn_data_bed) data_prepared <- prepare_input_regions( - data = sample_sheet, + data = syn_data_bed, show_messages = TRUE ) diff --git a/man/prepare_input_regions.Rd b/man/prepare_input_regions.Rd index b12a5aa..1087c6b 100644 --- a/man/prepare_input_regions.Rd +++ b/man/prepare_input_regions.Rd @@ -122,44 +122,23 @@ enriched (based on the values in the column \code{score}). This step is mantory to quaranty an optimal result. } \examples{ -infolder <- here::here() - -# Load a tibble with path to region files and required meta information -sample_sheet <- readr::read_tsv( - paste0(infolder, "/lists/synthetic_sample_sheet.tsv"), - show_col_types = FALSE -) -sample_sheet +# Load in and prepare a an accepted tibble +utils::data(syn_data_tibble) data_prepared <- prepare_input_regions( - data = sample_sheet, + data = syn_data_tibble, show_messages = TRUE ) data_prepared # Or a pre-loaded tibble with genomic regions and named columns. -control <- readr::read_tsv( - paste0(infolder, "/lists/synthetic_data_C1.bed"), - show_col_types = FALSE, - col_names = c( - "chrom", "start", "end", "name", - "strand", "score", "summit" - ) -) - -treatment <- readr::read_tsv( - paste0(infolder, "/lists/synthetic_data_T1.bed"), - show_col_types = FALSE, - col_names = c( - "chrom", "start", "end", "name", - "strand", "score", "summit" - ) -) +utils::data(syn_data_control01) +utils::data(syn_data_treatment01) -combined_input <- control |> +combined_input <- syn_data_control01 |> dplyr::mutate(sample_name = "control-rep1") |> - rbind(treatment |> + rbind(syn_data_treatment01 |> dplyr::mutate(sample_name = "treatment-rep1")) prepare_input_regions( diff --git a/man/syn_blacklist.Rd b/man/syn_blacklist.Rd index 499883b..3d0b846 100644 --- a/man/syn_blacklist.Rd +++ b/man/syn_blacklist.Rd @@ -5,8 +5,7 @@ \alias{syn_blacklist} \title{Synthetic file with blacklisted regions for peakCombiner} \format{ -\subsection{\code{syn_blacklist} A tibble with 2 rows and 3 columns:}{ -} +\code{syn_blacklist} A tibble with 2 rows and 3 columns: } \source{ Created for R package peakCombiner. diff --git a/man/syn_control_rep1_narrowPeak.Rd b/man/syn_control_rep1_narrowPeak.Rd index 03b9c14..cf50ce5 100644 --- a/man/syn_control_rep1_narrowPeak.Rd +++ b/man/syn_control_rep1_narrowPeak.Rd @@ -5,8 +5,7 @@ \alias{syn_control_rep1_narrowPeak} \title{Synthetic data set for control rep 1 sample in narrowPeak file format} \format{ -\subsection{\code{syn_control_rep1_narrowPeak} A tibble with 11 rows and 6 columns:}{ -} +\code{syn_control_rep1_narrowPeak} A tibble with 11 rows and 6 columns: } \source{ Created for R package peakCombiner. diff --git a/man/syn_data_bed.Rd b/man/syn_data_bed.Rd index 663f99f..6ef372f 100644 --- a/man/syn_data_bed.Rd +++ b/man/syn_data_bed.Rd @@ -5,8 +5,7 @@ \alias{syn_data_bed} \title{Synthetic data set of genomic coordinates and meta data columns} \format{ -\subsection{\code{syn_data_bed} A tibble with 55 rows and 4 columns:}{ -} +\code{syn_data_bed} A tibble with 55 rows and 4 columns: } \source{ Created for R package peakCombiner. diff --git a/man/syn_data_control01.Rd b/man/syn_data_control01.Rd index 671323a..6961059 100644 --- a/man/syn_data_control01.Rd +++ b/man/syn_data_control01.Rd @@ -6,8 +6,7 @@ \title{Synthetic data set of genomic coordinates and meta data columns filtered for control rep 1 sample} \format{ -\subsection{\code{syn_data_control01} A tibble with 11 rows and 6 columns:}{ -} +\code{syn_data_control01} A tibble with 11 rows and 6 columns: } \source{ Created for R package peakCombiner. diff --git a/man/syn_data_granges.Rd b/man/syn_data_granges.Rd index ba38c67..ee9c3e7 100644 --- a/man/syn_data_granges.Rd +++ b/man/syn_data_granges.Rd @@ -5,8 +5,7 @@ \alias{syn_data_granges} \title{Synthetic data set of genomic coordinates and meta data columns} \format{ -\subsection{\code{syn_data_granges} A data frame with 55 rows and 8 columns:}{ -} +\code{syn_data_granges} A data frame with 55 rows and 8 columns: } \source{ Created for R package peakCombiner. diff --git a/man/syn_data_tibble.Rd b/man/syn_data_tibble.Rd index 8acaacd..8605b9f 100644 --- a/man/syn_data_tibble.Rd +++ b/man/syn_data_tibble.Rd @@ -5,8 +5,7 @@ \alias{syn_data_tibble} \title{Synthetic data set of genomic coordinates and meta data columns as tibble} \format{ -\subsection{\code{syn_data_tibble} A tibble with 55 rows and 8 columns:}{ -} +\code{syn_data_tibble} A tibble with 55 rows and 8 columns: } \source{ Created for R package peakCombiner. diff --git a/man/syn_data_treatment01.Rd b/man/syn_data_treatment01.Rd index 39ebcb3..32ea321 100644 --- a/man/syn_data_treatment01.Rd +++ b/man/syn_data_treatment01.Rd @@ -6,8 +6,7 @@ \title{Synthetic data set of genomic coordinates and meta data columns filtered for treatment rep 1 sample} \format{ -\subsection{\code{syn_data_treatment01} A tibble with 10 rows and 6 columns:}{ -} +\code{syn_data_treatment01} A tibble with 10 rows and 6 columns: } \source{ Created for R package peakCombiner. diff --git a/man/syn_sample_sheet.Rd b/man/syn_sample_sheet.Rd index caba46a..cfe4397 100644 --- a/man/syn_sample_sheet.Rd +++ b/man/syn_sample_sheet.Rd @@ -5,8 +5,7 @@ \alias{syn_sample_sheet} \title{Synthetic sample sheet to load example data with peakCombiner} \format{ -\subsection{\code{syn_sample_sheet} A tibble with 6 rows and 4 columns:}{ -} +\code{syn_sample_sheet} A tibble with 6 rows and 4 columns. } \source{ Created for R package peakCombiner. diff --git a/man/syn_treatment_rep1_narrowPeak.Rd b/man/syn_treatment_rep1_narrowPeak.Rd index 6990cfc..3bd3ffd 100644 --- a/man/syn_treatment_rep1_narrowPeak.Rd +++ b/man/syn_treatment_rep1_narrowPeak.Rd @@ -5,8 +5,7 @@ \alias{syn_treatment_rep1_narrowPeak} \title{Synthetic data set for treatment rep 1 sample in narrowPeak file format} \format{ -\subsection{\code{syn_treatment_rep1_narrowPeak} A tibble with 11 rows and 6 columns:}{ -} +\code{syn_treatment_rep1_narrowPeak} A tibble with 11 rows and 6 columns: } \source{ Created for R package peakCombiner. diff --git a/tests/testthat.R b/tests/testthat.R index c6500ee..892a8b0 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -3,10 +3,10 @@ # # Where should you do additional test configuration? # Learn more about the roles of various files in: -# * https://r-pkgs.org/tests.html -# * https://testthat.r-lib.org/reference/test_package.html#special-files +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html library(testthat) -library(combpeaksr) +library(peakCombiner) -test_check("combpeaksr") +test_check("peakCombiner") diff --git a/tests/testthat/test-center_expand_regions.R b/tests/testthat/test-center_expand_regions.R index aef5ace..41655b4 100644 --- a/tests/testthat/test-center_expand_regions.R +++ b/tests/testthat/test-center_expand_regions.R @@ -1,17 +1,9 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## input_colnames_pre <- c( "chrom", "start", "end", "name", "score", "strand", @@ -19,7 +11,7 @@ input_colnames_pre <- c( ) input_colnames_post <- c( "chrom", "start", "end", "name", "score", "strand", - "center", "sample_name", "center_origin", "input_names" + "center", "sample_name", "input_names" ) output_colnames_pre <- c( "chrom", "start", "end", "name", "score", "strand", @@ -27,57 +19,61 @@ output_colnames_pre <- c( ) output_colnames_post <- c( "chrom", "start", "end", "name", "score", "strand", - "center", "sample_name", "center_origin", "input_names" + "center", "sample_name", "input_names" ) ## -test_data <- readr::read_tsv("lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_bed, package = "peakCombiner") +test_data <- syn_data_bed ## -test_data_prepared <- prepare_input_regions( +test_data_prepared <- peakCombiner::prepare_input_regions( data = test_data - ) +) ## -test_data_center_expand <- center_expand_regions( +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "column_value", + center_by = "center_column", expand_by = NULL ) restult_colnames <- colnames(test_data_center_expand) ## -test_data_filtered <- filter_regions( +test_data_filtered <- peakCombiner::filter_regions( data = test_data_center_expand, - filter_by_blacklist = "hg38", # "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL -) |> suppressWarnings() + exclude_by_blacklist = "hg38", # "hg38", + include_by_chromosome_name = NULL, + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL, + show_messages = TRUE +) ## -test_data_combined <- combine_regions( +test_data_combined <- peakCombiner::combine_regions( data = test_data_filtered, found_in_samples = 2, - center = "nearest" + combined_center = "nearest", + annotate_with_input_names = TRUE, + combined_sample_name = "combined" ) ## -test_data_combined_ce <- center_expand_regions( +test_data_combined_ce <- peakCombiner::center_expand_regions( data = test_data_combined, - center_by = "column_value", + center_by = "center_column", expand_by = NULL ) ### -----------------------------------------------------------------------### ### Test input ### -----------------------------------------------------------------------### -test_that("Test if function works with pre-combined input", { - expect_no_error(center_expand_regions( +testthat::test_that("Test if function works with pre-combined input", { + testthat::expect_no_error(peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "column_value", + center_by = "center_column", expand_by = NULL )) }) -test_that("Test if function works with post-combined input", { - expect_no_error(center_expand_regions( +testthat::test_that("Test if function works with post-combined input", { + testthat::expect_no_error(peakCombiner::center_expand_regions( data = test_data_combined, - center_by = "column_value", + center_by = "center_column", expand_by = NULL )) }) @@ -85,7 +81,6 @@ test_that("Test if function works with post-combined input", { ### -----------------------------------------------------------------------### test_that("Required input data has the expected structure", { - data <- test_data_prepared expect_equal(length(names(data)), 8) @@ -98,14 +93,13 @@ test_that("Required input data has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) test_that("Required input data has the expected structure", { - data <- test_data_combined - expect_equal(length(names(data)), 10) + expect_equal(length(names(data)), 9) expect_identical(names(data), input_colnames_post) expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) @@ -114,34 +108,33 @@ test_that("Required input data has the expected structure", { expect_true(is.numeric(data$score)) expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) - expect_true(is.character(data$center_origin)) - expect_true(sum(str_detect(data$input_names, "|")) > 0) + expect_true(sum(stringr::str_detect(data$input_names, "|")) > 0) }) ### -----------------------------------------------------------------------### test_that("Required paramter 'center_by' has the expected structure/value", { - expect_no_error(center_expand_regions( + expect_no_error(peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "coluMn_value", + center_by = "center_Column", expand_by = NULL )) - expect_error(center_expand_regions( + expect_error(peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = c("column_value", "calculated_value"), + center_by = c("center_column", "calculated_value"), expand_by = NULL ), "center_by") - expect_error(center_expand_regions( + expect_error(peakCombiner::center_expand_regions( data = test_data_prepared, center_by = "nonexisting", expand_by = NULL ), "center_by") - expect_error(center_expand_regions( + expect_error(peakCombiner::center_expand_regions( data = test_data_prepared, center_by = NULL, expand_by = NULL ), "center_by") - expect_error(center_expand_regions( + expect_error(peakCombiner::center_expand_regions( data = test_data_prepared, center_by = NA, expand_by = NULL @@ -150,23 +143,23 @@ test_that("Required paramter 'center_by' has the expected structure/value", { ### -----------------------------------------------------------------------### -test_that("Required paramter expand_by has the expected structure/value", { - expect_no_error(center_expand_regions( +testthat::test_that("Required paramter expand_by has the expected structure/value", { + testthat::expect_no_error(peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "column_value", + center_by = "center_column", expand_by = NULL )) - expect_error(center_expand_regions( + testthat::expect_error(peakCombiner::center_expand_regions( data = test_data_prepared, center_by = "column_value", expand_by = NA ), ) - expect_error(center_expand_regions( + testthat::expect_error(peakCombiner::center_expand_regions( data = test_data_prepared, center_by = "column_value", expand_by = c(1, 2, 3) ), ) - expect_error(center_expand_regions( + testthat::expect_error(peakCombiner::center_expand_regions( data = test_data_prepared, center_by = "column_value", expand_by = "nonexisting" @@ -178,14 +171,13 @@ test_that("Required paramter expand_by has the expected structure/value", { ### -----------------------------------------------------------------------### test_that("Output data frame is correct for pre-combined", { - data <- test_data_center_expand - + expect_setequal(colnames(data), output_colnames_pre) expect_equal(ncol(data), 8) - + expect_identical(class(data)[2], "tbl") - + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) @@ -194,21 +186,20 @@ test_that("Output data frame is correct for pre-combined", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - - expect_equal(mean(data$center), 1942.2535) - expect_identical(nrow(data), as.integer(71)) - expect_identical(data$start[1], 250) + + expect_equal(mean(data$center), 2495.6827) + expect_identical(nrow(data), as.integer(52)) + expect_identical(data$start[1], 100.5) }) test_that("Output data frame is correct for post-combined", { - data <- test_data_combined_ce - + expect_setequal(colnames(data), output_colnames_post) - expect_equal(ncol(data), 10) - + expect_equal(ncol(data), 9) + expect_identical(class(data)[2], "tbl") - + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) @@ -216,11 +207,9 @@ test_that("Output data frame is correct for post-combined", { expect_true(is.numeric(data$score)) expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) - expect_true(is.character(data$center_origin)) - - expect_equal(mean(data$center), 2192.3077) - expect_identical(nrow(data), as.integer(13)) - expect_identical(data$start[1], 100) + expect_equal(mean(data$center), 2770.45) + expect_identical(nrow(data), as.integer(10)) + expect_identical(data$start[1], 200) expect_identical(data$end[1], 900) expect_identical(data$end[1], 900) }) @@ -228,95 +217,74 @@ test_that("Output data frame is correct for post-combined", { test_that("Output data frame is correct for data_prepared", { ## data <- test_data_prepared - result <- center_expand_regions( + result <- peakCombiner::center_expand_regions( data = data, - center_by = "column_value", + center_by = "center_column", expand_by = NULL ) ## - expect_no_error(center_expand_regions( + expect_no_error(peakCombiner::center_expand_regions( data = data, - center_by = "column_value", + center_by = "center_column", expand_by = NULL )) ## - expect_identical(nrow(result), 71L) - expect_identical(result$start[9], as.numeric(250)) + expect_identical(nrow(result), 52L) }) ## test_that("Output data frame is correct for data_center_expand", { ## data <- test_data_center_expand - result <- center_expand_regions( + result <- peakCombiner::center_expand_regions( data = data, - center_by = "column_value", + center_by = "center_column", expand_by = NULL ) ## - expect_no_error(center_expand_regions( + expect_no_error(peakCombiner::center_expand_regions( data = data, - center_by = "column_value", + center_by = "center_column", expand_by = NULL )) ## - expect_identical(nrow(result), 71L) - expect_identical(result$start[9], as.numeric(250)) + expect_identical(nrow(result), 52L) }) ## test_that("Output data frame is correct for data_filtered", { ## data <- test_data_filtered - result <- center_expand_regions( + result <- peakCombiner::center_expand_regions( data = data, - center_by = "column_value", + center_by = "center_column", expand_by = NULL ) ## - expect_no_error(center_expand_regions( + expect_no_error(peakCombiner::center_expand_regions( data = data, - center_by = "column_value", + center_by = "center_column", expand_by = NULL )) ## - expect_identical(nrow(result), 71L) - expect_identical(result$start[9], as.numeric(250)) + expect_identical(nrow(result), 52L) }) ## test_that("Output data frame is correct for data_combined", { ## data <- test_data_combined - result <- center_expand_regions( + result <- peakCombiner::center_expand_regions( data = data, - center_by = "column_value", + center_by = "center_column", expand_by = NULL ) ## - expect_no_error(center_expand_regions( + expect_no_error(peakCombiner::center_expand_regions( data = data, - center_by = "column_value", + center_by = "center_column", expand_by = NULL )) ## - expect_identical(nrow(result), 13L) - expect_identical(result$start[9], as.numeric(100)) + expect_identical(nrow(result), 10L) }) ## ### -----------------------------------------------------------------------### ## -test_that("All newly center and expand regions do have only positive values", { - ## - data_input <- readr::read_tsv(paste0("lists/input_data-bed3.bed")) - prep_data <- prepare_input_regions(data = data_input,score_colname = NULL) - ## - expect_no_error(center_expand_regions( - data = prep_data, - center_by = "column_value", - expand_by = NULL - )) - ## - expect_message(center_expand_regions( - data = prep_data, - center_by = "column_value", - expand_by = 500 - )) -}) diff --git a/tests/testthat/test-collapse_summits.R b/tests/testthat/test-collapse_summits.R deleted file mode 100644 index 7412654..0000000 --- a/tests/testthat/test-collapse_summits.R +++ /dev/null @@ -1,68 +0,0 @@ -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) -## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() -## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### -## -required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", - "center", "sample_name" -) -## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) -input_colnames <- colnames(test_data) -## -test_data_prepared_filtered <- collapse_summits(data_prepared = test_data) -restult_colnames <- colnames(test_data_prepared_filtered) -## -### -----------------------------------------------------------------------### -### Test input -### -----------------------------------------------------------------------### -## -test_that("Test if function works with correct input", { - expect_no_error(collapse_summits(data_prepared = test_data)) -}) -## -### -----------------------------------------------------------------------### -## -test_that("Input data has the right number of columns", { - expect_equal(length(input_colnames), 8) -}) -## -test_that("Column names of input data are identical with required once.", { - expect_identical(names(test_data), required_colnames) -}) -## -### -----------------------------------------------------------------------### -### Test output -### -----------------------------------------------------------------------### -## -test_that("Column names of output data are identical with required once.", { - expect_setequal(colnames(test_data_prepared_filtered), required_colnames) -}) -## -test_that("Output data has the right number of columns", { - expect_equal(ncol(test_data_prepared_filtered), 8) -}) -## -test_that("Output data has the right class.", { - expect_identical(class(test_data_prepared_filtered)[2], "tbl") -}) -## -test_that("Output data has the correct mean value for the column 'center'.", { - expect_equal(mean(test_data_prepared_filtered$center), 1942.2535) -}) -## -test_that("Output data has the correct number of rows.", { - expect_identical(nrow(test_data_prepared_filtered), as.integer(71)) -}) -## -### -----------------------------------------------------------------------### \ No newline at end of file diff --git a/tests/testthat/test-combine_regions.R b/tests/testthat/test-combine_regions.R index 817f353..be23356 100644 --- a/tests/testthat/test-combine_regions.R +++ b/tests/testthat/test-combine_regions.R @@ -1,18 +1,10 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() - -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### - +## +set.seed(1234) +## input_colnames <- c( "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" @@ -23,21 +15,24 @@ output_colnames <- c( "center", "sample_name", "center_origin", "input_names" ) -test_data <- readr::read_tsv(paste0("lists/synthetic_genomic_regions.bed"), show_col_types = FALSE) +#' Prepare test data set +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble +test_data -test_data_prepared <- prepare_input_regions( +test_data_prepared <- peakCombiner:::prepare_input_regions( data = test_data, show_messages = TRUE ) -test_data_center_expand <- center_expand_regions( +test_data_center_expand <- peakCombiner:::center_expand_regions( data = test_data_prepared, - center_by = "column_value", + center_by = "center_column", expand_by = NULL, show_messages = TRUE ) -test_data_filtered <- filter_regions( +test_data_filtered <- peakCombiner:::filter_regions( data = test_data_center_expand, include_by_chromosome_name = NULL, exclude_by_blacklist = "hg38", # "hg38", @@ -46,7 +41,7 @@ test_data_filtered <- filter_regions( show_messages = TRUE ) -test_data_combined <- combine_regions( +test_data_combined <- peakCombiner:::combine_regions( data = test_data_filtered, combined_center = "nearest", annotate_with_input_names = FALSE, @@ -58,197 +53,244 @@ test_data_combined <- combine_regions( ### Test arguments ### -----------------------------------------------------------------------### -test_that("Input data frame has be data frame or tibble", { - expect_error(combine_regions(data = c(1,2,3,4,5), - show_messages = FALSE)) +testthat::test_that("Input data frame has be data frame or tibble", { + testthat::expect_error(peakCombiner:::combine_regions( + data = c(1, 2, 3, 4, 5), + show_messages = FALSE + )) }) -test_that("Input data frame has be data frame or tibble", { - expect_error(combine_regions(data = NULL, - show_messages = FALSE)) +testthat::test_that("Input data frame has be data frame or tibble", { + testthat::expect_error(peakCombiner:::combine_regions( + data = NULL, + show_messages = FALSE + )) }) ### -----------------------------------------------------------------------### -test_that("Argument 'combined_center' creates error if NULL", { - expect_error(combine_regions(data = test_data_filtered, - combined_center = NULL, - show_messages = FALSE)) +testthat::test_that("Argument 'combined_center' creates error if NULL", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_center = NULL, + show_messages = FALSE + )) }) -test_that("Argument 'combined_center' creates error if NA", { - expect_error(combine_regions(data = test_data_filtered, - combined_center = NA, - show_messages = FALSE)) +testthat::test_that("Argument 'combined_center' creates error if NA", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_center = NA, + show_messages = FALSE + )) }) -test_that("Argument 'combined_center' creates error if numeric value", { - expect_error(combine_regions(data = test_data_filtered, - combined_center = 1, - show_messages = FALSE)) +testthat::test_that("Argument 'combined_center' creates error if numeric + value", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_center = 1, + show_messages = FALSE + )) }) -test_that("Argument 'combined_center' tolerates capitilization", { - expect_no_error(combine_regions(data = test_data_filtered, - combined_center = "Nearest", - show_messages = FALSE)) +testthat::test_that("Argument 'combined_center' tolerates capitilization", { + testthat::expect_no_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_center = "Nearest", + show_messages = FALSE + )) }) -test_that("Argument 'combined_center' creates error if not allowes value", { - expect_error(combine_regions(data = test_data_filtered, - combined_center = "Shortest", - show_messages = FALSE)) +testthat::test_that("Argument 'combined_center' creates error if not allowes + value", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_center = "Shortest", + show_messages = FALSE + )) }) ### -----------------------------------------------------------------------### -test_that("Argument 'annotate_with_input_names' creates no error if allowed - value", { - expect_error(combine_regions(data = test_data_filtered, - annotate_with_input_names = TRUE, - show_messages = FALSE)) - expect_error(combine_regions(data = test_data_filtered, - annotate_with_input_names = FALSE, - show_messages = FALSE)) - }) - -test_that("Argument 'annotate_with_input_names' creates error if not allowes - value", { - expect_error(combine_regions(data = test_data_filtered, - annotate_with_input_names = FALSe, - show_messages = FALSE)) +testthat::test_that("Argument 'annotate_with_input_names' creates no error if + allowed value", { + testthat::expect_no_error(peakCombiner:::combine_regions( + data = test_data_filtered, + annotate_with_input_names = TRUE, + show_messages = FALSE + )) + testthat::expect_no_error(peakCombiner:::combine_regions( + data = test_data_filtered, + annotate_with_input_names = FALSE, + show_messages = FALSE + )) +}) + +testthat::test_that("Argument 'annotate_with_input_names' creates error if not + allowes value", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + annotate_with_input_names = FALSe, + show_messages = FALSE + )) + + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + annotate_with_input_names = 10, + show_messages = FALSE + )) +}) + +testthat::test_that("Argument 'annotate_with_input_names' creates error if not + allowes value 'NA'", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + annotate_with_input_names = NA, + show_messages = FALSE + )) +}) + +testthat::test_that("Argument 'annotate_with_input_names' creates error if not + allowes value 'NULL'", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + annotate_with_input_names = NULL, + show_messages = FALSE + )) +}) + +testthat::test_that("Argument 'annotate_with_input_names' creates error if + length is greater then 1.", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + annotate_with_input_names = c(1, 2), + show_messages = FALSE + )) }) -test_that("Argument 'annotate_with_input_names' creates error if not allowes - value 'NA'", { - expect_error(combine_regions(data = test_data_filtered, - annotate_with_input_names = NA, - show_messages = FALSE)) - }) - -test_that("Argument 'annotate_with_input_names' creates error if not allowes - value 'NULL'", { - expect_error(combine_regions(data = test_data_filtered, - annotate_with_input_names = NULL, - show_messages = FALSE)) - }) - -test_that("Argument 'annotate_with_input_names' creates error if length is - greater then 1.", { - expect_error(combine_regions(data = test_data_filtered, - annotate_with_input_names = c(1,2), - show_messages = FALSE)) - }) - -test_that("Argument 'annotate_with_input_names' creates error if not allowed -logical value with length 2 is provided.", { - expect_error(combine_regions(data = test_data_filtered, - annotate_with_input_names = c(NA,TRUE), - show_messages = FALSE)) - }) +testthat::test_that("Argument 'annotate_with_input_names' creates error if not + allowed logical value with length 2 is provided.", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + annotate_with_input_names = c(NA, TRUE), + show_messages = FALSE + )) +}) ### -----------------------------------------------------------------------### -test_that("Argument 'combined_sample_name' creates no error if 'NULL' +testthat::test_that("Argument 'combined_sample_name' creates no error if 'NULL' value is provided.", { - expect_no_error(combine_regions(data = test_data_filtered, - combined_sample_name = NULL, - show_messages = FALSE)) - }) + testthat::expect_no_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_sample_name = NULL, + show_messages = FALSE + )) +}) -test_that("Argument 'combined_sample_name' creates no error if single character - value is provided.", { - expect_no_error(combine_regions(data = test_data_filtered, - combined_sample_name = "Consensus", - show_messages = FALSE)) +testthat::test_that("Argument 'combined_sample_name' creates no error if single + character value is provided.", { + testthat::expect_no_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_sample_name = "Consensus", + show_messages = FALSE + )) }) -test_that("Argument 'combined_sample_name' creates error if single numeric - value is provided.", { - expect_error(combine_regions(data = test_data_filtered, - combined_sample_name = 1, - show_messages = FALSE)) - }) - -test_that("Argument 'combined_sample_name' creates error if vector with two - entries is provided.", { - expect_error(combine_regions(data = test_data_filtered, - combined_sample_name = c("Consensus","Two"), - show_messages = FALSE)) - }) - -test_that("Argument 'combined_sample_name' creates error if 'NA' is +testthat::test_that("Argument 'combined_sample_name' creates error if single + numeric value is provided.", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_sample_name = 1, + show_messages = FALSE + )) +}) + +testthat::test_that("Argument 'combined_sample_name' creates error if vector + with two entries is provided.", { + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_sample_name = c("Consensus", "Two"), + show_messages = FALSE + )) +}) + +testthat::test_that("Argument 'combined_sample_name' creates error if 'NA' is provided.", { - expect_error(combine_regions(data = test_data_filtered, - combined_sample_name = NA, - show_messages = FALSE)) - }) + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + combined_sample_name = NA, + show_messages = FALSE + )) +}) ### -----------------------------------------------------------------------### -test_that("Argument 'show_messages' creates no error if TRUE or FALSE +testthat::test_that("Argument 'show_messages' creates no error if TRUE or FALSE value is provided.", { - expect_no_error(combine_regions(data = test_data_filtered, - show_messages = FALSE)) - expect_no_error(combine_regions(data = test_data_filtered, - show_messages = TRUE)) - }) + testthat::expect_no_error(peakCombiner:::combine_regions( + data = test_data_filtered, + show_messages = FALSE + )) + testthat::expect_no_error(peakCombiner:::combine_regions( + data = test_data_filtered, + show_messages = TRUE + )) +}) -test_that("Argument 'show_messages' creates no error if non accepted +testthat::test_that("Argument 'show_messages' creates no error if non accepted value is provided.", { - expect_error(combine_regions(data = test_data_filtered, - show_messages = FaLSE)) - }) + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + show_messages = FaLSE + )) +}) -test_that("Argument 'show_messages' creates no error if non accepted +testthat::test_that("Argument 'show_messages' creates no error if non accepted value 'NA' is provided.", { - expect_error(combine_regions(data = test_data_filtered, - show_messages = NA)) - }) + testthat::expect_error(peakCombiner:::combine_regions( + data = test_data_filtered, + show_messages = NA + )) +}) ### -----------------------------------------------------------------------### ### Test input ### -----------------------------------------------------------------------### -test_that("Input data frame has the expected structure", { +testthat::test_that("Input data frame has the expected structure", { data <- test_data_filtered - expect_equal(length(names(data)), 8) - expect_identical(names(data), input_colnames) - expect_true(is.character(data$chrom)) - expect_true(is.numeric(data$start)) - expect_true(is.numeric(data$end)) - expect_true(is.character(data$name)) - expect_true(is.numeric(data$score)) - expect_true(is.character(data$strand)) - expect_true(is.numeric(data$center)) - expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + testthat::expect_equal(length(names(data)), 8) + testthat::expect_identical(names(data), input_colnames) + testthat::expect_true(is.character(data$chrom)) + testthat::expect_true(is.numeric(data$start)) + testthat::expect_true(is.numeric(data$end)) + testthat::expect_true(is.character(data$name)) + testthat::expect_true(is.numeric(data$score)) + testthat::expect_true(is.character(data$strand)) + testthat::expect_true(is.numeric(data$center)) + testthat::expect_true(is.character(data$sample_name)) + testthat::expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ### -----------------------------------------------------------------------### ### Test Output ### -----------------------------------------------------------------------### -test_that("Output data has the correct classes and structure", { - expect_no_error(check_data_structure(test_data_combined)) +testthat::test_that("Output data has the correct classes and structure", { + testthat::expect_no_error(peakCombiner:::check_data_structure(test_data_combined)) }) -test_that("Output data frame has correct colnames", { - expect_true(any(colnames(data) %in% output_colnames)) +testthat::test_that("Output data frame has correct colnames", { + testthat::expect_true(any(colnames(test_data_combined) %in% output_colnames)) }) -test_that("Output data frame has correct class", { - expect_identical(class(data)[2], "tbl") -}) - -test_that("Output data frame is expected values", { - expect_identical(data$center[1], 500) - expect_identical(sum(data$score), 1140) -}) ### -----------------------------------------------------------------------### -test_that("Output data results has correct summit for 'nearest' peak", { - data <- combine_regions( +testthat::test_that("Output data results has correct summit for 'nearest' + peak", { + data <- peakCombiner:::combine_regions( data = test_data_filtered, found_in_samples = 2, combined_center = "nearest", @@ -256,13 +298,13 @@ test_that("Output data results has correct summit for 'nearest' peak", { combined_sample_name = "consensus_peak", show_messages = FALSE ) - - expect_identical(data$center[7], 500) - expect_identical(data$name[7], "consensus_peak|7") -}) + + testthat::expect_identical(round(data$center[7],0), 500) + testthat::expect_identical(data$name[7], "consensus_peak|7") +}) test_that("Output data results has correct summit for 'strongst' peak", { - data <- combine_regions( + data <- peakCombiner:::combine_regions( data = test_data_filtered, found_in_samples = 2, combined_center = "strongest", @@ -270,24 +312,23 @@ test_that("Output data results has correct summit for 'strongst' peak", { combined_sample_name = "consensus_peak", show_messages = FALSE ) - - expect_identical(data$center[7], 600) + + expect_identical(round(data$center[7],0), 600) expect_identical(data$name[7], "consensus_peak|7") }) -test_that("Output data results has correct summit for 'middle' peak", { - - data <- combine_regions( - data = test_data_filtered, +testthat::test_that("Output data results has correct summit for 'middle' + peak", { + data <- peakCombiner:::combine_regions( + data = test_data_filtered, found_in_samples = 2, combined_center = "middle", annotate_with_input_names = FALSE, combined_sample_name = "consensus_peak", show_messages = FALSE ) - - expect_identical(data$center[7], 550) - expect_identical(data$name[7], "consensus_peak|7") - + + testthat::expect_identical(data$center[7], 550) + testthat::expect_identical(data$name[7], "consensus_peak|7") }) ### -----------------------------------------------------------------------### diff --git a/tests/testthat/test-cr_add_summit.R b/tests/testthat/test-cr_add_summit.R index 9293ab7..40f5da7 100644 --- a/tests/testthat/test-cr_add_summit.R +++ b/tests/testthat/test-cr_add_summit.R @@ -1,58 +1,57 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## -input_colnames <- c("chr", "start", "end", "width", "strand", "input_names") +input_colnames <- c( + "chrom", "start", "end", "width", "strand", "name", "center", "score" +) ## required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) output_colnames <- c( - "chr", "start", "end", "name", "score", "strand", "center", - "center_origin", "input_names" + "chrom", "start", "end", "strand", "name", "score", "center", + "sample_name", "input_names" ) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble ## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( + +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) -test_data_filtered <- filter_regions( + +test_data_filtered <- peakCombiner::filter_regions( data = test_data_center_expand, - filter_by_blacklist = "hg38", # "hg38", - filter_by_chromosome_names = c("chr1", "chr10", "chr2", "chr42"), - filter_by_significance = NULL, - filter_by_top_enriched = NULL -) |> suppressWarnings() -test_data_disjoin_filter <- cr_disjoin_filter(data = test_data_filtered) -test_data_reduce <- cr_reduce(data = test_data_disjoin_filter) -test_data_overlap <- cr_overlap_with_summits( + exclude_by_blacklist = "hg38", + include_by_chromosome_name = c("chr1", "chr10", "chr2", "chr42"), + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL +) + +test_data_disjoin_filter <- peakCombiner:::cr_disjoin_filter(data = test_data_filtered, found_in_samples = 2) +test_data_reduce <- peakCombiner:::cr_reduce(data = test_data_disjoin_filter) +test_data_overlap <- peakCombiner:::cr_overlap_with_summits( data = test_data_reduce, input = test_data_filtered ) ## -test_data_combined_with_summit <- cr_add_summit( +test_data_combined_with_summit <- peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = "nearest" + combined_center = "nearest", + annotate_with_input_names = TRUE, + combined_sample_name = "combined" ) ## ### -----------------------------------------------------------------------### @@ -63,14 +62,13 @@ test_that("Input data frame has the expected structure", { ## data <- test_data_overlap ## - expect_equal(length(colnames(data)), 6) + expect_equal(length(colnames(data)), 8) expect_identical(names(data), input_colnames) - expect_true(is.character(data$chr)) + expect_true(is.factor(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) - expect_true(is.character(data$input_names)) - expect_true(sum(str_detect(data$input_names, "|")) > 0) - expect_true(sum(str_detect(data$input_names, ";")) > 0) + expect_true(is.character(data$name)) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) ## }) ## @@ -80,7 +78,7 @@ test_that("Meta data frame has the expected structure", { ## expect_equal(length(colnames(data)), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -88,45 +86,45 @@ test_that("Meta data frame has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## test_that("Parameter 'center' has the expected structure", { - expect_no_error(cr_add_summit( + expect_no_error(peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = "STRONGEST" + combined_center = "STRONGEST" )) - expect_no_error(cr_add_summit( + expect_no_error(peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = "mean" + combined_center = "middle" )) ## - expect_error(cr_add_summit( + expect_error(peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = mean + combined_center = mean )) - expect_error(cr_add_summit( + expect_error(peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = 2 + combined_center = 2 )) - expect_error(cr_add_summit( + expect_error(peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = c(1, 2, 3) + combined_center = c(1, 2, 3) ), "`") - expect_error(cr_add_summit( + expect_error(peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = NULL + combined_center = NULL ), "`") - expect_error(cr_add_summit( + expect_error(peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = NA + combined_center = NA ), "`") }) ## @@ -143,41 +141,38 @@ test_that("Output data frame is correct", { ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$input_names)) ## - expect_identical(nrow(data), as.integer(9)) + expect_identical(nrow(data), as.integer(8)) expect_identical(data$center[1], 500) - expect_identical(sum(data$score), 755) + expect_identical(round(sum(data$score), 0), 660) ## }) ## test_that("Output data results with different summits", { - data <- cr_add_summit( + data <- peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = "nearest" + combined_center = "nearest" ) expect_identical(data$center[7], 500) - expect_identical(data$center_origin[7], "treatment_rep1|7") ## - data <- cr_add_summit( + data <- peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = "strongest" + combined_center = "strongest" ) - expect_identical(data$center[7], 400) - expect_identical(data$center_origin[7], "control_rep2|5") + expect_identical(data$center[7], 600) ## - data <- cr_add_summit( + data <- peakCombiner:::cr_add_summit( data = test_data_overlap, input = test_data_filtered, - center = "middle" + combined_center = "middle" ) expect_identical(data$center[7], 550) - expect_identical(data$center_origin[7], "calculated") ## }) ## diff --git a/tests/testthat/test-cr_disjoin_filter.R b/tests/testthat/test-cr_disjoin_filter.R index bdf0be8..ba7b685 100644 --- a/tests/testthat/test-cr_disjoin_filter.R +++ b/tests/testthat/test-cr_disjoin_filter.R @@ -1,46 +1,36 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## required_colnames <- c( - "chr", "start", "end", "width", "strand", "revmap", - "ranking_comb_ref", "name", "rowname_disjoin" + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble input_colnames <- colnames(test_data) ## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) -test_data_filtered <- filter_regions( +test_data_filtered <- peakCombiner::filter_regions( data = test_data_center_expand, - filter_by_blacklist = "hg38", # "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL -) |> suppressWarnings() -## -input_colnames <- colnames(test_data_filtered) + exclude_by_blacklist = "hg38", # "hg38", + include_by_chromosome_name = NULL, + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL +) + ## -test_data_disjoin_filter <- cr_disjoin_filter( +test_data_disjoin_filter <- peakCombiner:::cr_disjoin_filter( data = test_data_filtered, found_in_samples = 2 ) @@ -56,7 +46,7 @@ test_that("Input data frame has the expected structure", { ## expect_equal(length(input_colnames), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -64,61 +54,62 @@ test_that("Input data frame has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## test_that("Parameter 'found_in_samples' has the correct structure", { - expect_no_error(cr_disjoin_filter( + expect_no_error(peakCombiner:::cr_disjoin_filter( data = test_data_filtered, found_in_samples = 3 )) - expect_error(cr_disjoin_filter( + expect_error(peakCombiner:::cr_disjoin_filter( data = test_data_filtered, found_in_samples = 0 - ), "'") - expect_error(cr_disjoin_filter( + ), "Arg") + expect_error(peakCombiner:::cr_disjoin_filter( data = test_data_filtered, found_in_samples = NULL - ), "'") - expect_error(cr_disjoin_filter( + ), "Arg") + expect_error(peakCombiner:::cr_disjoin_filter( data = test_data_filtered, found_in_samples = NA - ), "'") - expect_error(cr_disjoin_filter( + ), ) + expect_error(peakCombiner:::cr_disjoin_filter( data = test_data_filtered, found_in_samples = c(1, 2, 3) ), "'") - expect_error(cr_disjoin_filter( + expect_error(peakCombiner:::cr_disjoin_filter( data = test_data_filtered, found_in_samples = test_data_filtered - ), "'") + ), "Arg") }) ### -----------------------------------------------------------------------### ### Test Output ### -----------------------------------------------------------------------### ## test_that("Output data frame is correct", { - data <- test_data_disjoin_filter + data <- test_data_disjoin_filter |> + dplyr::mutate(chrom = as.character(chrom)) ## - expect_setequal(colnames(data), required_colnames) - expect_equal(ncol(data), 9) + expect_setequal(colnames(data), result_colnames) + expect_equal(ncol(data), 12) ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$sample_name)) ## - expect_identical(nrow(data), as.integer(186)) + expect_identical(nrow(data), as.integer(113)) expect_identical(data$start[1], 150) ## test_counts_left <- test_data_filtered |> dplyr::group_by(sample_name) |> - dplyr::summarise(counts = n()) |> + dplyr::summarise(counts = dplyr::n()) |> dplyr::filter(sample_name == "treatment_rep1") |> dplyr::pull(counts) - expect_identical(test_counts_left, as.integer(12)) + expect_identical(test_counts_left, as.integer(9)) }) ## ### -----------------------------------------------------------------------### diff --git a/tests/testthat/test-cr_overlap_with_summits.R b/tests/testthat/test-cr_overlap_with_summits.R index f98f899..2353c4c 100644 --- a/tests/testthat/test-cr_overlap_with_summits.R +++ b/tests/testthat/test-cr_overlap_with_summits.R @@ -1,48 +1,50 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## -input_colnames <- c("chr", "start", "end", "width", "strand", "input_names") +reduced_colnames <- c( + "chrom", "start", "end", "width", "strand", "name", "center", "score" +) ## required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) -output_colnames <- c("chr", "start", "end", "width", "strand", "input_names") +output_colnames <- c( + "chrom", "start", "end", "width", "strand", "input_names" +) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble +input_colnames <- colnames(test_data) ## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) -test_data_filtered <- filter_regions( +test_data_filtered <- peakCombiner::filter_regions( data = test_data_center_expand, - filter_by_blacklist = "hg38", # "hg38", - filter_by_chromosome_names = c("chr1", "chr10", "chr2", "chr42"), - filter_by_significance = NULL, - filter_by_top_enriched = NULL -) |> suppressWarnings() -test_data_disjoin_filter <- cr_disjoin_filter(data = test_data_filtered) -test_data_reduce <- cr_reduce(data = test_data_disjoin_filter) + exclude_by_blacklist = "hg38", + include_by_chromosome_name = c("chr1", "chr10", "chr2", "chr42"), + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL +) +test_data_disjoin_filter <- peakCombiner:::cr_disjoin_filter( + data = test_data_filtered, + found_in_samples = 2 +) +test_data_reduce <- peakCombiner:::cr_reduce( + data = test_data_disjoin_filter +) ## -test_data_overlap <- cr_overlap_with_summits( +test_data_overlap <- peakCombiner:::cr_overlap_with_summits( data = test_data_reduce, input = test_data_filtered ) @@ -53,26 +55,28 @@ test_data_overlap <- cr_overlap_with_summits( ## test_that("Input data frame has the expected structure", { ## - data <- test_data_reduce + data <- test_data_reduce |> + dplyr::mutate(chrom = as.character(chrom)) ## - expect_equal(length(colnames(data)), 6) - expect_identical(names(data), input_colnames) - expect_true(is.character(data$chr)) + expect_equal(length(colnames(data)), 8) + expect_identical(names(data), reduced_colnames) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) - expect_true(is.character(data$input_names)) - expect_true(sum(str_detect(data$input_names, "|")) > 0) - expect_true(sum(str_detect(data$input_names, ";")) > 0) + expect_true(is.character(data$name)) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "_")) > 0) ## }) ## test_that("Input data frame has the expected structure", { ## - data <- test_data_filtered + data <- test_data_filtered |> + dplyr::mutate(chrom = as.character(chrom)) ## expect_equal(length(colnames(data)), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -80,7 +84,7 @@ test_that("Input data frame has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## ### -----------------------------------------------------------------------### @@ -89,21 +93,22 @@ test_that("Input data frame has the expected structure", { ## test_that("Output data frame is correct", { ## - data <- test_data_overlap + data <- test_data_overlap |> + dplyr::mutate(chrom = as.character(chrom)) ## - expect_setequal(colnames(data), output_colnames) - expect_equal(ncol(data), 6) + expect_setequal(colnames(data), reduced_colnames) + expect_equal(ncol(data), 8) ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) - expect_true(is.character(data$input_names)) + expect_true(is.character(data$name)) ## - expect_identical(nrow(data), as.integer(9)) - expect_identical(data$start[1], 150) - expect_identical(sum(data$width), 6800L) + expect_identical(nrow(data), as.integer(41)) + expect_identical(data$start[1], 150L) + expect_identical(sum(data$width), 31341L) ## }) ## diff --git a/tests/testthat/test-cr_reduce.R b/tests/testthat/test-cr_reduce.R index 7068475..9e2cb8f 100644 --- a/tests/testthat/test-cr_reduce.R +++ b/tests/testthat/test-cr_reduce.R @@ -1,63 +1,66 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## input_colnames <- c( - "chr", "start", "end", "width", "strand", "revmap", - "ranking_comb_ref", "name", "rowname_disjoin" + "chrom", "start", "end", "width", "strand", "revmap", "sample_name", + "ranking_comb_ref", "name", "center", "score", "rowname_disjoin" ) ## output_colnames <- c("chr", "start", "end", "width", "strand", "input_names") ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble ## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( +## +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) -test_data_filtered <- filter_regions( +## +test_data_filtered <- peakCombiner::filter_regions( data = test_data_center_expand, - filter_by_blacklist = "hg38", # "hg38", - filter_by_chromosome_names = c("chr1", "chr10", "chr2", "chr42"), - filter_by_significance = NULL, - filter_by_top_enriched = NULL -) |> suppressWarnings() -test_data_disjoin_filter <- cr_disjoin_filter(data = test_data_filtered) + exclude_by_blacklist = "hg38", + include_by_chromosome_name = c("chr1", "chr10", "chr2", "chr42"), + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL +) ## -test_data_reduce <- cr_reduce(data = test_data_disjoin_filter) +test_data_disjoin_filter <- peakCombiner:::cr_disjoin_filter( + data = test_data_filtered, + found_in_samples = 2 +) ## -output_colnames <- colnames(test_data_reduce) +test_data_reduce <- peakCombiner:::cr_reduce( + data = test_data_disjoin_filter +) +## +output_colnames <- colnames( + test_data_reduce +) ## ### -----------------------------------------------------------------------### ### Test input ### -----------------------------------------------------------------------### ## test_that("Input data frame has the expected structure", { - data <- test_data_disjoin_filter + data <- test_data_disjoin_filter |> + dplyr::mutate(chrom = as.character(chrom)) ## - expect_equal(length(input_colnames), 9) + expect_equal(length(names(data)), 12) expect_identical(names(data), input_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## ### -----------------------------------------------------------------------### @@ -65,21 +68,22 @@ test_that("Input data frame has the expected structure", { ### -----------------------------------------------------------------------### ## test_that("Output data frame is correct", { - data <- test_data_reduce + data <- test_data_reduce |> + dplyr::mutate(chrom = as.character(chrom)) ## expect_setequal(colnames(data), output_colnames) - expect_equal(ncol(data), 6) + expect_equal(ncol(data), 8) ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) - expect_true(is.character(data$input_names)) + expect_true(is.character(data$name)) ## - expect_identical(nrow(data), as.integer(10)) - expect_identical(data$start[1], 150) - expect_identical(sum(data$width), 6900L) + expect_identical(nrow(data), 45L) + expect_identical(data$start[1], 150L) + expect_identical(round(sum(data$width),0), 31745) ## }) ## diff --git a/tests/testthat/test-define_expansion.R b/tests/testthat/test-define_expansion.R index cb1b3f4..cae504f 100644 --- a/tests/testthat/test-define_expansion.R +++ b/tests/testthat/test-define_expansion.R @@ -1,33 +1,23 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +test_expansion_value <- 350 +## +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble input_colnames <- colnames(test_data) ## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" -) -test_expansion_value <- define_expansion( - data = test_data, - expand_by = NULL +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) ## ### -----------------------------------------------------------------------### @@ -35,7 +25,7 @@ test_expansion_value <- define_expansion( ### -----------------------------------------------------------------------### ## test_that("Test if function works with correct input", { - expect_no_error(define_expansion( + expect_no_error(peakCombiner:::define_expansion( data = test_data, expand_by = NULL )) @@ -45,24 +35,25 @@ test_that("Test if function works with correct input", { ## test_that("Required colnumn names has the expected structure", { data <- test_data + expect_equal(length(input_colnames), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) - expect_true(is.character(data$name)) + expect_true(length(data$name) > 0) expect_true(is.numeric(data$score)) expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + # expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## test_that("Required colnumn names has the expected structure", { data <- test_data_prepared expect_equal(length(colnames(test_data_prepared)), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -70,7 +61,7 @@ test_that("Required colnumn names has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## ### -----------------------------------------------------------------------### diff --git a/tests/testthat/test-extract_chromosome_names.R b/tests/testthat/test-extract_chromosome_names.R deleted file mode 100644 index 2165755..0000000 --- a/tests/testthat/test-extract_chromosome_names.R +++ /dev/null @@ -1,125 +0,0 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) -## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() -## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### -## -## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) -input_colnames <- colnames(test_data) -## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" -) -test_data_center_expand <- center_expand_regions( - data = test_data_prepared, - center_by = "summit", - expand_by = NULL -) -input_chr <- test_data_center_expand |> - dplyr::pull(chr) |> - unique() -## -test_keep_chromosomes <- extract_chromosome_names( - data = test_data_center_expand, - keep_chromosomes = "alphanumeric" # "alphanumeric" -) -## -### -----------------------------------------------------------------------### -### Test input -### -----------------------------------------------------------------------### -## -test_that("Test if function works with correct input", { - expect_no_error(extract_chromosome_names( - data = test_data_center_expand, - keep_chromosomes = "alphanumeric" - )) -}) -## -### -----------------------------------------------------------------------### -## -test_that("Required colnumn names has the expected structure", { - data <- test_data_center_expand - ## - expect_equal(length(input_colnames), 8) - expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) - expect_true(is.numeric(data$start)) - expect_true(is.numeric(data$end)) - expect_true(is.character(data$name)) - expect_true(is.numeric(data$score)) - expect_true(is.character(data$strand)) - expect_true(is.numeric(data$center)) - expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) -}) -## -### -----------------------------------------------------------------------### -## -test_that("Required paramter 'data' has the expected structure/value", { - expect_error(extract_chromosome_names( - data = test_data_center_expand[2:4], - keep_chromosomes = "all" - )) - expect_error(extract_chromosome_names( - data = "nonexisting", - keep_chromosomes = "all" - )) - expect_error(extract_chromosome_names( - data = 1:10, - keep_chromosomes = "all" - )) -}) -## -### -----------------------------------------------------------------------### -## -test_that("Required paramter 'center_by' has the expected structure/value", { - expect_no_error(extract_chromosome_names( - data = test_data_center_expand, - keep_chromosomes = "aLphanumeric" - )) - expect_no_error(extract_chromosome_names( - data = test_data_center_expand, - keep_chromosomes = "ALL" - )) - ## - ### -----------------------------------------------------------------------### - ## - expect_error(extract_chromosome_names( - data = test_data_center_expand, - keep_chromosomes = "chr1" - )) - expect_error(extract_chromosome_names( - data = test_data_center_expand, - keep_chromosomes = c("chr1", "chr10") - )) - expect_error(extract_chromosome_names( - data = test_data_center_expand, - keep_chromosomes = 1:10 - )) - ## -}) -## -### -----------------------------------------------------------------------### -### Test Output -### -----------------------------------------------------------------------### -## -test_that("Output data frame is correct", { - expect_true(is.vector(test_keep_chromosomes)) - expect_true(all(test_keep_chromosomes %in% input_chr)) - ## - expect_true(length(test_keep_chromosomes) == 3) - expect_false("chr4 2" %in% test_keep_chromosomes) -}) -## -### -----------------------------------------------------------------------### diff --git a/tests/testthat/test-filter_by_blacklist.R b/tests/testthat/test-filter_by_blacklist.R index 54a85e1..914de3a 100644 --- a/tests/testthat/test-filter_by_blacklist.R +++ b/tests/testthat/test-filter_by_blacklist.R @@ -1,52 +1,43 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble input_colnames <- colnames(test_data) ## -test_data_prepared <- prepare_input_regions( - data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( +## +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) -test_data_filtered <- filter_by_chromosome_names( - data_filtered = test_data_center_expand, - filter_by_chromosome_names = c("chr1", "chr10", "chr42") +## +test_data_filtered <- peakCombiner:::filter_by_chromosome_names( + data = test_data_center_expand, + include_by_chromosome_name = c("chr1", "chr10", "chr42") ) ## input_colnames <- colnames(test_data_filtered) ## -blacklist <- tibble( - chr = c("chr1", "chr5"), - start = 6500, - end = 7500 -) +data(blacklist_hg38, package = "peakCombiner") +blacklist <- blacklist_hg38 ## -test_data_filtered_bl <- filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = blacklist -) |> suppressWarnings() +test_data_filtered_bl <- peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = blacklist +) ## result_colnames <- colnames(test_data_filtered) ## @@ -56,17 +47,17 @@ result_colnames <- colnames(test_data_filtered) ## ## test_that("Test if function works with correct input", { - expect_no_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = blacklist + expect_no_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = blacklist )) - expect_no_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = NULL + expect_no_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = NULL )) - expect_no_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = "hg38" + expect_no_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = "hg38" )) }) ## @@ -74,9 +65,10 @@ test_that("Test if function works with correct input", { ## test_that("Input data frame has the expected structure", { data <- test_data_filtered + expect_equal(length(input_colnames), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -84,68 +76,69 @@ test_that("Input data frame has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## ### -----------------------------------------------------------------------### ## test_that("Required parameter 'filter_by_blacklist' has expected structure", { - expect_no_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = NULL - )) |> suppressWarnings() - expect_no_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = "HG38" - )) |> suppressWarnings() - expect_no_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = "mm10" - )) |> suppressWarnings() - expect_no_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = "hg19" - )) |> suppressWarnings() + expect_no_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = NULL + )) + expect_no_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = "HG38" + )) + expect_no_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = "mm10" + )) ## - expect_error(filter_by_blacklist( - data_filtered = test_data_filtered, + expect_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, filter_by_blacklist = blacklist[1:2] )) - ## - ### -----------------------------------------------------------------------### - ## +}) +## +### -----------------------------------------------------------------------### +## +test_that("For 'filter_by_blacklist' providing blacklist with different + names", { blacklist2 <- blacklist - colnames(blacklist2) <- c("CHR", "start", "end") + colnames(blacklist2) <- c("CHROM", "start", "end") ## - expect_no_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = blacklist2 + expect_no_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = blacklist2 )) ## colnames(blacklist2) <- c("seqnames", "start", "end") ## - expect_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = blacklist2 + expect_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = blacklist2 )) - ## - ### -----------------------------------------------------------------------### - ## - expect_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = "mm38" +}) +## +### -----------------------------------------------------------------------### +## +test_that("Wrong input for exclude_by_blacklist for 'filter_by_blacklist'", { + expect_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = "mm38" )) - expect_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = hg38 + expect_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = hg38 )) - expect_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = 1 + expect_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = 1 )) - expect_error(filter_by_blacklist( - data_filtered = test_data_filtered, - filter_by_blacklist = c(1, 2) + expect_error(peakCombiner:::filter_by_blacklist( + data = test_data_filtered, + exclude_by_blacklist = c(1, 2) )) }) ## @@ -161,7 +154,7 @@ test_that("Output data frame is correct", { ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -170,9 +163,9 @@ test_that("Output data frame is correct", { expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) ## - expect_equal(mean(data$center), 2992.68293) - expect_identical(nrow(data), as.integer(41)) - expect_identical(data$start[1], 250) + expect_equal(round(mean(data$center), 0), 3168) + expect_identical(nrow(data), 38L) + expect_identical(data$start[1], 250L) }) ## ### -----------------------------------------------------------------------### diff --git a/tests/testthat/test-filter_by_chromosome_names.R b/tests/testthat/test-filter_by_chromosome_names.R index a824e6a..288cc06 100644 --- a/tests/testthat/test-filter_by_chromosome_names.R +++ b/tests/testthat/test-filter_by_chromosome_names.R @@ -1,33 +1,26 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble input_colnames <- colnames(test_data) ## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( +## +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) ## @@ -35,9 +28,9 @@ input_colnames <- colnames(test_data_center_expand) ## keep_chromosomes <- c("chr1", "chr10", "chr42") ## -test_data_filtered <- filter_by_chromosome_names( - data_filtered = test_data_center_expand, - filter_by_chromosome_names = keep_chromosomes +test_data_filtered <- peakCombiner:::filter_by_chromosome_names( + data = test_data_center_expand, + include_by_chromosome_name = keep_chromosomes ) ## result_colnames <- colnames(test_data_filtered) @@ -47,9 +40,9 @@ result_colnames <- colnames(test_data_filtered) ### -----------------------------------------------------------------------### ## test_that("Test if function works with correct input", { - expect_no_error(filter_by_chromosome_names( - data_filtered = test_data_center_expand, - filter_by_chromosome_names = keep_chromosomes + expect_no_error(peakCombiner:::filter_by_chromosome_names( + data = test_data_center_expand, + include_by_chromosome_name = keep_chromosomes )) }) ## @@ -60,7 +53,7 @@ test_that("Input data frame has the expected structure", { ## expect_equal(length(input_colnames), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -68,28 +61,29 @@ test_that("Input data frame has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## ### -----------------------------------------------------------------------### ## -test_that("Required parameter 'filter_by_chromosome_names' has expected structure", { - expect_no_error(filter_by_chromosome_names( - data_filtered = test_data_filtered, - filter_by_chromosome_names = NULL - )) |> suppressWarnings() - expect_no_error(filter_by_chromosome_names( - data_filtered = test_data_filtered, - filter_by_chromosome_names = "chr1" - )) |> suppressWarnings() - expect_no_error(filter_by_chromosome_names( - data_filtered = test_data_filtered, - filter_by_chromosome_names = keep_chromosomes - )) |> suppressWarnings() +test_that("Required parameter 'filter_by_chromosome_names' has expected + structure", { + expect_no_error(peakCombiner:::filter_by_chromosome_names( + data = test_data_filtered, + include_by_chromosome_name = NULL + )) + expect_no_error(peakCombiner:::filter_by_chromosome_names( + data = test_data_filtered, + include_by_chromosome_name = "chr1" + )) + expect_no_error(peakCombiner:::filter_by_chromosome_names( + data = test_data_filtered, + include_by_chromosome_name = keep_chromosomes + )) ## - expect_error(filter_by_chromosome_names( - data_filtered = test_data_filtered, - filter_by_chromosome_names = NA + expect_error(peakCombiner:::filter_by_chromosome_names( + data = test_data_filtered, + include_by_chromosome_name = NA )) }) ## @@ -105,7 +99,7 @@ test_that("Output data frame is correct", { ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -114,8 +108,8 @@ test_that("Output data frame is correct", { expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) ## - expect_equal(mean(data$center), 2992.68293) - expect_identical(nrow(data), as.integer(41)) + expect_equal(round(mean(data$center), 0), 3168) + expect_identical(nrow(data), 38L) expect_identical(data$start[1], 250) }) ## diff --git a/tests/testthat/test-filter_by_significance.R b/tests/testthat/test-filter_by_significance.R index 98d3552..4bf40c9 100644 --- a/tests/testthat/test-filter_by_significance.R +++ b/tests/testthat/test-filter_by_significance.R @@ -1,33 +1,25 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble input_colnames <- colnames(test_data) ## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) ## @@ -35,9 +27,9 @@ input_colnames <- colnames(test_data_center_expand) ## filter_by_significance <- 40 ## -test_data_filtered <- filter_by_significance( - data_filtered = test_data_center_expand, - filter_by_significance = filter_by_significance +test_data_filtered <- peakCombiner:::filter_by_significance( + data = test_data_center_expand, + include_above_score_cutoff = filter_by_significance ) ## result_colnames <- colnames(test_data_filtered) @@ -47,9 +39,9 @@ result_colnames <- colnames(test_data_filtered) ### -----------------------------------------------------------------------### ## test_that("Test if function works with correct input", { - expect_no_error(filter_by_significance( - data_filtered = test_data_center_expand, - filter_by_significance = filter_by_significance + expect_no_error(peakCombiner:::filter_by_significance( + data = test_data_center_expand, + include_above_score_cutoff = filter_by_significance )) }) ## @@ -60,7 +52,7 @@ test_that("Input data frame has the expected structure", { ## expect_equal(length(input_colnames), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -68,32 +60,32 @@ test_that("Input data frame has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## ### -----------------------------------------------------------------------### ## test_that("Required parameter 'filter_by_significance' has expected structure", { - expect_no_error(filter_by_significance( - data_filtered = test_data_filtered, - filter_by_significance = NULL + expect_no_error(peakCombiner:::filter_by_significance( + data = test_data_filtered, + include_above_score_cutoff = NULL )) - expect_no_error(filter_by_significance( - data_filtered = test_data_filtered, - filter_by_significance = 0 + expect_no_error(peakCombiner:::filter_by_significance( + data = test_data_filtered, + include_above_score_cutoff = 0 )) ## - expect_error(filter_by_significance( - data_filtered = test_data_filtered, - filter_by_significance = NA + expect_error(peakCombiner:::filter_by_significance( + data = test_data_filtered, + include_above_score_cutoff = NA )) - expect_error(filter_by_significance( - data_filtered = test_data_filtered, - filter_by_significance = "nonexisting" + expect_error(peakCombiner:::filter_by_significance( + data = test_data_filtered, + include_above_score_cutoff = "nonexisting" )) - expect_error(filter_by_significance( - data_filtered = test_data_filtered, - filter_by_significance = c(1, 2, 3) + expect_error(peakCombiner:::filter_by_significance( + data = test_data_filtered, + include_above_score_cutoff = c(1, 2, 3) )) ## }) @@ -110,7 +102,7 @@ test_that("Output data frame is correct", { ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -119,9 +111,9 @@ test_that("Output data frame is correct", { expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) ## - expect_equal(mean(data$center), 2026.8657) - expect_identical(nrow(data), as.integer(67)) - expect_identical(data$start[1], 250) + expect_equal(round(mean(data$center), 0), 2547) + expect_identical(nrow(data), 38L) + expect_identical(data$start[1], 4550) }) ## ### -----------------------------------------------------------------------### diff --git a/tests/testthat/test-filter_by_top_enriched.R b/tests/testthat/test-filter_by_top_enriched.R index aa7fe64..dc86f09 100644 --- a/tests/testthat/test-filter_by_top_enriched.R +++ b/tests/testthat/test-filter_by_top_enriched.R @@ -1,41 +1,33 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble input_colnames <- colnames(test_data) ## -test_data_prepared <- prepare_input_regions( - input_data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) ## input_colnames <- colnames(test_data_center_expand) ## -test_data_filtered <- filter_by_top_enriched( - data_filtered = test_data_center_expand, - filter_by_top_enriched = 10 +test_data_filtered <- peakCombiner:::filter_by_top_enriched( + data = test_data_center_expand, + include_top_n_scoring = 10 ) ## result_colnames <- colnames(test_data_filtered) @@ -47,9 +39,9 @@ table(test_data_filtered$sample_name) ### -----------------------------------------------------------------------### ## test_that("Test if function works with correct input", { - expect_no_error(filter_by_top_enriched( - data_filtered = test_data_center_expand, - filter_by_top_enriched = 10 + expect_no_error(peakCombiner:::filter_by_top_enriched( + data = test_data_center_expand, + include_top_n_scoring = 10 )) }) ## @@ -60,7 +52,7 @@ test_that("Input data frame has the expected structure", { ## expect_equal(length(input_colnames), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -68,36 +60,36 @@ test_that("Input data frame has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## ### -----------------------------------------------------------------------### ## test_that("Required parameter 'filter_by_top_enriched' has expected structure", { - expect_no_error(filter_by_top_enriched( - data_filtered = test_data_center_expand, - filter_by_top_enriched = NULL + expect_no_error(peakCombiner:::filter_by_top_enriched( + data = test_data_center_expand, + include_top_n_scoring = NULL )) - expect_no_error(filter_by_top_enriched( - data_filtered = test_data_center_expand, - filter_by_top_enriched = 5 + expect_no_error(peakCombiner:::filter_by_top_enriched( + data = test_data_center_expand, + include_top_n_scoring = 5 )) ## - expect_error(filter_by_top_enriched( - data_filtered = test_data_center_expand, - filter_by_top_enriched = 0 + expect_error(peakCombiner:::filter_by_top_enriched( + data = test_data_center_expand, + include_top_n_scoring = 0 )) - expect_error(filter_by_top_enriched( - data_filtered = test_data_center_expand, - filter_by_top_enriched = NA + expect_error(peakCombiner:::filter_by_top_enriched( + data = test_data_center_expand, + include_top_n_scoring = NA )) - expect_error(filter_by_top_enriched( - data_filtered = test_data_center_expand, - filter_by_top_enriched = "notexisting" + expect_error(peakCombiner:::filter_by_top_enriched( + data = test_data_center_expand, + include_top_n_scoring = "notexisting" )) - expect_error(filter_by_top_enriched( - data_filtered = test_data_center_expand, - filter_by_top_enriched = c(1, 2, 3) + expect_error(peakCombiner:::filter_by_top_enriched( + data = test_data_center_expand, + include_top_n_scoring = c(1, 2, 3) )) }) ## @@ -113,7 +105,7 @@ test_that("Output data frame is correct", { ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -122,16 +114,16 @@ test_that("Output data frame is correct", { expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) ## - expect_equal(mean(data$center), 1907.8125) - expect_identical(nrow(data), as.integer(64)) - expect_identical(data$start[1], 250) + expect_equal(round(mean(data$center), 0), 2458) + expect_identical(nrow(data), 52L) + expect_identical(data$start[1], 350) ## test_counts_left <- test_data_filtered |> dplyr::group_by(sample_name) |> - dplyr::summarise(counts = n()) |> + dplyr::summarise(counts = dplyr::n()) |> dplyr::filter(sample_name == "treatment_rep1") |> dplyr::pull(counts) - expect_identical(test_counts_left, as.integer(10)) + expect_identical(test_counts_left, 9L) }) ## ### -----------------------------------------------------------------------### diff --git a/tests/testthat/test-filter_regions.R b/tests/testthat/test-filter_regions.R index b2f7396..624f021 100644 --- a/tests/testthat/test-filter_regions.R +++ b/tests/testthat/test-filter_regions.R @@ -1,57 +1,50 @@ -# Example -# test_that("multiplication works", { -# expect_equal(2 * 2, 4) -# }) ## ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## required_colnames <- c( - "chr", "start", "end", "name", "score", "strand", + "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) ## -test_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble + input_colnames <- colnames(test_data) ## -test_data_prepared <- prepare_input_regions( - data = test_data, - score_colname = "qValue" +test_data_prepared <- peakCombiner::prepare_input_regions( + data = test_data ) -test_data_center_expand <- center_expand_regions( +test_data_center_expand <- peakCombiner::center_expand_regions( data = test_data_prepared, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) ## input_colnames <- colnames(test_data_center_expand) ## -test_data_filtered <- filter_regions( +test_data_filtered <- peakCombiner::filter_regions( data = test_data_center_expand, - filter_by_chromosome_names = NULL, - filter_by_blacklist = "hg38", # "hg38", - filter_by_significance = NULL, - filter_by_top_enriched = NULL -) |> suppressWarnings() + include_by_chromosome_name = NULL, + exclude_by_blacklist = "hg38", + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL +) ## result_colnames <- colnames(test_data_filtered) ## -test_data_combined <- combine_regions( +test_data_combined <- peakCombiner::combine_regions( data = test_data_filtered, found_in_samples = 2, - center = "nearest" + combined_center = "nearest" ) ## -test_data_combined_ce <- center_expand_regions( +test_data_combined_ce <- peakCombiner::center_expand_regions( data = test_data_combined, - center_by = "summit", + center_by = "center_column", expand_by = NULL ) ## @@ -64,7 +57,7 @@ test_that("Input data frame has the expected structure", { ## expect_equal(length(input_colnames), 8) expect_identical(names(data), required_colnames) - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -72,7 +65,7 @@ test_that("Input data frame has the expected structure", { expect_true(is.character(data$strand)) expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) - expect_true(sum(str_detect(data$name, "|")) > 0) + expect_true(sum(stringr::str_detect(data$name, "|")) > 0) }) ## ### -----------------------------------------------------------------------### @@ -87,7 +80,7 @@ test_that("Output data frame is correct", { ## expect_identical(class(data)[2], "tbl") ## - expect_true(is.character(data$chr)) + expect_true(is.character(data$chrom)) expect_true(is.numeric(data$start)) expect_true(is.numeric(data$end)) expect_true(is.character(data$name)) @@ -96,124 +89,103 @@ test_that("Output data frame is correct", { expect_true(is.numeric(data$center)) expect_true(is.character(data$sample_name)) ## - expect_equal(mean(data$center), 1946.4789) - expect_identical(nrow(data), as.integer(71)) - expect_identical(data$start[1], 250L) + expect_equal(round(mean(data$center), 0), 2458) + expect_identical(nrow(data), 52L) + expect_identical(data$start[1], 350L) ## test_counts_left <- test_data_filtered |> dplyr::group_by(sample_name) |> - dplyr::summarise(counts = n()) |> + dplyr::summarise(counts = dplyr::n()) |> dplyr::filter(sample_name == "treatment_rep1") |> dplyr::pull(counts) - expect_identical(test_counts_left, as.integer(12)) + expect_identical(test_counts_left, 9L) }) ## -### -----------------------------------------------------------------------### +### --------------------------------------------------------------------------### ## test_that("Output data frame is correct for data_prepared", { ## data <- test_data_prepared - result <- filter_regions( + ## + expect_no_error(peakCombiner::filter_regions( data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings() + exclude_by_blacklist = "hg38", + include_by_chromosome_name = NULL, + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL + )) ## - expect_no_error(filter_regions( + result <- peakCombiner::filter_regions( data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings()) - ## - expect_identical(nrow(result), 71L) - expect_identical(result$start[9], 300L) + exclude_by_blacklist = "hg38", + include_by_chromosome_name = NULL, + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL + ) + ## + expect_identical(nrow(result), 52L) + expect_identical(result$start[9], 301L) }) ## test_that("Output data frame is correct for data_center_expand", { ## data <- test_data_center_expand - result <- filter_regions( - data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings() ## - expect_no_error(filter_regions( + result <- peakCombiner::filter_regions( data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings()) + exclude_by_blacklist = "hg38", + include_by_chromosome_name = NULL, + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL + ) ## - expect_identical(nrow(result), 71L) + expect_identical(nrow(result), 52L) expect_identical(result$start[9], 250L) }) ## test_that("Output data frame is correct for data_filtered", { ## data <- test_data_filtered - result <- filter_regions( + result <- peakCombiner::filter_regions( data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings() + exclude_by_blacklist = "hg38", + include_by_chromosome_name = NULL, + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL + ) ## - expect_no_error(filter_regions( - data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings()) - ## - expect_identical(nrow(result), 71L) + expect_identical(nrow(result), 52L) expect_identical(result$start[9], 250L) }) ## test_that("Output data frame is correct for data_combined", { ## data <- test_data_combined - ## - expect_no_error(filter_regions( + result <- peakCombiner::filter_regions( data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings()) + exclude_by_blacklist = "hg38", + include_by_chromosome_name = NULL, + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL + ) ## + expect_identical(nrow(result), 10L) + expect_identical(result$start[9], 250L) }) ## test_that("Output data frame is correct for data_combined_ce", { ## data <- test_data_combined_ce - result <- filter_regions( + result <- peakCombiner::filter_regions( data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings() + exclude_by_blacklist = "hg38", + include_by_chromosome_name = NULL, + include_above_score_cutoff = NULL, + include_top_n_scoring = NULL + ) ## - expect_no_error(filter_regions( - data = data, - filter_by_blacklist = "hg38", - filter_by_chromosome_names = NULL, - filter_by_significance = NULL, - filter_by_top_enriched = NULL - ) |> suppressWarnings()) - ## - expect_identical(nrow(result), 13L) - expect_identical(result$start[9], 100L) + expect_identical(nrow(result), 10L) + expect_identical(result$start[9], 250L) }) ## ### -----------------------------------------------------------------------### diff --git a/tests/testthat/test-load_input_regions.R b/tests/testthat/test-load_input_regions.R index 4d2e9ce..04d4042 100644 --- a/tests/testthat/test-load_input_regions.R +++ b/tests/testthat/test-load_input_regions.R @@ -2,40 +2,28 @@ ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### +set.seed(1234) ## -input_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/opbaf-brd9-muckema1_rpackage_comb_peak/support/sample_sheet_test.tsv", show_col_types = FALSE) -# input_data <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/combpeaksr/lists/synthetic_genomic_regions.bed", show_col_types = FALSE) -input_colnames <- colnames(input_data) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble ## all_colnames <- c( - "chr", "start", "end", "name", "score", "strand", "center", "sample_name" + "chrom", "start", "end", "name","score", "strand", "center", "sample_name" ) -## -samplesheet_colnames <- c( - "sample_name", "file_path", "file_format" +input_colnames <- c( + "chrom", "start", "end", "sample_name" ) ## -data_prepared <- load_input_regions( - data = input_data, - score_colname = "qValue", - all_colnames = all_colnames -) +data_prepared <- test_data ## ### -----------------------------------------------------------------------### ### Test input ### -----------------------------------------------------------------------### ## test_that("Test if function works with correct input", { - expect_no_error(load_input_regions( - data = input_data, - score_colname = NULL, - all_colnames = all_colnames + expect_error(peakCombiner::load_input_regions( + data = test_data )) }) ## @@ -54,108 +42,46 @@ test_that("Parameter `score_colname` is not numeric.", { ### -----------------------------------------------------------------------### ## test_that("Input data has exact three columns.", { - expect_equal(length(input_colnames), 3) + expect_equal(length(input_colnames), 4) }) ## test_that("Input data colnames are the expected once.", { - expect_identical(names(input_data), allowed_col_names) + expect_identical(names(test_data), all_colnames) }) ## test_that("Input column 'sample_name' is a class 'character'.", { - expect_true(is.character(input_data$sample_name)) -}) -## -test_that("Input column 'file_path' is a class 'character'", { - expect_true(is.character(input_data$file_path)) -}) -## -test_that("Input column 'file_format' is a class 'character'", { - expect_true(is.character(input_data$file_format)) + expect_true(is.character(test_data$sample_name)) }) ## ### -----------------------------------------------------------------------### ## test_that("Error occurs when 'data' does not exist.", { - expect_error(load_input_regions( - data = "nonexisting", - score_colname = "qValue", - all_colnames = all_colnames - ), "input_data") + expect_error(peakCombiner::load_input_regions( + data = "nonexisting" + ), ) }) ## test_that("Error occurs when 'data' has the wrong structure.", { - expect_error(load_input_regions( - data = tibble(1:10), - score_colname = "qValue", - all_colnames = all_colnames - ), "input_data") + expect_error(peakCombiner::load_input_regions( + data = tibble(1:10) + )) }) ## test_that("Error occurs when 'data' is a vector.", { - expect_error(load_input_regions( - data = as.vector(1:10), - score_colname = "qValue", - all_colnames = all_colnames - ), "input_data") -}) -## -test_that("Error occurs when 'data' is 'NULL'.", { - expect_error(load_input_regions( - data = NULL, - score_colname = "qValue", - all_colnames = all_colnames - ), "input_data") -}) -## -test_that("Error occurs when 'data' is 'NA'.", { - expect_error(load_input_regions( - data = NA, - score_colname = "qValue", - all_colnames = all_colnames - ), "input_data") -}) -## -### -----------------------------------------------------------------------### -## -test_that("Error occurs when 'score_colname' is not existing.", { - expect_error(load_input_regions( - data = input_data, - score_colname = "nonexisting", - all_colnames = all_colnames - ), "score_colname") -}) -## -test_that("Error occurs when 'score_colname' is other required colname.", { - expect_error(load_input_regions( - data = input_data, - score_colname = "start", - all_colnames = all_colnames - ), "score_colname") -}) -## -### -----------------------------------------------------------------------### -## -test_that("Error occurs when 'all_colnames' is not existing.", { - expect_error(load_input_regions( - input_data = input_data, - score_colname = "qValue", - all_colnames = "nonexisting" + expect_error(peakCombiner::load_input_regions( + data = as.vector(1:10) ), ) }) ## -test_that("Error occurs when 'all_colnames' is vector of length 2.", { - expect_error(load_input_regions( - input_data = input_data, - score_colname = "qValue", - all_colnames = c("C1", "C2") +test_that("Error occurs when 'data' is 'NULL'.", { + expect_error(peakCombiner::load_input_regions( + data = NULL ), ) }) ## -test_that("Error occurs when 'all_colnames' is 'NULL'.", { - expect_error(load_input_region( - input_data = input_data, - score_colname = "qValue", - all_colnames = NULL +test_that("Error occurs when 'data' is 'NA'.", { + expect_error(peakCombiner::load_input_regions( + data = NA ), ) }) ## @@ -163,24 +89,16 @@ test_that("Error occurs when 'all_colnames' is 'NULL'.", { ### Test output ### -----------------------------------------------------------------------### ## -test_that("Column names of output data are identical with required once.", { - expect_setequal(colnames(data_prepared), all_colnames) -}) -## -test_that("Output data has the right number of columns", { - expect_equal(ncol(data_prepared), 8) - ## -}) -## test_that("Output data has the right class.", { expect_identical(class(data_prepared)[2], "tbl") ## }) ## test_that("Output data has in column 'score', row 1 the correct value.", { - expect_identical(data_prepared$score[1], 4701.96729) + expect_identical(round(data_prepared$score[1], 0), 100) }) ## test_that("Output data has the correct number of rows.", { - expect_identical(nrow(data_prepared), 814153L) + expect_identical(nrow(data_prepared), 55L) }) + diff --git a/tests/testthat/test-prepare_input_regions.R b/tests/testthat/test-prepare_input_regions.R index fc2eb90..0938e83 100644 --- a/tests/testthat/test-prepare_input_regions.R +++ b/tests/testthat/test-prepare_input_regions.R @@ -2,89 +2,53 @@ ### -----------------------------------------------------------------------### ### Prepare data for testing ### -----------------------------------------------------------------------### -## tweak the prepare_input_regions() function and re-load it -devtools::load_all() -library("tidyverse") -library("GenomicRanges") ## -### -----------------------------------------------------------------------### -### Prepare data for testing -### -----------------------------------------------------------------------### - +set.seed(1234) +## colnames_preloaded_df <- c( "chrom", "start", "end", "name", "score", "strand", "center", "sample_name" ) -colnames_sample_sheet <- c( - "sample_name", "file_path", "file_format", "score_colname" - ) - allowed_file_format <- c("narrowpeak", "broadpeak", "bed") +data(syn_data_bed, package = "peakCombiner") +samplesheet_test <- syn_data_bed -samplesheet_test <- readr::read_tsv("/da/ONC/BFx/research/muckema1/discovery_brd9/analysis/opbaf-brd9-muckema1_rpackage_comb_peak/support/sample_sheet_test.tsv", show_col_types = FALSE) - -test_sample_sheet <- prepare_input_regions( - data = samplesheet_test[1,] +test_sample_sheet <- peakCombiner::prepare_input_regions( + data = samplesheet_test[1, ] ) - -test_data <- readr::read_tsv(paste0("lists/synthetic_genomic_regions.bed"), show_col_types = FALSE) +data(syn_data_tibble, package = "peakCombiner") +test_data <- syn_data_tibble input_colnames <- colnames(test_data) -test_data_prepared <- prepare_input_regions( +test_data_prepared <- peakCombiner::prepare_input_regions( data = test_data ) restult_colnames <- colnames(test_data_prepared) - ### -----------------------------------------------------------------------### ### Test input ### -----------------------------------------------------------------------### -### Test sample sheet -test_that("Input data has all required columns", { - expect_true(all(colnames(samplesheet_test) %in% colnames_sample_sheet) -) -}) - -test_that("Check if all entries in sample_names are unique", { - expect_true(samplesheet_test |> - dplyr::pull("sample_name") |> - unique() |> - length() == nrow(samplesheet_test)) -}) - -test_that("Check if all entries in sample_names are unique", { - expect_true(samplesheet_test |> - dplyr::pull("file_format") |> - unique() |> - length() == 1) -}) - -### -----------------------------------------------------------------------### -### Test pre-loaded data frame +### Test pre-loaded data frame test_that("Test if function works with correct input", { - expect_no_error(prepare_input_regions( + expect_no_error(peakCombiner::prepare_input_regions( data = test_data )) }) test_that("Input data has at least 8 number of columns", { - expect_gt(length(colnames(test_data)), 8) + expect_equal(length(colnames(test_data)), 8) }) test_that("Column names of input data are identical with required once.", { expect_true(all(colnames_preloaded_df %in% names(test_data))) }) - - ### -----------------------------------------------------------------------### -### Test pre-loaded gRanges - - +### Test pre-loaded gRanges ### -----------------------------------------------------------------------### test_that("Input data has the right number of columns", { @@ -103,10 +67,6 @@ test_that("Input column 'end' is a class 'numeric'.", { expect_true(is.numeric(test_data$end)) }) -test_that("Input column 'name' is a class 'character'.", { - expect_true(is.character(test_data$name)) -}) - test_that("Input column 'score' is a class 'numeric'.", { expect_true(is.numeric(test_data$score)) }) @@ -123,16 +83,12 @@ test_that("Input column 'sample_name' is a class 'character'.", { expect_true(is.character(test_data$sample_name)) }) -test_that("Values in column 'name' contain the separater '|'", { - expect_true(sum(str_detect(test_data$name, "|")) > 0) -}) - ### -----------------------------------------------------------------------### ### Test output ### -----------------------------------------------------------------------### test_that("Output data frame has the correct structure.", { - expect_no_error(check_data_structure(test_data_prepared)) + expect_no_error(peakCombiner:::check_data_structure(test_data_prepared)) }) test_that("Column names of output data are identical with required once.", { @@ -180,11 +136,11 @@ test_that("Ouput column 'sample_name' is a class 'character'.", { }) test_that("The mean of all output centers.", { - expect_equal(mean(test_data_prepared$center), 1942.2535) + expect_equal(round(mean(test_data_prepared$center), 0), 2452) }) test_that("The number of rows in the output file.", { - expect_identical(nrow(test_data_prepared), as.integer(71)) + expect_identical(nrow(test_data_prepared), 52L) }) ### -----------------------------------------------------------------------### diff --git a/vignettes/peakCombiner.Rmd b/vignettes/peakCombiner.Rmd index d4fac11..2604411 100644 --- a/vignettes/peakCombiner.Rmd +++ b/vignettes/peakCombiner.Rmd @@ -6,7 +6,7 @@ bibliography: peakCombiner-refs.bib output: #word_document BiocStyle::html_document: - toc_float: true + toc_float: true vignette: > %\VignetteIndexEntry{peakCombiner} %\VignetteEngine{knitr::rmarkdown} @@ -15,19 +15,19 @@ vignette: > ```{r setup, include = FALSE} knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - fig.width = 7, - fig.height = 5, - out.width = "80%", - fig.align = "center", - crop = NULL # suppress "The magick package is required to crop" issue + collapse = TRUE, + comment = "#>", + fig.width = 7, + fig.height = 5, + out.width = "80%", + fig.align = "center", + crop = NULL # suppress "The magick package is required to crop" issue ) library(BiocStyle) ``` # Introduction -This vignette for the R package `peakCombiner` guides you through the preparation of all accepted input files and how to define the best parameters for your analysis. Our goal is to provide you with easy to understand, transparent and modular framework to create the consensus regions file you need to address your scientific question. +This vignette for the R package `r Biocpkg("peakCombiner")` guides you through the preparation of all accepted input files and how to define the best parameters for your analysis. Our goal is to provide you with an easy to understand, transparent and modular framework to create the consensus regions file you need to address your scientific question. ## Abstract Genome-wide epigenomic data sets like ChIP-seq and ATAC-seq typically use tools (e.g. MACS [@macs]) to identify genomic regions of interest, so-called peaks, usually for multiple sample replicates and across experimental conditions. Many downstream analyses require a consensus set of genomic regions relevant to the experiment, but current tools within the R ecosystem to easily and flexibly create combined peak sets from conditions and replicates are limited. Here, we describe `r Biocpkg("peakCombiner")`, a fully R-based, user-friendly, transparent, and flexible tool that allows even novice R users to create a high-quality consensus peak list. The modularity of its functions allows an easy way to filter and recenter input and output data. A broad range of accepted input data formats can be used as input to `r Biocpkg("peakCombiner")`, and the resulting consensus regions set can be exported to a file or used immediately as the starting point for most downstream peak analyses. @@ -44,13 +44,13 @@ The package `r Biocpkg("peakCombiner")` contains four main functions to curate a | [`filter_regions`][Filter regions] | Allows you to filter your genomic regions. | See in section [Filter regions]. | [`combine_regions`][Combine Regions] | Combine overlapping genomic regions to create a single set of consensus genomic regions. | See in section [Combine regions]. -Each `r Biocpkg("peakCombiner")` function has the parameter`show_messages` that by default prints feedback messages as functions run. If you plan to use `r Biocpkg("peakCombiner")` non-interactively in a workflow, you can silence these messages by setting `show_messages = FALSE`. Note that error messages will still be printed to help you to troubleshoot potential problems. +Each `r Biocpkg("peakCombiner")` function has the parameter `show_messages` that by default prints feedback messages as functions run. If you plan to use `r Biocpkg("peakCombiner")` non-interactively in a workflow, you can silence these messages by setting `show_messages = FALSE`. Note that error messages will still be printed to help you to troubleshoot potential problems. ## Standard genomic regions format The modularity of `r Biocpkg("peakCombiner")` is enabled by standardizing how the genomic regions and samples are represented in the input and output of all its functions. We chose to use a tibble because this allows you to easily plot and explore the data. The columns are described in the table below. Only ‘chrom’, ‘start’, ‘end’, ‘sample_name’ are required to use `r Biocpkg("peakCombiner")`, but you have more possibilities for [`center_expand_regions`][Center and expand regions] and [`filter_regions`][Filter regions] if you can provide the optional ‘score’ and ‘center’ columns. -If you can’t provide data for the optional columns, prepare_input_regions adds sensible defaults for you (see “Relevant Section”). -We recommend that you create this standard tibble using prepare_input_regions (see section “Prepare input data”), but you can also prepare it yourself (see section "[Load from pre-loaded tibble]") +If you can’t provide data for the optional columns, `prepare_input_regions` adds sensible defaults for you. +We recommend that you create this standard tibble using `prepare_input_regions` (see section “Prepare input data”), but you can also prepare it yourself (see section "[Load from pre-loaded tibble]") |Column name |Requirement |Content |Details @@ -68,14 +68,17 @@ We recommend that you create this standard tibble using prepare_input_regions (s `r Biocpkg("peakCombiner")` can be installed from Bioconductor using the `r CRANpkg("BiocManager")` package: ```{r, eval=FALSE} -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} BiocManager::install("peakCombiner") + +library("ggplot2") ``` -# A fast start for a complete run -Here you find a fast start with `r Biocpkg("peakCombiner")` and all function how we do recommend using it. +# An overview of a complete run +Here you find an overview of a complete run of the `r Biocpkg("peakCombiner")` recommended workflow. ```{r, eval=TRUE} devtools::load_all() #library("peakCombiner") @@ -87,16 +90,19 @@ syn_data_tibble data_prepared <- prepare_input_regions( data = syn_data_tibble, - show_messages = FALSE) + show_messages = FALSE +) data_center_expand <- center_expand_regions( data = data_prepared, - show_message = FALSE) + show_message = FALSE +) data_filtered <- filter_regions( data = data_center_expand, exclude_by_blacklist = "hg38", - show_messages = FALSE) + show_messages = FALSE +) consensus_peak <- combine_regions( data = data_filtered, @@ -105,7 +111,7 @@ consensus_peak <- combine_regions( ) consensus_final <- center_expand_regions( - data = consensus_peak, + data = consensus_peak, expand_by = 350, show_messages = FALSE ) @@ -113,25 +119,25 @@ consensus_final <- center_expand_regions( consensus_final ``` -Please not that the message occurring during the [`filter_regions`][Filter regions] is expected if chromosome names between input sample and blacklist are not absolutely identical. +Please note that the message occurring during the [`filter_regions`][Filter regions] is expected if chromosome names between input sample and blacklist are not absolutely identical. -* Export regions Finally we export the resulting consensus regions tibble and save it as BED-file. To establish a classical BED-file we remove here the column names. ```{r, eval=FALSE} -rtracklayer::export.bed(consensus_final, paste0(here::here(),"/lists/consensus_regions.bed"), format = "bed") +rtracklayer::export.bed(consensus_final, paste0(here::here(), "/lists/consensus_regions.bed"), format = "bed") ``` # Prepare input data -In this section, we explain how to prepare the accepted, standardized input files and how to add needed features to these files. `prepare_input_regions` is the mandatory first step to prepare your input data in the format needed for all of the following steps in `r Biocpkg("peakCombiner")` (For details see section "[Standard genomic regions format]"). This function accepts three types of input formats: +In this section, we explain how to prepare the accepted, standardized input files and how to add needed features to these files. `prepare_input_regions` is the mandatory first step to prepare your input data in the format needed for all of the following steps in `r Biocpkg("peakCombiner")` (for details see section "[Standard genomic regions format]"). This function accepts three types of input formats: + + a tibble of genomic regions and specifically named columns, + a GenomicRanges object of genomic regions and specifically named columns, or + a tibble (sample sheet) that specifies file paths of BED-like files for a set of samples (e.g. standard output files from peak callers like ‘narrowPeak’ files). -*Please note* that the function `prepare_input_regions` also has a filtering step, which automatically checks for genomic regions with the same values in the columns 'chrom', 'start', 'end' and 'sample_name' and filter for the strongest enriched summit (based on the 'score' values) per region. Additional summits are removed. Regions from peak callers can have multiple summits annotated per enriched genomic regions. This step is *not optional*. +*Please note* that the function `prepare_input_regions` also has a filtering step, which automatically checks for genomic regions with the same values in the columns 'chrom', 'start', 'end' and 'sample_name' and filters for the strongest enriched summit (based on the 'score' values) per region. Additional summits are removed. Regions from peak callers can have multiple summits annotated per enriched genomic regions. This step is *not optional*. -`prepare_input_regions` returns the data that you need for all your future work as a tibble having the structure as described in section "[Standard genomic regions format]". +`prepare_input_regions` returns the data that you need for all your future work as a tibble having the structure as described in the section "[Standard genomic regions format]". ## Quick start This is for you if you want to jump right in with some code. @@ -142,9 +148,9 @@ utils::data("syn_sample_sheet") syn_sample_sheet ``` -This is the expected structure of an sample_sheet. +This is the expected structure of a sample_sheet. -If you already have your BED-like region files loaded into your R session, you can alternatively provide them to `prepare_input_regions` as a single GRanges or tibble object with genomic regions and named columns (see section "[Standard genomic regions format]") that, importantly, include a unique sample identifier ('sample_name'). Here, we show as an example the GRanges object. +If you already have your BED-like region files loaded into your R session, you can alternatively provide them to `prepare_input_regions` as a single GRanges or tibble object with genomic regions and named columns (see section "[Standard genomic regions format]") that, importantly, include a unique sample identifier ('sample_name'). Here, we show as an example the tibble object. ```{r, eval=TRUE} utils::data("syn_data_control01") @@ -159,23 +165,25 @@ syn_data_treatment01 Let's combine the two tibbles. ```{r, eval=TRUE} -combined_input <- syn_data_control01 |> - dplyr::mutate(sample_name = "control-rep1") |> - rbind(syn_data_treatment01 |> - dplyr::mutate(sample_name = "treatment-rep1")) +combined_input <- syn_data_control01 |> + dplyr::mutate(sample_name = "control-rep1") |> + rbind(syn_data_treatment01 |> + dplyr::mutate(sample_name = "treatment-rep1")) -combined_input |> +combined_input |> dplyr::group_by(sample_name) |> dplyr::summarize(num_regions = dplyr::n()) ``` ```{r, eval=TRUE} -prepare_input_regions(data = combined_input, - show_messages = FALSE) +prepare_input_regions( + data = combined_input, + show_messages = FALSE +) ``` ## Load from sample sheet -We recommend that you use a sample sheet to prepare your data, especially if you have 'narrowPeak' or 'broadPeak' files. `prepare_input_regions` loads in the genomic regions file and formats them so that they are read to be used in peakCombiner’s other functions. The sample sheet has three required columns and one optional column, and is described as follows. +We recommend that you use a sample sheet to prepare your data, especially if you have 'narrowPeak' or 'broadPeak' files. `prepare_input_regions` loads in the genomic regions file and formats them so that they are ready to be used in peakCombiner’s other functions. The sample sheet has three required columns and one optional column, and is described as follows. |Column name |Requirement |Column content |Details |:-----------|:-----------|:-------------------|:------------------- @@ -190,12 +198,13 @@ The following code illustrates how you prepare an accepted sample sheet in R. First, let's get the paths to the peak files we want to use and save it. - ```{r, eval=TRUE} - file_names <- list.files( - paste0(here::here(),"/inst/syndata"), - pattern = ".narrowPeak$", - full.names = TRUE) - file_names + ```{r, eval=FALSE} +#file_names <- list.files( +# paste0(here::here(), "/inst/syndata"), +# pattern = ".narrowPeak$", +# full.names = TRUE +#) +file_names ``` + **Create a sample sheet.** @@ -203,14 +212,9 @@ The following code illustrates how you prepare an accepted sample sheet in R. Next, we create a tibble (named 'sample_sheet') with the correct column names ('sample_name', 'file_path', 'file_format', 'score_colname') to load in our data. ```{r, eval=TRUE} - sample_sheet <- tibble::tibble( - sample_name = c("control01","control02","control03","treatment01","treatment02","treatment03"), - file_path = file_names, - file_format = "narrowPeak", - score_colname = "qValue", - .rows = 6) - - sample_sheet +sample_sheet <- peakCombiner::syn_sample_sheet + +sample_sheet ``` With this step you create a new tibble containing all the required information to run `prepare_input_regions`. @@ -219,19 +223,19 @@ The following code illustrates how you prepare an accepted sample sheet in R. Now we use the prepared tibble (sample_sheet) and add it as argument `data` into the function `prepare_input_regions`. - ```{r, eval=TRUE} - prepare_input_regions( - data = sample_sheet, - show_messages = FALSE - ) + ```{r, eval = FALSE} +#prepare_input_regions( +# data = sample_sheet, +# show_messages = FALSE +#) ``` This returned value is a tibble that contains all required information formatted correctly in order to use the downstream functions within `r Biocpkg("peakCombiner")`. For more information about its structure, go back to the "[Standard genomic regions format]" section. ## Load from pre-loaded tibble -If you have already been working with your genomic regions from within an R session, you may have them pre-loaded as a tibble (this section) or as a GRanges object (next section: [Load from GenomicRanges object]). This could be either that per sample one tibble exists or that you ran `r Biocpkg("peakCombiner")` before on two data sets and now want to combine these. In this section we show you prepare your input data from pre-loaded tibbles. +If you have already been working with your genomic regions from within an R session, you may have them pre-loaded as a tibble (this section) or as a GRanges object (next section: [Load from GenomicRanges object]). This could be either that per sample one tibble exists or that you ran `r Biocpkg("peakCombiner")` before on two data sets and now want to combine these. In this section we show how you prepare your input data from pre-loaded tibbles. -When we start by loading in two of our synthetic data sets in 'narrowPeak' format, a common format generated by peak callers like MACS. Most BED-like files (including 'narrowPeak' files) don’t include column names in the file so you will have to name them yourself using the standard naming conventions from as described by UCSC for [BED](https://genome.ucsc.edu/FAQ/FAQformat.html#format1) or [narrowPeak](https://genome.ucsc.edu/FAQ/FAQformat.html#format12) or [broadPeak](https://genome.ucsc.edu/FAQ/FAQformat.html#format13). As we know what to expect in each column we can name the columns correctly: 'chrom' (X1), 'start' (X2), 'end' (X3), 'name' (X4), 'score' (X5) and the 'strand' (X6). To be clear, the names that you define for the columns are used by `prepare_input_regions` to property format the data. +We start by loading in two of our synthetic data sets in 'narrowPeak' format, a common format generated by peak callers like MACS. Most BED-like files (including 'narrowPeak' files) don’t include column names in the file so you will have to name them yourself using the standard naming conventions as described by UCSC for [BED](https://genome.ucsc.edu/FAQ/FAQformat.html#format1) or [narrowPeak](https://genome.ucsc.edu/FAQ/FAQformat.html#format12) or [broadPeak](https://genome.ucsc.edu/FAQ/FAQformat.html#format13). As we know what to expect in each column we can name the columns correctly: 'chrom' (X1), 'start' (X2), 'end' (X3), 'name' (X4), 'score' (X5) and the 'strand' (X6). To be clear, the names that you define for the columns are used by `prepare_input_regions` to property format the data. Columns named 'chrom', 'start', 'end', 'name', 'score', 'strand', 'center' and 'sample_name' are maintained. *Please note* that within `r Biocpkg("peakCombiner")` we call columns with relative information about the peak summit 'summit' and with absolute values 'center'. So, if a column named 'summit' is provided, it is used to populate a column named 'center'. All other columns are dropped at the end of `prepare_input_regions`. @@ -241,15 +245,15 @@ Columns named 'chrom', 'start', 'end', 'name', 'score', 'strand', 'center' and ' Now let's load the first narrowPeak file. Note that the columns are named already correctly and we expect this from your data as well. ```{r, eval=TRUE} - utils::data(syn_control_rep1_narrowPeak) - syn_control_rep1_narrowPeak +utils::data(syn_control_rep1_narrowPeak) +syn_control_rep1_narrowPeak ``` And the second file. ```{r, eval=TRUE} - utils::data(syn_treatment_rep1_narrowPeak) - syn_treatment_rep1_narrowPeak +utils::data(syn_treatment_rep1_narrowPeak) +syn_treatment_rep1_narrowPeak ``` + **Add column 'sample_name'** @@ -257,15 +261,15 @@ Columns named 'chrom', 'start', 'end', 'name', 'score', 'strand', 'center' and ' Now we add a column named 'sample_name' to each of our tibbles. ```{r, eval=TRUE} - control <- syn_control_rep1_narrowPeak |> - dplyr::mutate(sample_name = "control-rep1") - control +control <- syn_control_rep1_narrowPeak |> + dplyr::mutate(sample_name = "control-rep1") +control ``` ```{r, eval=TRUE} - treatment <- syn_treatment_rep1_narrowPeak |> - dplyr::mutate(sample_name = "treatment-rep1") - treatment +treatment <- syn_treatment_rep1_narrowPeak |> + dplyr::mutate(sample_name = "treatment-rep1") +treatment ``` + **Combine multiple tibbles** @@ -273,16 +277,17 @@ Columns named 'chrom', 'start', 'end', 'name', 'score', 'strand', 'center' and ' Finally, combine the multiple input tibbles into one. ```{r, eval=TRUE} - combined_input <- control |> rbind(treatment) - combined_input +combined_input <- control |> + rbind(treatment) +combined_input ``` And check how many rows we have now for each sample. ```{r, eval=TRUE} - combined_input |> - dplyr::group_by(sample_name) |> - dplyr::count(name = "number_of_entries") +combined_input |> + dplyr::group_by(sample_name) |> + dplyr::count(name = "number_of_entries") ``` Both 'sample_name's are found, so we know that we have successfully combined the data sets. @@ -292,13 +297,13 @@ Columns named 'chrom', 'start', 'end', 'name', 'score', 'strand', 'center' and ' After preparing the pre-loaded tibble, we run the function `prepare_input_regions` and use the tibble in the parameter `data`. ```{r, eval=TRUE} - prepare_input_regions( - data = combined_input, - show_messages = FALSE - ) +prepare_input_regions( + data = combined_input, + show_messages = FALSE +) ``` -The output tibble from prepare_input_data can now be used for your next steps with `r Biocpkg("peakCombiner")`. For details about the accepted file structure see section "[Standard genomic regions format]". +The output tibble from `prepare_input_data` can now be used for your next steps with `r Biocpkg("peakCombiner")`. For details about the accepted file structure see section "[Standard genomic regions format]". ## Load from GenomicRanges object In memory GenomicRanges object listing the genomic regions in a sample. This object is very similar to the tibble above, except that `chrom`, `start`, and `end` are instead described using the `GenomicRanges` nomenclature (See [here](https://bioconductor.org/packages/devel/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesIntroduction.html) for details ). @@ -308,30 +313,33 @@ In memory GenomicRanges object listing the genomic regions in a sample. This obj As first step we load the provided synthetic data originating from a GenomicRanges object. ```{r, eval=TRUE} - utils::data("syn_data_granges") - syn_data_granges +utils::data("syn_data_granges") +syn_data_granges ``` - The column names are based on its orginal GenomicRanges file format. This allows us to easily transform it into a GenomicRanges object. Note that normally we expect you to have the GenomicObject pre-loaded and want to use the `r Biocpkg("peakCombiner")` on this data set. For the purpose of showing you how a accepted GenomicRanges object has be structured we transform it here. + The column names are based on its original GRanges file format. This allows us to easily transform it into a GRanges object. Note that normally we expect you to have the GRanges object pre-loaded and want to use the `r Biocpkg("peakCombiner")` on this data set. For the purpose of showing you how a accepted GRanges object has to be structured we transform it here. ```{r, eval=TRUE} - GenomicRanges_data <- GenomicRanges::makeGRangesFromDataFrame(syn_data_granges, keep.extra.columns = TRUE) - GenomicRanges_data +GenomicRanges_data <- GenomicRanges::makeGRangesFromDataFrame( + syn_data_granges, + keep.extra.columns = TRUE +) +GenomicRanges_data ``` + **Prepare input from GenomicRanges object** - You can simply use your GenomicRanges object in the parameter ‘data’ and load it in. The output tibble from prepare_input_data can now be used for your next steps with `r Biocpkg("peakCombiner")`. For details about the accepted file structure see section "[Standard genomic regions format]". + You can simply use your GRanges object in the parameter ‘data’ and load it in. The output tibble from `prepare_input_data` can now be used for your next steps with `r Biocpkg("peakCombiner")`. For details about the accepted file structure see section "[Standard genomic regions format]". ```{r, eval=TRUE} - prepare_input_regions( - data = GenomicRanges_data, - show_messages = FALSE - ) +prepare_input_regions( + data = GenomicRanges_data, + show_messages = FALSE +) ``` ## Explained in detail -We recommend to use the function `prepare_input_regions` to prepare the input data in the format needed for all of the following steps in `r Biocpkg("peakCombiner")`. In theory you can also manually provided an expected input data when preparing your data following the descriptions in the section "[Standard genomic regions format]". +We recommend to use the function `prepare_input_regions` to prepare the input data in the format needed for all of the following steps in `r Biocpkg("peakCombiner")`. In theory you can also manually provide an expected input data when preparing your data following the descriptions in the section "[Standard genomic regions format]". ### Load from BED file @@ -340,14 +348,14 @@ Unlike 'narrowPeak' files, BED files typically do not include columns for summit Lets load a BED file. ```{r, eval=TRUE} -utils::data(syn_data_bed) +utils::data("syn_data_bed") syn_data_bed |> dplyr::arrange(sample_name) ``` When we pull the 'sample_name' column we see the different number of entries for each sample name. ```{r, eval=TRUE} -syn_data_bed |> +syn_data_bed |> dplyr::group_by(sample_name) |> dplyr::summarize(num_regions = dplyr::n()) ``` @@ -358,7 +366,7 @@ And now we use it as input for `prepare_input_regions`. prepare_input_regions( data = syn_data_bed, show_messages = TRUE - ) +) ``` Please note here that the information messages are informing you about all missing values and with which default values these columns are populated. The 'score' is set to 0 as no information can be obtained from a classical BED file about enrichment values. The column 'strand' is populated with the value '.', representing that no strand information is known. The 'center' is calculated based on the arithmetical midpoint of each region as no 'summit' input column was found. The resulting tibble can be used with all functions ([`center_expand_regions`][Center and expand regions], [`filter_regions`][Filter regions], [`combine_regions`][Combine Regions]) of the package but certain option are limited due to the missing information in the input. @@ -380,7 +388,8 @@ The quickest way to get started is to call `center_expand_regions` using just th ```{r, eval=TRUE} center_expand_regions( data = data_prepared, - show_message = FALSE) + show_message = FALSE +) ``` The tibble you obtained has altered coordinates for the genomic region. The center of the region is defined as the value found in the column 'center' and the expansion is based on the median region size of all input regions. @@ -402,12 +411,12 @@ The expected input is the standard tibble as described previously (See section " + **Center & expanding the regions** ```{r, eval=TRUE} - center_expand_regions( - data = data_prepared, - center_by = "center_column", - expand_by = NULL, - show_messages = TRUE - ) +center_expand_regions( + data = data_prepared, + center_by = "center_column", + expand_by = NULL, + show_messages = TRUE +) ``` You can appreciate that values for 'start' and 'end' are changed, while the number of input regions stays the same. @@ -418,7 +427,7 @@ The expected input is the standard tibble as described previously (See section " ```{r, eval=TRUE} center_expand_regions( - data = data_prepared, + data = data_prepared, center_by = "center_column", expand_by = c(500), show_messages = FALSE @@ -427,7 +436,7 @@ center_expand_regions( ```{r, eval=TRUE} center_expand_regions( - data = data_prepared, + data = data_prepared, center_by = "center_column", expand_by = c(100, 1000), show_messages = FALSE @@ -440,7 +449,7 @@ In all cases, the input regions are altered. If you are unsure how to define thi In this section we want to go into more detail to help you define the ideal parameters for the function `center_expand_regions`. The expected input data structure we described early in this section "[Standard genomic regions format]". -After the preparation of the input data, the first step, we recommend to do is to center and expand the genomic regions by using `center_expand_regions`. It is useful if you want all of your peaks to be the same size for your downstream analyses or if you want to use the information about the maximum enrichment in each reach (often called 'summit') (stored in the column 'center'), normally obtained by some peak callers (e.g., MACS2). The function allows you to automatically center your regions of interest on these summits to capture information about the most important part within a genomic region (e.g., TF-binding site or highest peak). +After the preparation of the input data, the first step, we recommend to center and expand the genomic regions by using `center_expand_regions`. It is useful if you want all of your peaks to be the same size for your downstream analyses or if you want to use the information about the maximum enrichment in each reach (often called 'summit') (stored in the column 'center'), normally obtained by some peak callers (e.g., MACS2). The function allows you to automatically center your regions of interest on these summits to capture information about the most important part within a genomic region (e.g., TF-binding site or highest peak). There are two concepts that are relevant to understand the function `center_expand_regions`: @@ -482,7 +491,8 @@ As a quick first example, you can easily exclude blacklisted hg38 regions as fol data_filtered <- filter_regions( data = data_center_expand, exclude_by_blacklist = "hg38", - show_messages = FALSE) + show_messages = FALSE +) data_filtered ``` @@ -508,11 +518,12 @@ filter_regions( exclude_by_blacklist = "hg38", include_above_score_cutoff = 2.5, include_top_n_scoring = 6, - show_messages = TRUE) - + show_messages = TRUE +) ``` The filtering occurred in the order of the parameters and can be described as following: + 1. The data set is filtered to only include chromosomes with the names 'chr1', 'chr2' and 'chr4', 2. Any remaining regions overlapping with blacklisted human regions are removed, 3. Regions having a ‘score’ above ‘2.5’ are retained, and @@ -536,24 +547,24 @@ In this subsection, we provide an example to identify “canonical” human chro First we extract all chromosome names from the input data. ```{r, eval=TRUE} - input_chrom <- - data_center_expand |> - dplyr::select(chrom) |> - unique() - input_chrom +input_chrom <- + data_center_expand |> + dplyr::select(chrom) |> + unique() +input_chrom ``` Here we see that in this data set we have some unexpected values for chromosome names like "chr4 2", "chr4|2" or "chr42". Let’s modify this vector to only keep what we consider to be the "canonical" chromosomes. In real world human data sets, you may find names like "chr11_KI270721v1_random" or "chrUn_GL000195v1" that you might want to remove for your downstream analyses To do so, the next step is to filter with regular expressions to maintain only wanted chromosome names. ```{r, eval=TRUE} - include_chrom <- input_chrom |> - dplyr::filter(grepl("^chr[0-9]$|^chr[1-2][0-9]$|^chr[XYM]", chrom)) |> - dplyr::pull(chrom) |> - unique() |> - sort() - - include_chrom +include_chrom <- input_chrom |> + dplyr::filter(grepl("^chr[0-9]$|^chr[1-2][0-9]$|^chr[XYM]", chrom)) |> + dplyr::pull(chrom) |> + unique() |> + sort() + +include_chrom ``` Finally, we can use this vector of good names in filter_regions for the parameter `include_by_chromosome_name` @@ -565,7 +576,8 @@ data_filtered_chr <- filter_regions( exclude_by_blacklist = NULL, include_above_score_cutoff = NULL, include_top_n_scoring = NULL, - show_messages = FALSE) + show_messages = FALSE +) data_filtered_chr ``` @@ -589,27 +601,28 @@ data_filtered_chr |> It is often recommended to exclude blacklisted genomic regions from your analyses, and the `exclude_by_blacklist` parameter allows you to do that easily. Genomic regions that overlap a blacklisted region with at least 1 base are removed. We provide blacklisted regions from ENCODE for human ([hg38](https://www.encodeproject.org/files/ENCFF356LFX/)) and mouse ([mm10](https://www.encodeproject.org/files/ENCFF547MET/)). The blacklists were manually curated by ENCODE by combing several published blacklisted regions. -While we recommend to use the provided ENCODE blacklists, you can alternatively provide your own blacklist as a tibble containing genomic regions with columns named 'chrom', 'start' and 'end'. In more general terms, you can use any file with genomic locations here to remove overlapping regions, for instance if you want to remove regions oberlapping with promoters you can use this functions here to remove these. +While we recommend to use the provided ENCODE blacklists, you can alternatively provide your own blacklist as a tibble containing genomic regions with columns named 'chrom', 'start' and 'end'. In more general terms, you can use any file with genomic locations here to remove overlapping regions, for instance if you want to remove regions overlapping with promoters you can use this functions here to remove these. -If you would like to remove blacklisted regions from multiple sources, you can run `filter_regions` repeatedly times, for example first to remove the ENCODE blacklisted regions and then to remove your own blacklisted sites. We show how to do this below. +If you would like to remove blacklisted regions from multiple sources, you can run `filter_regions` repeatedly, for example first to remove the ENCODE blacklisted regions and then to remove your own blacklisted sites. We show how to do this below. #### Filter for enrichment value Setting this parameter allows you to retain any peak that has a bigger score than the `include_above_score_cutoff` value based on the values in the column 'score'. Recall that [`prepare_input_regions`][Prepare input data] defines the 'score' by default if possible (e.g., it takes the value of -log10(FDR) using a ‘narrowPeak’ file from MACS2 as input). As scores might differ between experiments, we recommend that you look at the distribution of the values in 'score' to help you choose the best threshold value for `include_above_score_cutoff` for your data set. -Here is some code to do that on our small, fake dataset. The distribution you see will look different. +Here is some code to do that on our small, synthetic dataset. The distribution you see will look different. ```{r, eval=TRUE} data_center_expand |> -ggplot2::ggplot(ggplot2::aes(x = score)) + - ggplot2::geom_histogram(bins = 10) + ggplot2::ggplot(ggplot2::aes(x = score)) + + ggplot2::geom_histogram(binwidth = 10) + + ggplot2::xlim(0,NA) ``` -See that we have very few sites with low values. We use a cutoff of 2.5 to remove lowly enriched sites and apply this value to the filtering step. +See that we have very few sites with low values. We use a cutoff of 35 to remove lowly enriched sites and apply this value to the filtering step. ```{r, eval=TRUE} data_filtered_cutoff <- filter_regions( data = data_center_expand, - include_above_score_cutoff = 2.5, + include_above_score_cutoff = 35, show_messages = FALSE ) ``` @@ -624,36 +637,44 @@ dim(data_filtered_cutoff) When we check the range of the values in 'score' columns we see the effects of the filtering. ```{r, eval=TRUE} -range(data_center_expand |> - dplyr::pull(score)) -range(data_filtered_cutoff |> - dplyr::pull(score)) +range(data_center_expand |> + dplyr::pull(score)) +range(data_filtered_cutoff |> + dplyr::pull(score)) ``` -Again, we see that sites with 'score' 2.5 and below are removed. +Again, we see that sites with 'score' 35 and below are removed. + +```{r, eval=TRUE} +data_filtered_cutoff |> + ggplot2::ggplot(ggplot2::aes(x = score)) + + ggplot2::geom_histogram(binwidth = 10) + + ggplot2::xlim(0, NA) +``` #### Select a defined number of regions -You can also select a fixed number of highest scoring regions per sample to extract the top enriched sites. An information message is shown if any sample does not have the required number of regions left in your input data. If your 'score' values vary widely between samples you may select widely different numbers of regions using the `include_above_cutoff`. In this case, using this approach will help you select the similar numbers of regions for each sample. The exact same number of regions may not be selected for each sample because sometimes multiple genomic regions may have the same 'score' value. In this case, all of the tied genomic regions are retained. +You can also select a fixed number of highest scoring regions per sample to extract the top enriched sites. An information message is shown if any sample does not have the required number of regions left in your input data. If your 'score' values vary widely between samples you may select widely different numbers of regions using the `include_above_cutoff`. In this case, using this approach will help you select similar numbers of regions for each sample. The exact same number of regions may not be selected for each sample because sometimes multiple genomic regions may have the same 'score' value. In this case, all of the tied genomic regions are retained. ```{r, eval=TRUE} data_center_expand |> - dplyr::group_by(sample_name) |> + dplyr::group_by(sample_name) |> dplyr::summarize(num_regions = dplyr::n()) filter_regions( data = data_center_expand, include_top_n_scoring = 8, - show_messages = FALSE) |> - dplyr::group_by(sample_name) |> + show_messages = FALSE +) |> + dplyr::group_by(sample_name) |> dplyr::summarize(num_regions = dplyr::n()) ``` -We requested that only the top 8 genomic regions would be retained for each sample, and we can see in the information messages that `r Biocpkg("peakCombiner")` that one samples (‘control03’) contains less then the required 8 site. For the remaining samples, we select the expected 8 regions per sample. +We requested that only the top 8 genomic regions would be retained for each sample, and we can see in the information messages that `r Biocpkg("peakCombiner")` that one sample (‘control03’) contains less then the required 8 sites. For the remaining samples, we select the expected 8 regions per sample. # Combine regions After loading and preparing the regions from your samples, you can use them to build your set of consensus regions using `combine_regions`. -The most important parameter that affects the final set of consensus regions is 'found_in_samples', which allows you to retain a genomic region based on how many samples it was found in (Counting the unique entries in 'sample_name'). It is up to you to define this parameter based on your input data and analysis goals. The default is '2' samples, as we assume that each condition has three biological replicates and most users would like to focus on regions to be found in at lest two of these replicates. See the explained in detail section "[Define parameter to best combine regions]" for more considerations on setting this parameter. +The most important parameter that affects the final set of consensus regions is 'found_in_samples', which allows you to retain a genomic region based on how many samples it was found in (counting the unique entries in 'sample_name'). It is up to you to define this parameter based on your input data and analysis goals. The default is '2' samples, as we assume that each condition has three biological replicates and most users would like to focus on regions to be found in at lest two of these replicates. See the explained in detail section "[Define parameter to best combine regions]" for more considerations on setting this parameter. -The other parameters allow you to configure how the new consensus regions are annotated. For example, you can define how the 'center' column is populated ('combined_center') and what new 'sample_name' should be used for the new consensus regions ('combined_sample_name'). Based on the selected value for the parameter 'combined_center', the new column 'score' is filled (For details see section [Which center to select for the consensus regions?]). If you want to trace which samples and genomic regions contributed to a new consensus region, you can use 'annotate_with_input_names' to create a new column ‘input_name’ describing links to the input genomic regions. +The other parameters allow you to configure how the new consensus regions are annotated. For example, you can define how the 'center' column is populated ('combined_center') and what new 'sample_name' should be used for the new consensus regions ('combined_sample_name'). Based on the selected value for the parameter 'combined_center', the new column 'score' is filled (for details see section [Which center to select for the consensus regions?]). If you want to trace which samples and genomic regions contributed to a new consensus region, you can use 'annotate_with_input_names' to create a new column ‘input_name’ describing links to the input genomic regions. ## Quick start Following is a quick example that uses default parameters. @@ -663,11 +684,11 @@ combine_regions( data = data_filtered, combined_sample_name = "my_new_sample_name", show_message = FALSE -) +) ``` ## Run to combine regions -We run the function `combine_regions` and give the value 'consensus' to the parameter `combined_sample_name` and we want to have the link to our input regions by setting `annotattion_with_input_name` to 'TRUE'. +We run the function `combine_regions` and give the value 'consensus' to the parameter `combined_sample_name` and we want to have the link to our input regions by setting `annotate_with_input_name` to 'TRUE'. In brief, a description what the parameter you can define do. For more details please see below section "[Define parameter to best combine regions]". @@ -695,22 +716,22 @@ Here we briefly describe what is happening under the hood. + **Binning and counting of each genomic input region:** - First step is to apply the GenomicRanges function `disjoin` to split up peaks depending on the number of input samples contributing. This numbers are counted to get the coverage of each genomic regions. These counts are then used for filtering using the parameter `found_in_samples`, which is defined by the user. If `found_in_samples` is a fraction between 0 and 1, then only include genomic regions that are found in at least this fraction of input samples. Default value is 2. *Please note* if `found_in_samples` is set to 1 all genomic regions covered in at least one sample are included, this makes this function to simply merge all input regions. + First step is to apply the GenomicRanges function `disjoin` to split up peaks depending on the number of input samples contributing. These numbers are counted to get the coverage of each genomic region. These counts are then used for filtering using the parameter `found_in_samples`, which is defined by the user. If `found_in_samples` is a fraction between 0 and 1, then only include genomic regions that are found in at least this fraction of input samples. Default value is 2. *Please note* if `found_in_samples` is set to 1 all genomic regions covered in at least one sample are included, this makes this function to simply merge all input regions. + **Recombine separated genomic regions:** - As next step we perform `reduce`, another GRangers operation, to combine regions adjesent to each other. If regions are separated by at least a single nucelotide, we keep them separatly. + As next step we perform `reduce`, another GRanges operation, to combine regions adjacent to each other. If regions are separated by at least a single nucelotide, we keep them separately. + **Overlap with summits:** - This is a quality control function for the resulting genomic regions specifically developed for `peakCombiner` package. To remove potenial false posotive sites, we designed the function `combine_regions` to check for an overlap between newly combined regions and input summits. If at least one overlap is found, the new region will be maintained, othrwise it is removed. + This is a quality control function for the resulting genomic regions specifically developed for `r Biocpkg("peakCombiner")`. To remove potential false positive sites, we designed the function `combine_regions` to check for an overlap between newly combined regions and input summits. If at least one overlap is found, the new region will be maintained, otherwise it is removed. + **Restore data structure**: - In this final step, we restore the accepted data structure for working with `r Biocpkg("peakCombiner")` functions ensure the modularity of the package. This allows a user to apply the functions `center_expand_resgions` and `filter_resgions` to the combined consensus region file. Therefore, 'center', 'score' and 'sample_name' columns are created and populated based on the user defined parameters. + In this final step, we restore the accepted data structure for working with `r Biocpkg("peakCombiner")` functions to ensure the modularity of the package. This allows a user to apply the functions `center_expand_resgions` and `filter_resgions` to the combined consensus region file. Therefore, 'center', 'score' and 'sample_name' columns are created and populated based on the user defined parameters. ### Define parameter to best combine regions -Here we explain the parameter you can set for the function `combine_regions` and the defaults we recommend. +Here we explain the parameters you can set for the function `combine_regions` and the defaults we recommend. #### In how many samples should a region be found? @@ -729,7 +750,7 @@ combine_regions( found_in_samples = 2, show_messages = FALSE, combined_sample_name = "found_in_samples_2_example" -) +) ``` If the parameter `found_in_samples` is set to '1', this function basically merges all input regions. @@ -792,3 +813,5 @@ This vignette was built using: ```{r, session, eval=TRUE} sessionInfo() ``` + +# References