From c1f0142626733e5c3af090a820fad91d7c330372 Mon Sep 17 00:00:00 2001
From: HDash <16350928+HDash@users.noreply.github.com>
Date: Tue, 12 Nov 2024 13:42:44 +0000
Subject: [PATCH] Correct terminology (de-novo motif discovery -> motif
discovery)
---
DESCRIPTION | 2 +-
NAMESPACE | 1 +
NEWS.md | 9 +++++
R/MotifPeeker.R | 42 +++++++++++----------
R/denovo_motifs.R | 16 ++++----
R/find_motifs.R | 2 +-
R/motif_similarity.R | 2 +-
README.Rmd | 18 ++++-----
README.md | 48 ++++++++++++------------
inst/markdown/MotifPeeker.Rmd | 32 ++++++++--------
man/MotifPeeker.Rd | 36 +++++++++---------
man/denovo_motifs.Rd | 14 +++----
man/find_motifs.Rd | 2 +-
man/get_df_distances.Rd | 2 +-
man/get_df_enrichment.Rd | 2 +-
man/motif_similarity.Rd | 2 +-
tests/testthat/test-MotifPeeker.R | 12 +++---
tests/testthat/test-denovo_motif_funcs.R | 4 +-
vignettes/MotifPeeker.Rmd | 16 ++++----
vignettes/troubleshooting.Rmd | 2 +-
20 files changed, 138 insertions(+), 126 deletions(-)
diff --git a/DESCRIPTION b/DESCRIPTION
index 87df787..a915f90 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: MotifPeeker
Title: Benchmarking Epigenomic Profiling Methods Using Motif Enrichment
-Version: 0.99.10
+Version: 0.99.11
Authors@R: c(
person(given = "Hiranyamaya",
family = "Dash",
diff --git a/NAMESPACE b/NAMESPACE
index 8006ca7..f0be77e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -73,6 +73,7 @@ importFrom(universalmotif,read_meme)
importFrom(universalmotif,read_transfac)
importFrom(universalmotif,read_uniprobe)
importFrom(utils,capture.output)
+importFrom(utils,packageVersion)
importFrom(utils,read.table)
importFrom(utils,write.table)
importFrom(viridis,scale_color_viridis)
diff --git a/NEWS.md b/NEWS.md
index 951aaba..dbc69a1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,12 @@
+# MotifPeeker 0.99.11
+
+## Miscellaneous
+
+* Correct "de-novo motif discovery" term to "motif discovery". STREME does not
+perform de-novo motif discovery.
+* Add package version to report header.
+
+
# MotifPeeker 0.99.9 / 0.99.10
## Bug Fixes
diff --git a/R/MotifPeeker.R b/R/MotifPeeker.R
index 07e063f..f139eba 100644
--- a/R/MotifPeeker.R
+++ b/R/MotifPeeker.R
@@ -4,10 +4,10 @@
#' as the key metric. The output is an easy-to-interpret HTML document with the
#' results. The report contains three main sections: (1) General Metrics on peak
#' and alignment files (if provided), (2) Known Motif Enrichment Analysis and
-#' (3) De-novo Motif Enrichment Analysis.
+#' (3) Discovered Motif Enrichment Analysis.
#'
#' Runtime guidance: For 4 datasets, the runtime is approximately 3 minutes with
-#' denovo_motif_discovery disabled. However, de-novo motif discovery can take
+#' motif_discovery disabled. However, motif discovery can take
#' hours to complete. To make computation faster, we highly recommend tuning the
#' following arguments:
#' \describe{
@@ -15,13 +15,13 @@
#' parallel can significantly reduce runtime, but it is very
#' memory-intensive, consuming 10+GB of RAM per thread. Memory starvation can
#' greatly slow the process, so set the number of cores with caution.}
-#' \item{\code{denovo_motifs}}{The number of motifs to discover per sequence
-#' group exponentially increases runtime. We recommend no more than 5
-#' motifs to make a meaningful inference.}
-#' \item{\code{trim_seq_width}}{Trimming sequences before running de-novo
+#' \item{\code{motif_discovery_count}}{The number of motifs to discover per
+#' sequence group exponentially increases runtime. We recommend no more than
+#' 5 motifs to make a meaningful inference.}
+#' \item{\code{trim_seq_width}}{Trimming sequences before running
#' motif discovery can significantly reduce the search space. Sequence
#' length can exponentially increase runtime. We recommend running the
-#' script with \code{denovo_motif_discovery = FALSE} and studying the
+#' script with \code{motif_discovery = FALSE} and studying the
#' motif-summit distance distribution under general metrics to find the
#' sequence length that captures most motifs. A good starting point is 150
#' but it can be reduced further if appropriate.}
@@ -69,8 +69,10 @@
#' labels.
#' @param cell_counts An integer vector of experiment cell counts for each peak
#' file. (optional) Creates additional comparisons based on cell counts.
-#' @param denovo_motif_discovery A logical indicating whether to perform
-#' de-novo motif discovery for the third section of the report. (default = TRUE)
+#' @param motif_discovery A logical indicating whether to perform
+#' motif discovery for the third section of the report. (default = TRUE)
+#' @param motif_discovery_count An integer specifying the number of motifs to
+#' discover. (default = 3) Note that higher values take longer to compute.
#' @param download_buttons A logical indicating whether to include download
#' buttons for various files within the HTML report. (default = TRUE)
#' @param out_dir A character string specifying the directory to save the
@@ -97,7 +99,7 @@
#' with \code{BPPARAM = BiocParallel::MulticoreParam()}.
#' }
#' \strong{IMPORTANT:} For each worker, please ensure a minimum of 8GB of
-#' memory (RAM) is available as \code{denovo_motif_discovery} is
+#' memory (RAM) is available as \code{motif_discovery} is
#' memory-intensive.
#' @param quiet A logical indicating whether to print markdown knit messages.
#' (default = FALSE)
@@ -124,7 +126,7 @@
#'
#' @return Path to the output directory.
#'
-#' @note Running de-novo motif discovery is computationally expensive and can
+#' @note Running motif discovery is computationally expensive and can
-#' require from minutes to hours. \code{denovo_motifs} can widely affect the
+#' require minutes to hours. \code{motif_discovery_count} can widely affect the
#' runtime (higher values take longer). Setting \code{trim_seq_width} to a lower
#' value can also reduce the runtime significantly.
@@ -162,8 +164,8 @@
#' motif_files = motifs,
#' motif_labels = NULL,
#' cell_counts = NULL,
-#' denovo_motif_discovery = TRUE,
-#' denovo_motifs = 1,
+#' motif_discovery = TRUE,
+#' motif_discovery_count = 1,
#' motif_db = NULL,
#' download_buttons = TRUE,
#' out_dir = tempdir(),
@@ -184,8 +186,8 @@ MotifPeeker <- function(
motif_files = NULL,
motif_labels = NULL,
cell_counts = NULL,
- denovo_motif_discovery = TRUE,
- denovo_motifs = 3,
+ motif_discovery = TRUE,
+ motif_discovery_count = 3,
filter_n = 6,
trim_seq_width = NULL,
motif_db = NULL,
@@ -223,9 +225,9 @@ MotifPeeker <- function(
"equal to ", shQuote("peak_files"), ".")
stop(stp_msg)
}
- if (denovo_motif_discovery &&
- (is.null(denovo_motifs) || denovo_motifs < 1)) {
- stp_msg <- "Number of de-novo motifs to find must be greater than 0."
+ if (motif_discovery &&
+ (is.null(motif_discovery_count) || motif_discovery_count < 1)) {
+ stp_msg <- "Number of motifs to discover must be greater than 0."
stop(stp_msg)
}
@@ -266,8 +268,8 @@ MotifPeeker <- function(
motif_files = motif_files,
motif_labels = motif_labels,
cell_counts = cell_counts,
- denovo_motif_discovery = denovo_motif_discovery,
- denovo_motifs = denovo_motifs,
+ motif_discovery = motif_discovery,
+ discover_motifs = motif_discovery_count,
filter_n = filter_n,
motif_db = motif_db,
trim_seq_width = trim_seq_width,
diff --git a/R/denovo_motifs.R b/R/denovo_motifs.R
index 5cfc998..54f76be 100644
--- a/R/denovo_motifs.R
+++ b/R/denovo_motifs.R
@@ -1,6 +1,6 @@
-#' Find de-novo motifs in sequences
+#' Discover motifs in sequences
#'
-#' Use STREME from MEME suite to find de-novo motifs in the provided sequences.
+#' Use STREME from the MEME suite to find motifs in the provided sequences.
#' To speed up the process, the sequences can be optionally trimmed to reduce
#' the search space. The result is then optionally filtered to remove motifs
#' with a high number of nucleotide repeats
@@ -9,19 +9,19 @@
#' sequences to search for motifs.
#' @param trim_seq_width An integer specifying the width of the sequence to
#' extract around the summit (default = NULL). This sequence is used to search
-#' for de novo motifs. If not provided, the entire peak region will be used.
+#' for motifs. If not provided, the entire peak region will be used.
#' This parameter is intended to reduce the search space and speed up motif
#' discovery; therefore, a value less than the average peak width is
#' recommended. Peaks are trimmed symmetrically around the summit while
#' respecting the peak bounds.
-#' @param denovo_motifs An integer specifying the number of de-novo motifs to
+#' @param discover_motifs_count An integer specifying the number of motifs to
#' discover. (default = 3) Note that higher values take longer to compute.
#' @param minw An integer specifying the minimum width of the motif.
#' (default = 8)
#' @param maxw An integer specifying the maximum width of the motif.
#' (default = 25)
#' @param filter_n An integer specifying the number of consecutive nucleotide
-#' repeats a de-novo discovered motif must contain to be filtered out.
+#' repeats a discovered motif must contain to be filtered out.
#' (default = 6)
#' @param out_dir A \code{character} vector of output directory to save STREME
#' results to. (default = \code{tempdir()})
@@ -47,7 +47,7 @@
#' res <- denovo_motifs(list(CTCF_TIP_peaks),
#' trim_seq_width = 50,
#' genome_build = genome_build,
-#' denovo_motifs = 1,
+#' discover_motifs_count = 1,
#' filter_n = 6,
#' minw = 8,
#' maxw = 8,
@@ -59,7 +59,7 @@
denovo_motifs <- function(seqs,
trim_seq_width,
genome_build,
- denovo_motifs = 3,
+ discover_motifs_count = 3,
minw = 8,
maxw = 25,
filter_n = 6,
@@ -93,7 +93,7 @@ denovo_motifs <- function(seqs,
silent = !debug,
minw = 8,
maxw = 25,
- nmotifs = denovo_motifs,
+ nmotifs = discover_motifs_count,
meme_path = meme_path,
...
)
diff --git a/R/find_motifs.R b/R/find_motifs.R
index 211b764..6912067 100644
--- a/R/find_motifs.R
+++ b/R/find_motifs.R
@@ -31,7 +31,7 @@
#' res <- denovo_motifs(list(CTCF_TIP_peaks),
#' trim_seq_width = 50,
#' genome_build = genome_build,
-#' denovo_motifs = 1,
+#' discover_motifs_count = 1,
#' filter_n = 10,
#' out_dir = tempdir())
#' res2 <- find_motifs(res, motif_db = get_JASPARCORE(),
diff --git a/R/motif_similarity.R b/R/motif_similarity.R
index f42dc5a..90e8d46 100644
--- a/R/motif_similarity.R
+++ b/R/motif_similarity.R
@@ -42,7 +42,7 @@
#' denovo_motifs <- denovo_motifs(unlist(segregated_peaks),
#' trim_seq_width = 50,
#' genome_build = genome_build,
-#' denovo_motifs = 1,
+#' discover_motifs_count = 1,
#' filter_n = 6,
#' maxw = 8,
#' minw = 8,
diff --git a/README.Rmd b/README.Rmd
index 30dbd99..72fe123 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -34,8 +34,8 @@ peaks, including FRiP scores, peak widths, and motif-to-summit distances.
enriched user-supplied motifs in the datasets and compares them between the
common and unique peaks from comparison and reference datasets.
-3. **De-Novo Motif Enrichment Analysis**: Details the statistics of de-novo
-discovered motifs in common and unique peaks from comparison and reference
+3. **Discovered Motif Enrichment Analysis**: Details the statistics of
+motifs discovered in common and unique peaks from comparison and reference
datasets. Examines motif similarities and identifies the closest known motifs in
the JASPAR or the provided database.
@@ -126,8 +126,8 @@ MotifPeeker(
genome_build = "hg38",
motif_files = motif_files,
cell_counts = NULL, # No cell-count information
- denovo_motif_discovery = TRUE,
- denovo_motifs = 3,
+ motif_discovery = TRUE,
+ motif_discovery_count = 3,
motif_db = NULL,
download_buttons = TRUE,
out_dir = tempdir(),
@@ -192,7 +192,7 @@ enhance them:
- `cell_counts`: An integer vector of experiment cell counts for each peak file
(if available). Creates additional comparisons based on cell counts.
- `motif_db`: Path to `.meme` format file to use as reference database, or a
- list of `universalmotif-class` objects. Results from de-novo motif discovery
+ list of `universalmotif-class` objects. Results from motif discovery
are searched against this database to find similar motifs. If not provided,
JASPAR CORE database will be used, making this parameter **truly optional**.
**NOTE**: p-value estimates are inaccurate when the database has fewer than
@@ -208,7 +208,7 @@ for [`MotifPeeker()`](https://neurogenomics.github.io/MotifPeeker/reference/Moti
### Runtime Guidance
For 4 datasets, the runtime is approximately 3 minutes with
-denovo_motif_discovery disabled. However, de-novo motif discovery can take
+motif_discovery disabled. However, motif discovery can take
hours to complete.
To make computation faster, we highly recommend tuning the following arguments:
@@ -219,13 +219,13 @@ To make computation faster, we highly recommend tuning the following arguments:
runtime, but it is very memory-intensive, consuming upwards of 10GB of RAM per
thread. Memory starvation can greatly slow the process, so set `workers` with
caution.
-- `denovo_motifs`: The number of motifs to discover per sequence group
+- `motif_discovery_count`: The number of motifs to discover per sequence group
exponentially increases runtime. We recommend no more than 5 motifs to make a
meaningful inference.
-- `trim_seq_width`: Trimming sequences before running de-novo motif discovery
+- `trim_seq_width`: Trimming sequences before running motif discovery
can significantly reduce the search space. Sequence length can exponentially
increase runtime. We recommend running the script with
- `denovo_motif_discovery = FALSE` and studying the motif-summit distance
+ `motif_discovery = FALSE` and studying the motif-summit distance
distribution under general metrics to find the sequence length that captures
most motifs. A good starting point is 150 but it can be reduced further if
appropriate.
diff --git a/README.md b/README.md
index f8564bc..17c7b74 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ style="height: 300px !important;" />
[-blue.svg)](https://cran.r-project.org/web/licenses/GPL%20(%3E=%203))
-[](https://github.com/neurogenomics/MotifPeeker)
+[](https://github.com/neurogenomics/MotifPeeker)
[](https://github.com/neurogenomics/MotifPeeker)
[](https://github.com/neurogenomics/MotifPeeker/commits/master)
[ Dash, Thomas Roberts, Nathan
Skene***
-**Updated:** ***Nov-11-2024***
+**Updated:** ***Nov-12-2024***
## Introduction
@@ -35,10 +35,10 @@ package outputs an HTML report consisting of three sections:
compares them between the common and unique peaks from comparison
and reference datasets.
-3. **De-Novo Motif Enrichment Analysis**: Details the statistics of
- de-novo discovered motifs in common and unique peaks from comparison
- and reference datasets. Examines motif similarities and identifies
- the closest known motifs in the JASPAR or the provided database.
+3. **Discovered Motif Enrichment Analysis**: Details the statistics of
+ motifs discovered in common and unique peaks from comparison and
+ reference datasets. Examines motif similarities and identifies the
+ closest known motifs in the JASPAR or the provided database.
@@ -141,8 +141,8 @@ MotifPeeker(
genome_build = "hg38",
motif_files = motif_files,
cell_counts = NULL, # No cell-count information
- denovo_motif_discovery = TRUE,
- denovo_motifs = 3,
+ motif_discovery = TRUE,
+ motif_discovery_count = 3,
motif_db = NULL,
download_buttons = TRUE,
out_dir = tempdir(),
@@ -222,10 +222,10 @@ or enhance them:
peak file (if available). Creates additional comparisons based on cell
counts.
- `motif_db`: Path to `.meme` format file to use as reference database,
- or a list of `universalmotif-class` objects. Results from de-novo
- motif discovery are searched against this database to find similar
- motifs. If not provided, JASPAR CORE database will be used, making
- this parameter **truly optional**. **NOTE**: p-value estimates are
+ or a list of `universalmotif-class` objects. Results from motif
+ discovery are searched against this database to find similar motifs.
+ If not provided, JASPAR CORE database will be used, making this
+ parameter **truly optional**. **NOTE**: p-value estimates are
inaccurate when the database has fewer than 50 entries.
@@ -239,8 +239,8 @@ documentation for
### Runtime Guidance
For 4 datasets, the runtime is approximately 3 minutes with
-denovo_motif_discovery disabled. However, de-novo motif discovery can
-take hours to complete.
+motif_discovery disabled. However, motif discovery can take hours to
+complete.
To make computation faster, we highly recommend tuning the following
arguments:
@@ -256,16 +256,16 @@ arguments:
reduce runtime, but it is very memory-intensive, consuming upwards of
10GB of RAM per thread. Memory starvation can greatly slow the
process, so set `workers` with caution.
-- `denovo_motifs`: The number of motifs to discover per sequence group
- exponentially increases runtime. We recommend no more than 5 motifs to
- make a meaningful inference.
-- `trim_seq_width`: Trimming sequences before running de-novo motif
- discovery can significantly reduce the search space. Sequence length
- can exponentially increase runtime. We recommend running the script
- with `denovo_motif_discovery = FALSE` and studying the motif-summit
- distance distribution under general metrics to find the sequence
- length that captures most motifs. A good starting point is 150 but it
- can be reduced further if appropriate.
+- `motif_discovery_count`: The number of motifs to discover per sequence
+ group exponentially increases runtime. We recommend no more than 5
+ motifs to make a meaningful inference.
+- `trim_seq_width`: Trimming sequences before running motif discovery
+ can significantly reduce the search space. Sequence length can
+ exponentially increase runtime. We recommend running the script with
+ `motif_discovery = FALSE` and studying the motif-summit distance
+ distribution under general metrics to find the sequence length that
+ captures most motifs. A good starting point is 150 but it can be
+ reduced further if appropriate.
diff --git a/inst/markdown/MotifPeeker.Rmd b/inst/markdown/MotifPeeker.Rmd
index fc7b557..470404e 100644
--- a/inst/markdown/MotifPeeker.Rmd
+++ b/inst/markdown/MotifPeeker.Rmd
@@ -27,9 +27,9 @@ params:
value: NULL
cell_counts:
value: NULL
- denovo_motif_discovery:
+ motif_discovery:
value: TRUE
- denovo_motifs:
+ discover_motifs:
value: 3
filter_n:
value: 6
@@ -110,16 +110,16 @@ cellcount_metrics <- ifelse((length(params$cell_counts) == 0), FALSE, TRUE)
user_motif_metrics <- ifelse((length(user_motifs$motifs) == 0 ||
result_len == 1), FALSE, TRUE)
comparison_metrics <- ifelse(result_len == 1, FALSE, TRUE)
-denovo_metrics <- params$denovo_motif_discovery && comparison_metrics
+discover_metrics <- params$motif_discovery && comparison_metrics
## Misc
ex_emo <- ifelse(requireNamespace("emoji", quietly = TRUE),
emoji::emoji("exclamation"), "!!")
## Check motif_db
-using_jaspar_db <- if (denovo_metrics) ifelse(is.null(params$motif_db),
+using_jaspar_db <- if (discover_metrics) ifelse(is.null(params$motif_db),
TRUE, FALSE)
-motif_db <- if (denovo_metrics) ifelse(is.null(params$motif_db),
+motif_db <- if (discover_metrics) ifelse(is.null(params$motif_db),
get_JASPARCORE(), params$motif_db)
### General Metrics ###
@@ -176,13 +176,13 @@ if (comparison_metrics) {
}
}
-### De-Novo Motif Analysis ###
-if (denovo_metrics) {
+### Discovered Motif Analysis ###
+if (discover_metrics) {
denovo_res <- list()
## Run STREME
denovo_res$streme <- denovo_motifs(
unlist(segregated_peaks), params$trim_seq_width, genome_build,
- params$denovo_motifs, filter_n = params$filter_n,
+ params$discover_motifs, filter_n = params$filter_n,
out_dir = out_dir_extra, meme_path = params$meme_path,
BPPARAM = params$BPPARAM, verbose = params$verbose, debug = params$debug
)
@@ -221,8 +221,8 @@ statistics on the frequency of enriched user-supplied motifs in the datasets and
compares them between the common and unique peaks from comparison and reference
datasets.
-3. [De-Novo Motif Enrichment Analysis](#denovo-motif-analysis): Details the
-statistics of de-novo discovered motifs in common and unique peaks from
+3. [Discovered Motif Enrichment Analysis](#discovered-motif-analysis): Details
+the statistics of discovered motifs in common and unique peaks from
comparison and reference datasets. Examines motif similarities and identifies
the closest known motifs in the [JASPAR](https://jaspar.uio.no/downloads/)
or the provided database.
@@ -826,8 +826,8 @@ if (!user_motif_metrics) {