Merge pull request #139 from danforthcenter/time_bug

Time bug
danforthcenter · Dec 18, 2024 · ff89948 · ff89948
2 parents 60c03da + 249c898
commit ff89948
Show file tree

Hide file tree

Showing 13 changed files with 90 additions and 53 deletions.
diff --git a/R/brmViolin.R b/R/brmViolin.R
@@ -4,11 +4,13 @@
 #' use \link{combineDraws} to merge their draws into a single dataframe for testing.
 #' @param ss A \code{pcvrss} object. The only component that is currently used is the pcvrForm.
 #' @param hypothesis A hypothesis expressed as a character string in the style of that used by
-#' \code{brms::hypothesis} and \link{testGrowth}. Note that currently this only supports hypotheses
-#' using two parameters from the model at a time (ie, "groupA / groupB > 1.05" works but
-#' "(groupA / groupB) - (groupC / groupD) > 1" does not). In the hypothesis "..." can be used to mean
+#' \code{brms::hypothesis} and \link{testGrowth}. In the hypothesis "..." can be used to mean
 #' "all groups for this parameter" so that the hypothesis "... / A_group1 > 1.05" would include all
-#' the "A" coefficients for groups 1:N along the x axis, see examples.
+#' the "A" coefficients for groups 1:N along the x axis, see examples. If a hypothesis is using
+#' several parameters per group (second example) then math around those parameters and any ellipses
+#' should be wrapped in parentheses. Note that currently the single hypothesis option (no ...)
+#' only supports hypotheses using two parameters from the model at a time
+#' (ie, "groupA / groupB > 1.05" works but "(groupA / groupB) - (groupC / groupD) > 1" does not).
 #'
 #' @keywords brms Bayesian pcvrss
 #'
@@ -32,6 +34,8 @@
 #'
 #' fit <- fitGrowth(ss, backend = "cmdstanr", iter = 500, chains = 1, cores = 1)
 #' brmViolin(fit, ss, ".../A_groupd > 1.05") # all groups used
+#' brmViolin(fit, ss, "abs(1 - ((...)/(C_groupd - B_groupd))) > 0.05") # rather arbitrary
+#' brmViolin(fit, ss, "abs(1 - ((...)/(C_groupa - B_groupd))) > 0.05") # totally arbitrary
 #' brmViolin(fit, ss, "A_groupa/A_groupd > 1.05") # only these two groups
 #' }
 #'
@@ -78,7 +82,7 @@ brmViolin <- function(fit, ss, hypothesis) {
 #' @keywords internal
 
 .brmViolinEllipsis <- function(fitdf, ss, hypothesis) {
-  math_or_underscore <- "\\+|\\/|\\-|\\*|\\^|>|<|=|_"
+  math_or_underscore <- "\\+|\\/|\\-|\\*|\\^|>|<|=|_|\\(|\\)"
   hsplit <- trimws(strsplit(hypothesis, math_or_underscore)[[1]])
   hsplit <- hsplit[as.logical(nchar(hsplit))]
   hsplit <- hsplit[!grepl("\\.\\.\\.", hsplit)]
@@ -88,24 +92,36 @@ brmViolin <- function(fit, ss, hypothesis) {
     any(grepl(grp, hsplit))
   })))
   grouping <- grouping[which_grouping]
-  par <- hsplit[which(!grepl(grouping, hsplit))]
+  par <- hsplit[which(hsplit %in% names(ss$formula$pforms))]
   ref_group <- hsplit[which(grepl(grouping, hsplit))]
-  draws <- fitdf[, grepl(paste0("^b_", par, ".*", grouping), colnames(fitdf))]
-  colnames(draws) <- sub("^b_", "", colnames(draws))
+  colnames(fitdf) <- sub("^b_", "", colnames(fitdf))
+  draws <- fitdf[, grepl(paste0(par, ".*", grouping, collapse = "|"), colnames(fitdf))]
   if (!grepl("[:]", grouping)) {
     draws <- draws[, !grepl("[:]", colnames(draws))] # drop interaction terms
   }
-  if (length(par) > 0) {
-    colnames(draws) <- sub(paste0(par, "_"), "", colnames(draws))
-    colnames(draws) <- paste0(par, "_", colnames(draws))
+  transformation <- paste(par, ref_group, sep = "_")
+  if (length(ref_group) > 1) {
+    pat <- paste0("\\(", paste(paste(par, ref_group, sep = "_"), collapse = ".*"), "\\)")
+    transformation <- regmatches(hypothesis, gregexpr(pat, hypothesis))
   }
-  hyps_df <- do.call(rbind, lapply(colnames(draws), function(grp) {
-    iter_hyp <- gsub("\\.{3,}", grp, hypothesis)
+  groupings <- unique(ss$df[[grouping]])
+  hyps_res <- lapply(groupings, function(grp) {
+    ellipse_replacement <- gsub(ref_group[1], paste0(grouping, grp), transformation)
+    transformed_posterior <- with(draws, expr = {eval(parse(text = ellipse_replacement))})
+    iter_hyp <- gsub("\\.{3,}", ellipse_replacement, hypothesis)
     x <- as.data.frame(brms::hypothesis(draws, iter_hyp)$h)
     x$ref_group <- ifelse(as.logical(length(par)), paste0(par, "_", ref_group), ref_group)
     x$comparison <- grp
-    x[, c("Post.Prob", "ref_group", "comparison")]
-  }))
+    return(
+      list(
+        "hyp_df" = x[, c("Post.Prob", "ref_group", "comparison")],
+        "transformed_posterior" = transformed_posterior
+      )
+    )
+  })
+  hyps_df <- do.call(rbind, lapply(hyps_res, \(l) {l$hyp_df}))
+  transformed_draws <- as.data.frame(do.call(cbind, lapply(hyps_res, \(l) {l$transformed_posterior})))
+  colnames(transformed_draws) <- paste0(groupings)
   hyps_df$discrete_post_prob <- factor(
     ifelse(hyps_df$Post.Prob >= 0.99, "A",
       ifelse(hyps_df$Post.Prob >= 0.95, "B",
@@ -116,12 +132,13 @@ brmViolin <- function(fit, ss, hypothesis) {
     ),
     levels = c("A", "B", "C", "D", "E"), ordered = TRUE
   )
-  longdraw <- as.data.frame(data.table::melt(data.table::as.data.table(draws),
-    measure.vars = colnames(draws), value.name = "draw"
+  longdraw <- as.data.frame(data.table::melt(data.table::as.data.table(transformed_draws),
+    measure.vars = colnames(transformed_draws), value.name = "draw"
   ))
   longdraw$param <- substr(longdraw$variable, 1, 1)
-  longdraw$group <- sub(paste0(par, "_"), "", longdraw$variable)
-  longdraw$ref <- grepl(ref_group, longdraw$variable)
+  longdraw$group <- sub(paste0(paste0(par, "_"), collapse = "|"), "", longdraw$variable)
+  longdraw$ref <- grepl(paste0(unique(gsub(grouping, "", ref_group)), collapse = "|"),
+                        longdraw$variable)
   ldj <- merge(longdraw, hyps_df, by.x = c("variable"), by.y = c("comparison"))
   return(ldj)
 }
@@ -134,10 +151,12 @@ brmViolin <- function(fit, ss, hypothesis) {
   math <- "\\+|\\/|\\-|\\*|\\^|>|<|=" # foreseeable math operators
   hsplit <- trimws(strsplit(hypothesis, math)[[1]])
   hsplit <- hsplit[suppressWarnings(is.na(as.numeric(hsplit)))]
+  hsplit <- hsplit[nchar(hsplit) > 0]
   draws <- fitdf
   colnames(draws) <- sub("^b_", "", colnames(draws))
   draws <- draws[, hsplit]
   hyps_df <- as.data.frame(brms::hypothesis(draws, hypothesis = hypothesis)$h)
+  # here I'd need to make the transformed draws if the hypothesis is weird like that?
   hyps_df$ref_group <- hsplit[2]
   hyps_df$comparison <- hsplit[1]
   hyps_df <- rbind(

diff --git a/R/bwtime.R b/R/bwtime.R
@@ -3,20 +3,20 @@
 #' @param df Data frame to use, this can be in wide or long format.
 #' @param mode One of "DAS", "DAP" or "DAE" (Days After Planting and Days After Emergence).
 #' Defaults to adding all columns.
-#' Note that if timeCol is not an integer then DAS is always returned.
+#' Note that if timeCol is not numeric then DAS is always returned.
 #' @param plantingDelay If `mode` includes "DAP" then `plantingDelay` is used to adjust "DAS"
 #' @param phenotype If `mode` includes "DAE" then this is the phenotype used to classify emergence.
-#' @param cutoff If `mode` inlcludes "DAE" then this value is used to classify emergence.
+#' @param cutoff If `mode` includes "DAE" then this value is used to classify emergence.
 #' Defaults to 1, meaning an image with a value of 1 or more for `phenotype` has "emerged".
 #' @param timeCol Column of input time values, defaults to "timestamp".
-#' If this is not an integer then it is assumed to be
+#' If this is not numeric then it is assumed to be
 #' a timestamp in the format of the format argument.
 #' @param group  Grouping variables to specify unique plants as a character vector.
 #' This defaults to "Barcodes". These taken together should identify a unique plant across time,
 #' although often "angle" or "rotation" should be added.
 #' @param plot Logical, should plots of the new time variables be printed?
 #' @param format An R POSIXct format, defaults to lemnatech standard format.
-#' This is only used if timeCol is not an integer.
+#' This is only used if timeCol is not a numeric.
 #' @param traitCol Column with phenotype names, defaults to "trait".
 #' This should generally not need to be changed from the default.
 #'    If this and valueCol are present in colnames(df) then the data
@@ -28,9 +28,10 @@
 #' and plants were watered before being imaged or if you want to index days off of
 #' midnight. This defaults to NULL but will take any value coercible to POSIXct by
 #' \code{as.POSIXct(... , tz="UTC")} such as "2020-01-01 18:30:00"
+#' @param digits Number of digits to round DAS to if timeCol is not numeric, defaults to 0.
 #' @keywords DAS time ggplot
 #' @import ggplot2
-#' @return The input dataframe with new integer columns for different ways
+#' @return The input dataframe with new numeric columns for different ways
 #' of describing time in the experiment. If plot is TRUE then a ggplot is also returned as part of
 #' a list.
 #' @export
@@ -86,7 +87,7 @@
 bw.time <- function(df = NULL, mode = c("DAS", "DAP", "DAE"), plantingDelay = NULL,
                     phenotype = NULL, cutoff = 1, timeCol = "timestamp",
                     group = "Barcodes", plot = TRUE, format = "%Y-%m-%d %H:%M:%S",
-                    traitCol = "trait", valueCol = "value", index = NULL) {
+                    traitCol = "trait", valueCol = "value", index = NULL, digits = 0) {
   wide <- .detectWideData(df, traitCol, valueCol) # see bwoutliers
 
   if (is.null(plantingDelay) && "DAP" %in% mode) {
@@ -96,7 +97,7 @@ bw.time <- function(df = NULL, mode = c("DAS", "DAP", "DAE"), plantingDelay = NU
     mode <- mode[-which(mode == "DAE")]
   }
 
-  formatNonIntegerTimeRes <- .formatNonIntegerTime(df, timeCol, format, index)
+  formatNonIntegerTimeRes <- .formatNonIntegerTime(df, timeCol, format, index, digits)
   df <- formatNonIntegerTimeRes[["data"]]
   timeCol <- formatNonIntegerTimeRes[["timeCol"]]
 
@@ -124,14 +125,14 @@ bw.time <- function(df = NULL, mode = c("DAS", "DAP", "DAE"), plantingDelay = NU
 #' @keywords internal
 #' @noRd
 
-.formatNonIntegerTime <- function(df, timeCol, format, index) {
+.formatNonIntegerTime <- function(df, timeCol, format, index, digits) {
   if (!is.numeric(df[[timeCol]])) {
     df[[timeCol]] <- as.POSIXct(strptime(df[[timeCol]], format = format))
     beg <- as.POSIXct(index, tz = "UTC")
     if (is.null(index)) {
       beg <- min(df[[timeCol]], na.rm = TRUE)
     }
-    df$DAS <- floor(as.numeric((df[[timeCol]] - beg) / 60 / 60 / 24))
+    df$DAS <- round(as.numeric((df[[timeCol]] - beg) / 60 / 60 / 24), digits = digits)
     timeCol <- "DAS"
   }
   return(list("data" = df, "timeCol" = timeCol))

diff --git a/R/frem.R b/R/frem.R
@@ -83,7 +83,7 @@ frem <- function(df, des, phenotypes, timeCol = NULL, cor = TRUE, returnData = F
     dummyX <- TRUE
   }
   #* `Format a time column if non-integer`
-  formatted <- .formatNonIntegerTime(df, timeCol, format = time_format, index = NULL)
+  formatted <- .formatNonIntegerTime(df, timeCol, format = time_format, index = NULL, digits = 0)
   df <- formatted$data
   timeCol <- formatted$timeCol
   #* `Make formulas`

diff --git a/R/growthSS.R b/R/growthSS.R
@@ -44,8 +44,8 @@
 #'  of logistic increasing counts may be written as \code{model = "poisson: logistic"}.
 #' @param form A formula describing the model. The left hand side should only be
 #' the outcome variable (phenotype), and a cutoff if you are making a survival model (see details).
-#' The right hand side needs at least the x variable
-#'  (typically time). Grouping is also described in this formula using roughly lme4
+#' The right hand side needs at least the x variable (which should be numeric and is
+#'  typically time). Grouping is also described in this formula using roughly lme4
 #'  style syntax,with formulas like \code{y~time|individual/group} to show that predictors
 #'  should vary by \code{group} and autocorrelation between \code{individual:group}
 #'  interactions should be modeled. Note that autocorrelation is only modeled with the "brms"

diff --git a/R/parsePcvrForm.R b/R/parsePcvrForm.R
@@ -58,7 +58,9 @@
         df <- df[!is.infinite(df[[x]]), ]
         df <- df[!is.infinite(df[[y]]), ]
         df <- df[!is.infinite(df[[hierarchical_predictor]]), ]
-        formatted <- .formatNonIntegerTime(df, x, format = "%Y-%m-%d %H:%M:%S", index = NULL)
+        formatted <- .formatNonIntegerTime(df, x,
+                                           format = "%Y-%m-%d %H:%M:%S",
+                                           index = NULL, digits = 2)
         df <- formatted$data
         x <- formatted$timeCol
       },

diff --git a/R/wue.R b/R/wue.R
@@ -35,9 +35,12 @@
 #' how efficient a plant is at maintaining size given some amount of water.
 #' @keywords WUE
 #' @import data.table
-#' @return A data frame containing the bellwether watering data joined
-#'     to phenotype data with new columns for change in the phenotype,
-#'     change in the pre-watering weight, and pseudo-water use efficiency (pWUE).
+#' @return A data frame containing the watering data and
+#'     to phenotype data with new columns for change in the phenotype (\code{pheno_diff}),
+#'     amount of water used (\code{total_water}) over the interval between phenotype measurements
+#'     (water added post to pre phenotype measurement), \code{start} and \code{end} times for the
+#'     interval as well as their difference (\code{timeLengthSeconds}), and pseudo water use efficiency
+#'     (\code{pWUE}).
 #' @examples
 #'
 #' set.seed(123)

diff --git a/man/brmViolin.Rd b/man/brmViolin.Rd
diff --git a/man/bw.time.Rd b/man/bw.time.Rd
diff --git a/man/growthSS.Rd b/man/growthSS.Rd
diff --git a/man/pwue.Rd b/man/pwue.Rd
diff --git a/tests/testthat/test-brmsModels.R b/tests/testthat/test-brmsModels.R
@@ -34,6 +34,8 @@ test_that("Logistic brms model pipeline", {
   expect_s3_class(plot2, "ggplot")
   plot2.5 <- brmViolin(fit, ss, "B_groupb/B_groupa > 1.05")
   expect_s3_class(plot2.5, "ggplot")
+  plot2.75 <- brmViolin(fit, ss, "abs(1 - ((...)/(C_groupb - B_groupa))) > 0.05")
+  expect_s3_class(plot2.75, "ggplot")
   ss2 <- growthSS(
     model = "gompertz", form = y ~ time | id / group, sigma = "logistic",
     list("A" = 130, "B" = 10, "C" = 1, "sigmaA" = 20, "sigmaB" = 10, "sigmaC" = 2),

diff --git a/tests/testthat/test-long_mv_workflow.R b/tests/testthat/test-long_mv_workflow.R
@@ -71,7 +71,7 @@ test_that("reading mv github data as long works", {
       outRows = 3000
     )
   )
-  expect_equal(dim(mv_ag1), c(42480, 6))
+  expect_equal(dim(mv_ag1), c(42840, 6))
 
   #* test EMD
   images <- unique(mv$image)[1:10]

diff --git a/tests/testthat/test-wide_mv_workflow.R b/tests/testthat/test-wide_mv_workflow.R
@@ -69,7 +69,7 @@ test_that("reading mv github data as long works", {
     )
   )
 
-  expect_equal(dim(mv_ag1), c(460, 184))
+  expect_equal(dim(mv_ag1), c(464, 184))
 
   #* test EMD
   images <- unique(mv$image)[1:10]