Issue #175: Add distribution look up tables and associated tooling (#195

) * add first pass at dist lookup * add look ups and tests second pass * fix checks * update news entry to be easier to read * improve docs for dist and primary id * close testing gaps * update family for datasets
epinowcast · Feb 7, 2025 · 007a5a3 · 007a5a3
1 parent cb9017d
commit 007a5a3
Show file tree

Hide file tree

Showing 30 changed files with 465 additions and 39 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 # R specific hooks: https://github.com/lorenzwalthert/precommit
 repos:
 -   repo: https://github.com/lorenzwalthert/precommit
-    rev: v0.4.3
+    rev: v0.4.3.9003
     hooks:
     -   id: style-files
         args: [--style_pkg=styler, --style_fun=tidyverse_style]
@@ -14,7 +14,7 @@ repos:
     -   id: no-browser-statement
     -   id: deps-in-desc
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
     -   id: check-added-large-files
         args: ['--maxkb=200']
@@ -32,7 +32,7 @@ repos:
     -   id: check-hooks-apply
     -   id: check-useless-excludes
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
     -   id: trailing-whitespace
     -   id: check-yaml

diff --git a/NAMESPACE b/NAMESPACE
@@ -15,7 +15,9 @@ export(fitdistdoublecens)
 export(new_pcens)
 export(pcd_as_stan_data)
 export(pcd_cmdstan_model)
+export(pcd_dist_name)
 export(pcd_load_stan_functions)
+export(pcd_stan_dist_id)
 export(pcd_stan_files)
 export(pcd_stan_functions)
 export(pcd_stan_path)

diff --git a/NEWS.md b/NEWS.md
@@ -7,6 +7,10 @@ Development release.
 
 - Updated the CI so that stan code is also tested on Windows and Mac. This is to ensure that the stan code is compatible with these platforms and in response to a CI bug in `epidist`.
 - Revised approach to automatic discovery of distribution functions. This soft deprecates the `pdist_name` and `dprimary_name` arguments throughout. Users wishing to pass distribution names (i.e. to potentially leverage analytical solutions) are advised to use the newly introduced `add_name_attribute()` function. Adds transient dependency on `lifecycle` and `rlang` packages. See #188 by @pearsonca.
+- Added `pcd_stan_dist_id()` to allow for discovery of distribution IDs for Stan models.
+- Added `pcd_dist_name()` to allow for discovery of distribution names for R functions as needed for `add_name_attribute()`.
+- Added documentation to suggest the use of `methods(pcens_cdf)` to find analytical solutions.
+- Added new data sets `pcd_distributions` and `pcd_primary_distributions` to document the distributions supported by `primarycensored`.
 
 ## Documentation
 

diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,33 @@
+#' Supported delay distributions
+#'
+#' A dataset containing information about the supported delay distributions in
+#' primarycensored. Includes both distributions with base R implementations and
+#' those only available in Stan. Distributions beyond these are not supported
+#' in the stan code but any user functions can be used in the R code.
+#'
+#' @format A data.frame with 17 rows and 4 columns:
+#' \describe{
+#'   \item{name}{Distribution name}
+#'   \item{pdist}{R distribution function name (e.g. plnorm), NA if there is no
+#'   base R implementation}
+#'   \item{aliases}{Alternative names/identifiers}
+#'   \item{stan_id}{Stan distribution ID used in the stan code}
+#' }
+#' @family utils
+"pcd_distributions"
+
+#' Supported primary event distributions
+#'
+#' A dataset containing information about the supported primary event
+#' distributions in primarycensored. Distributions beyond these are not
+#' supported in the stan code but any user functions can be used in the R code.
+#'
+#' @format A data.frame with 2 rows and 4 columns:
+#' \describe{
+#'   \item{name}{Distribution name}
+#'   \item{dprimary}{R density function name}
+#'   \item{aliases}{Alternative names/identifiers}
+#'   \item{stan_id}{Stan distribution ID used in the stan code}
+#' }
+#' @family utils
+"pcd_primary_distributions"
diff --git a/R/pcd-stan-tools.R b/R/pcd-stan-tools.R
@@ -227,3 +227,39 @@ pcd_load_stan_functions <- function(
 
   return(result)
 }
+
+#' Get distribution stan ID by name
+#'
+#' @param name String. Distribution name or alias
+#' @param type String. "delay" or "primary" corresponding to the type of
+#'  distribution to use as the look up. If delay then [pcd_distributions()]
+#'  is used, if primary then [pcd_primary_distributions()] is used.
+#'
+#' @return Numeric distribution ID
+#' @export
+#' @family stantools
+#' @examples
+#' pcd_stan_dist_id("lnorm")
+#' pcd_stan_dist_id("lognormal")
+#' pcd_stan_dist_id("gamma")
+#' pcd_stan_dist_id("weibull")
+#' pcd_stan_dist_id("exp")
+#' pcd_stan_dist_id("unif", type = "primary")
+pcd_stan_dist_id <- function(name, type = c("delay", "primary")) {
+  type <- match.arg(type)
+  df <- switch(type,
+    delay = primarycensored::pcd_distributions,
+    primary = primarycensored::pcd_primary_distributions
+  )
+
+  match_idx <- which(df$name == name | df$aliases == name)
+
+  if (length(match_idx) == 0) {
+    stop(
+      "No ", type, " distribution found matching: ", name, "\n",
+      .suggest_dist_name(name, type)
+    )
+  }
+
+  df$stan_id[match_idx]
+}
diff --git a/R/pcd_cmdstan_model.R b/R/pcd_cmdstan_model.R
@@ -65,17 +65,16 @@ pcd_cmdstan_model <- function(
 #' @param pwindow Column name for primary window (default: "pwindow")
 #'
 #' @param relative_obs_time Column name for relative observation time
-#' (default: "relative_obs_time")
+#'  (default: "relative_obs_time")
 #'
 #' @param dist_id Integer identifying the delay distribution:
-#'   1 = Lognormal, 2 = Gamma, 3 = Weibull, 4 = Exponential,
-#'   5 = Generalized Gamma, 6 = Negative Binomial, 7 = Poisson,
-#'   8 = Bernoulli, 9 = Beta, 10 = Binomial, 11 = Categorical, 12 = Cauchy,
-#'   13 = Chi-square, 14 = Dirichlet, 15 = Gumbel, 16 = Inverse Gamma,
-#'   17 = Logistic
+#'   You can use [pcd_stan_dist_id()] to get the dist ID for a
+#'   distribution or look at the [pcd_distributions] data set.
 #'
 #' @param primary_id Integer identifying the primary distribution:
-#'   1 = Uniform, 2 = Exponential growth
+#'   You can use [pcd_stan_dist_id()] to get the primary dist ID for a
+#'   distribution (make sure to select the "primary" type) or look at the
+#'   [pcd_primary_distributions] data set.
 #'
 #' @param param_bounds A list with elements `lower` and `upper`, each a numeric
 #'   vector specifying bounds for the delay distribution parameters.
@@ -92,14 +91,14 @@ pcd_cmdstan_model <- function(
 #' @param compute_log_lik Logical; compute log likelihood? (default: FALSE)
 #'
 #' @param use_reduce_sum Logical; use reduce_sum for performance?
-#'  (default: FALSE)
+#'   (default: FALSE)
 #'
 #' @param truncation_check_multiplier Numeric multiplier to use for checking
 #'   if the truncation time D is appropriate relative to the maximum delay
 #'   for each unique D value. Set to NULL to skip the check. Default is 2.
 #'
 #' @return A list containing the data formatted for use with
-#' [pcd_cmdstan_model()]
+#'   [pcd_cmdstan_model()]
 #'
 #' @export
 #' @family modelhelpers

diff --git a/R/pcens.R b/R/pcens.R
@@ -37,6 +37,12 @@ new_pcens <- function(
 
 #' Compute primary event censored CDF
 #'
+#' This function dispatches to either analytical solutions (if available) or
+#' numerical integration via the default method. To see which combinations have
+#' analytical solutions implemented, use `methods(pcens_cdf)`. For example,
+#' `pcens_cdf.gamma_unif` indicates an analytical solution exists for gamma
+#' delay with uniform primary event distributions.
+#'
 #' @inheritParams pprimarycensored
 #'
 #' @param object A `primarycensored` object as created by
@@ -61,7 +67,7 @@ pcens_cdf <- function(
 #'
 #' This method serves as a fallback for combinations of delay and primary
 #' event distributions that don't have specific implementations. It uses
-#' the numeric integration method.
+#' a numeric integration method.
 #'
 #' @inheritParams pcens_cdf
 #' @inheritParams pprimarycensored

diff --git a/R/pprimarycensored.R b/R/pprimarycensored.R
@@ -81,10 +81,8 @@
 #' for automatic use of analytical solutions when available, while
 #' seamlessly falling back to numerical integration when necessary.
 #'
-#' Note: For analytical detection to work correctly, `pdist` and `dprimary`
-#' must be directly passed as distribution functions, not via assignment or
-#' `pdist_name` and `dprimary_name` must be used to override the default
-#' extraction of the function name.
+#' See `methods(pcens_cdf)` for which combinations have analytical
+#' solutions implemented.
 #'
 #' @family primarycensored
 #' @seealso [new_pcens()] and [pcens_cdf()]

diff --git a/R/utils.R b/R/utils.R
@@ -28,7 +28,9 @@
 #' detected by [pprimarycensored()] and related functions.
 #'
 #' This is useful as it enables the automatic use of analytical solutions for
-#' distributions where they exist.
+#' distributions where they exist. You can check which analytical solutions are
+#' available using `methods(pcens_cdf)` and check distribution names using
+#' [pcd_dist_name()].
 #'
 #' @param func Function, for example the `p`- or `d`- form of a distribution
 #' function.
@@ -84,10 +86,9 @@ add_name_attribute <- function(func, name) {
 #'
 #' @keywords internal
 .name_deprecation <- function(
-  pdist_name, dprimary_name,
-  env = rlang::caller_env(),
-  user_env = rlang::caller_env(2)
-) {
+    pdist_name, dprimary_name,
+    env = rlang::caller_env(),
+    user_env = rlang::caller_env(2)) {
   test_use <- c(
     lifecycle::is_present(pdist_name),
     lifecycle::is_present(dprimary_name)
@@ -97,8 +98,7 @@ add_name_attribute <- function(func, name) {
     lifecycle::deprecate_soft(
       when = "1.0.0",
       what = I("`pdist_name` and `dprimary_name` are deprecated across all
-        functions and will be ignored in future versions;"
-      ),
+        functions and will be ignored in future versions;"),
       details = "Use `add_name_attribute()` instead.",
       env = env, user_env = user_env
     )
@@ -107,3 +107,71 @@ add_name_attribute <- function(func, name) {
   }
   return(res)
 }
+
+#' Get distribution function cdf or pdf name
+#'
+#' @param name String. Distribution name or alias
+#' @param type String. "delay" or "primary" corresponding to the type of
+#'  distribution to use as the look up. If delay then [pcd_distributions()]
+#'  is used, if primary then [pcd_primary_distributions()] is used.
+#'
+#' @return String distribution function name or NA if no base R implementation
+#' @export
+#' @family utils
+#' @examples
+#' pcd_dist_name("lnorm")
+#' pcd_dist_name("lognormal")
+#' pcd_dist_name("gamma")
+#' pcd_dist_name("weibull")
+#' pcd_dist_name("exp")
+#' pcd_dist_name("unif", type = "primary")
+#' pcd_dist_name("expgrowth", type = "primary")
+pcd_dist_name <- function(name, type = c("delay", "primary")) {
+  type <- match.arg(type)
+  df <- switch(type,
+    delay = primarycensored::pcd_distributions,
+    primary = primarycensored::pcd_primary_distributions
+  )
+
+  match_idx <- which(df$name == name | df$aliases == name)
+
+  if (length(match_idx) == 0) {
+    stop(
+      "No ", type, " distribution found matching: ", name, "\n",
+      .suggest_dist_name(name, type)
+    )
+  }
+
+  if (type == "delay") {
+    df$pdist[match_idx]
+  } else {
+    df$dprimary[match_idx]
+  }
+}
+
+#' @keywords internal
+.suggest_dist_name <- function(input, type = "delay") {
+  dist_names <- switch(type,
+    delay = primarycensored::pcd_distributions$name,
+    primary = primarycensored::pcd_primary_distributions$name
+  )
+
+  distances <- utils::adist(input, dist_names)
+  min_dist <- min(distances)
+  candidates <- dist_names[which(distances == min_dist)]
+
+  if (min_dist <= 2 && length(candidates) > 0) {
+    suggestions <- paste0(
+      "Did you mean: ",
+      toString(unique(candidates)),
+      "?"
+    )
+  } else {
+    suggestions <- paste0(
+      "Available distributions:",
+      toString(unique(dist_names))
+    )
+  }
+
+  return(suggestions)
+}
diff --git a/data-raw/distributions.R b/data-raw/distributions.R
@@ -0,0 +1,34 @@
+pcd_distributions <- data.frame(
+  name = c(
+    "lnorm", "gamma", "weibull", "exp", "gengamma", "nbinom",
+    "pois", "bern", "beta", "binom", "cat", "cauchy", "chisq",
+    "dirich", "gumbel", "invgamma", "logis"
+  ),
+  pdist = c(
+    "plnorm", "pgamma", "pweibull", "pexp", NA, "pnbinom",
+    "ppois", NA, "pbeta", "pbinom", NA, "pcauchy", "pchisq",
+    NA, "pgumbel", NA, "plogis"
+  ),
+  aliases = c(
+    "lognormal", "gamma", "weibull", "exponential", "generalized gamma",
+    "negative binomial", "poisson", "bernoulli", "beta", "binomial",
+    "categorical", "cauchy", "chi-square", "dirichlet", "gumbel",
+    "inverse gamma", "logistic"
+  ),
+  stan_id = 1:17,
+  stringsAsFactors = FALSE
+)
+
+pcd_primary_distributions <- data.frame(
+  name = c("unif", "expgrowth"),
+  dprimary = c("dunif", "dexpgrowth"),
+  aliases = c("uniform", "exponential growth"),
+  stan_id = 1:2,
+  stringsAsFactors = FALSE
+)
+
+usethis::use_data(
+  pcd_distributions,
+  pcd_primary_distributions,
+  overwrite = TRUE
+)
diff --git a/data/pcd_distributions.rda b/data/pcd_distributions.rda
diff --git a/data/pcd_primary_distributions.rda b/data/pcd_primary_distributions.rda
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -66,6 +66,7 @@ pcd
 pcens
 pdist
 pearsonca
+plnorm
 pprimarycensored
 precalculation
 primaryeventdistributions

diff --git a/man/add_name_attribute.Rd b/man/add_name_attribute.Rd
diff --git a/man/pcd_as_stan_data.Rd b/man/pcd_as_stan_data.Rd