From 007a5a3a9b8e3408a41b73e7200b43d4b2e1e683 Mon Sep 17 00:00:00 2001 From: Sam Abbott Date: Fri, 7 Feb 2025 18:22:27 +0000 Subject: [PATCH] Issue #175: Add distribution look up tables and associated tooling (#195) * add first pass at dist lookup * add look ups and tests second pass * fix checks * update news entry to be easier to read * improve docs for dist and primary id * close testing gaps * update family for datasets --- .pre-commit-config.yaml | 6 +- NAMESPACE | 2 + NEWS.md | 4 ++ R/data.R | 33 ++++++++++ R/pcd-stan-tools.R | 36 +++++++++++ R/pcd_cmdstan_model.R | 17 +++-- R/pcens.R | 8 ++- R/pprimarycensored.R | 6 +- R/utils.R | 82 ++++++++++++++++++++++-- data-raw/distributions.R | 34 ++++++++++ data/pcd_distributions.rda | Bin 0 -> 458 bytes data/pcd_primary_distributions.rda | Bin 0 -> 282 bytes inst/WORDLIST | 1 + man/add_name_attribute.Rd | 10 ++- man/pcd_as_stan_data.Rd | 11 ++-- man/pcd_dist_name.Rd | 37 +++++++++++ man/pcd_distributions.Rd | 33 ++++++++++ man/pcd_load_stan_functions.Rd | 1 + man/pcd_primary_distributions.Rd | 31 +++++++++ man/pcd_stan_dist_id.Rd | 37 +++++++++++ man/pcd_stan_files.Rd | 1 + man/pcd_stan_functions.Rd | 1 + man/pcd_stan_path.Rd | 1 + man/pcens_cdf.Rd | 6 +- man/pcens_cdf.default.Rd | 2 +- man/pprimarycensored.Rd | 6 +- tests/testthat/test-pcd_dist_name.R | 23 +++++++ tests/testthat/test-pcd_stan_dist_id.R | 32 +++++++++ tests/testthat/test-suggest_dist_name.R | 39 +++++++++++ vignettes/fitting-dists-with-stan.Rmd | 4 +- 30 files changed, 465 insertions(+), 39 deletions(-) create mode 100644 R/data.R create mode 100644 data-raw/distributions.R create mode 100644 data/pcd_distributions.rda create mode 100644 data/pcd_primary_distributions.rda create mode 100644 man/pcd_dist_name.Rd create mode 100644 man/pcd_distributions.Rd create mode 100644 man/pcd_primary_distributions.Rd create mode 100644 man/pcd_stan_dist_id.Rd create mode 100644 tests/testthat/test-pcd_dist_name.R create mode 100644 tests/testthat/test-pcd_stan_dist_id.R create mode 100644 tests/testthat/test-suggest_dist_name.R diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4a228d52..65127e2e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # R specific hooks: https://github.com/lorenzwalthert/precommit repos: - repo: https://github.com/lorenzwalthert/precommit - rev: v0.4.3 + rev: v0.4.3.9003 hooks: - id: style-files args: [--style_pkg=styler, --style_fun=tidyverse_style] @@ -14,7 +14,7 @@ repos: - id: no-browser-statement - id: deps-in-desc - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-added-large-files args: ['--maxkb=200'] @@ -32,7 +32,7 @@ repos: - id: check-hooks-apply - id: check-useless-excludes - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: check-yaml diff --git a/NAMESPACE b/NAMESPACE index 0df8b3c3..bd2ce648 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,7 +15,9 @@ export(fitdistdoublecens) export(new_pcens) export(pcd_as_stan_data) export(pcd_cmdstan_model) +export(pcd_dist_name) export(pcd_load_stan_functions) +export(pcd_stan_dist_id) export(pcd_stan_files) export(pcd_stan_functions) export(pcd_stan_path) diff --git a/NEWS.md b/NEWS.md index b51d9dff..cf94b2dd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,10 @@ Development release. - Updated the CI so that stan code is also tested on Windows and Mac. This is to ensure that the stan code is compatible with these platforms and in response to a CI bug in `epidist`. - Revised approach to automatic discovery of distribution functions. This soft deprecates the `pdist_name` and `dprimary_name` arguments throughout. Users wishing to pass distribution names (i.e. to potentially leverage analytical solutions) are advised to use the newly introduced `add_name_attribute()` function. Adds transient dependency on `lifecycle` and `rlang` packages. See #188 by @pearsonca. +- Added `pcd_stan_dist_id()` to allow for discovery of distribution IDs for Stan models. +- Added `pcd_dist_name()` to allow for discovery of distribution names for R functions as needed for `add_name_attribute()`. +- Added documentation to suggest the use of `methods(pcens_cdf)` to find analytical solutions. +- Added new data sets `pcd_distributions` and `pcd_primary_distributions` to document the distributions supported by `primarycensored`. ## Documentation diff --git a/R/data.R b/R/data.R new file mode 100644 index 00000000..a97cc1a2 --- /dev/null +++ b/R/data.R @@ -0,0 +1,33 @@ +#' Supported delay distributions +#' +#' A dataset containing information about the supported delay distributions in +#' primarycensored. Includes both distributions with base R implementations and +#' those only available in Stan. Distributions beyond these are not supported +#' in the stan code but any user functions can be used in the R code. +#' +#' @format A data.frame with 17 rows and 4 columns: +#' \describe{ +#' \item{name}{Distribution name} +#' \item{pdist}{R distribution function name (e.g. plnorm), NA if there is no +#' base R implementation} +#' \item{aliases}{Alternative names/identifiers} +#' \item{stan_id}{Stan distribution ID used in the stan code} +#' } +#' @family utils +"pcd_distributions" + +#' Supported primary event distributions +#' +#' A dataset containing information about the supported primary event +#' distributions in primarycensored. Distributions beyond these are not +#' supported in the stan code but any user functions can be used in the R code. +#' +#' @format A data.frame with 2 rows and 4 columns: +#' \describe{ +#' \item{name}{Distribution name} +#' \item{dprimary}{R density function name} +#' \item{aliases}{Alternative names/identifiers} +#' \item{stan_id}{Stan distribution ID used in the stan code} +#' } +#' @family utils +"pcd_primary_distributions" diff --git a/R/pcd-stan-tools.R b/R/pcd-stan-tools.R index dac46b26..71249f77 100644 --- a/R/pcd-stan-tools.R +++ b/R/pcd-stan-tools.R @@ -227,3 +227,39 @@ pcd_load_stan_functions <- function( return(result) } + +#' Get distribution stan ID by name +#' +#' @param name String. Distribution name or alias +#' @param type String. "delay" or "primary" corresponding to the type of +#' distribution to use as the look up. If delay then [pcd_distributions()] +#' is used, if primary then [pcd_primary_distributions()] is used. +#' +#' @return Numeric distribution ID +#' @export +#' @family stantools +#' @examples +#' pcd_stan_dist_id("lnorm") +#' pcd_stan_dist_id("lognormal") +#' pcd_stan_dist_id("gamma") +#' pcd_stan_dist_id("weibull") +#' pcd_stan_dist_id("exp") +#' pcd_stan_dist_id("unif", type = "primary") +pcd_stan_dist_id <- function(name, type = c("delay", "primary")) { + type <- match.arg(type) + df <- switch(type, + delay = primarycensored::pcd_distributions, + primary = primarycensored::pcd_primary_distributions + ) + + match_idx <- which(df$name == name | df$aliases == name) + + if (length(match_idx) == 0) { + stop( + "No ", type, " distribution found matching: ", name, "\n", + .suggest_dist_name(name, type) + ) + } + + df$stan_id[match_idx] +} diff --git a/R/pcd_cmdstan_model.R b/R/pcd_cmdstan_model.R index e959e1b3..d2b40bf8 100644 --- a/R/pcd_cmdstan_model.R +++ b/R/pcd_cmdstan_model.R @@ -65,17 +65,16 @@ pcd_cmdstan_model <- function( #' @param pwindow Column name for primary window (default: "pwindow") #' #' @param relative_obs_time Column name for relative observation time -#' (default: "relative_obs_time") +#' (default: "relative_obs_time") #' #' @param dist_id Integer identifying the delay distribution: -#' 1 = Lognormal, 2 = Gamma, 3 = Weibull, 4 = Exponential, -#' 5 = Generalized Gamma, 6 = Negative Binomial, 7 = Poisson, -#' 8 = Bernoulli, 9 = Beta, 10 = Binomial, 11 = Categorical, 12 = Cauchy, -#' 13 = Chi-square, 14 = Dirichlet, 15 = Gumbel, 16 = Inverse Gamma, -#' 17 = Logistic +#' You can use [pcd_stan_dist_id()] to get the dist ID for a +#' distribution or look at the [pcd_distributions] data set. #' #' @param primary_id Integer identifying the primary distribution: -#' 1 = Uniform, 2 = Exponential growth +#' You can use [pcd_stan_dist_id()] to get the primary dist ID for a +#' distribution (make sure to select the "primary" type) or look at the +#' [pcd_primary_distributions] data set. #' #' @param param_bounds A list with elements `lower` and `upper`, each a numeric #' vector specifying bounds for the delay distribution parameters. @@ -92,14 +91,14 @@ pcd_cmdstan_model <- function( #' @param compute_log_lik Logical; compute log likelihood? (default: FALSE) #' #' @param use_reduce_sum Logical; use reduce_sum for performance? -#' (default: FALSE) +#' (default: FALSE) #' #' @param truncation_check_multiplier Numeric multiplier to use for checking #' if the truncation time D is appropriate relative to the maximum delay #' for each unique D value. Set to NULL to skip the check. Default is 2. #' #' @return A list containing the data formatted for use with -#' [pcd_cmdstan_model()] +#' [pcd_cmdstan_model()] #' #' @export #' @family modelhelpers diff --git a/R/pcens.R b/R/pcens.R index 952b0280..acdad0f2 100644 --- a/R/pcens.R +++ b/R/pcens.R @@ -37,6 +37,12 @@ new_pcens <- function( #' Compute primary event censored CDF #' +#' This function dispatches to either analytical solutions (if available) or +#' numerical integration via the default method. To see which combinations have +#' analytical solutions implemented, use `methods(pcens_cdf)`. For example, +#' `pcens_cdf.gamma_unif` indicates an analytical solution exists for gamma +#' delay with uniform primary event distributions. +#' #' @inheritParams pprimarycensored #' #' @param object A `primarycensored` object as created by @@ -61,7 +67,7 @@ pcens_cdf <- function( #' #' This method serves as a fallback for combinations of delay and primary #' event distributions that don't have specific implementations. It uses -#' the numeric integration method. +#' a numeric integration method. #' #' @inheritParams pcens_cdf #' @inheritParams pprimarycensored diff --git a/R/pprimarycensored.R b/R/pprimarycensored.R index 79711f6e..57c8053b 100644 --- a/R/pprimarycensored.R +++ b/R/pprimarycensored.R @@ -81,10 +81,8 @@ #' for automatic use of analytical solutions when available, while #' seamlessly falling back to numerical integration when necessary. #' -#' Note: For analytical detection to work correctly, `pdist` and `dprimary` -#' must be directly passed as distribution functions, not via assignment or -#' `pdist_name` and `dprimary_name` must be used to override the default -#' extraction of the function name. +#' See `methods(pcens_cdf)` for which combinations have analytical +#' solutions implemented. #' #' @family primarycensored #' @seealso [new_pcens()] and [pcens_cdf()] diff --git a/R/utils.R b/R/utils.R index c7320a4a..867bcdc3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -28,7 +28,9 @@ #' detected by [pprimarycensored()] and related functions. #' #' This is useful as it enables the automatic use of analytical solutions for -#' distributions where they exist. +#' distributions where they exist. You can check which analytical solutions are +#' available using `methods(pcens_cdf)` and check distribution names using +#' [pcd_dist_name()]. #' #' @param func Function, for example the `p`- or `d`- form of a distribution #' function. @@ -84,10 +86,9 @@ add_name_attribute <- function(func, name) { #' #' @keywords internal .name_deprecation <- function( - pdist_name, dprimary_name, - env = rlang::caller_env(), - user_env = rlang::caller_env(2) -) { + pdist_name, dprimary_name, + env = rlang::caller_env(), + user_env = rlang::caller_env(2)) { test_use <- c( lifecycle::is_present(pdist_name), lifecycle::is_present(dprimary_name) @@ -97,8 +98,7 @@ add_name_attribute <- function(func, name) { lifecycle::deprecate_soft( when = "1.0.0", what = I("`pdist_name` and `dprimary_name` are deprecated across all - functions and will be ignored in future versions;" - ), + functions and will be ignored in future versions;"), details = "Use `add_name_attribute()` instead.", env = env, user_env = user_env ) @@ -107,3 +107,71 @@ add_name_attribute <- function(func, name) { } return(res) } + +#' Get distribution function cdf or pdf name +#' +#' @param name String. Distribution name or alias +#' @param type String. "delay" or "primary" corresponding to the type of +#' distribution to use as the look up. If delay then [pcd_distributions()] +#' is used, if primary then [pcd_primary_distributions()] is used. +#' +#' @return String distribution function name or NA if no base R implementation +#' @export +#' @family utils +#' @examples +#' pcd_dist_name("lnorm") +#' pcd_dist_name("lognormal") +#' pcd_dist_name("gamma") +#' pcd_dist_name("weibull") +#' pcd_dist_name("exp") +#' pcd_dist_name("unif", type = "primary") +#' pcd_dist_name("expgrowth", type = "primary") +pcd_dist_name <- function(name, type = c("delay", "primary")) { + type <- match.arg(type) + df <- switch(type, + delay = primarycensored::pcd_distributions, + primary = primarycensored::pcd_primary_distributions + ) + + match_idx <- which(df$name == name | df$aliases == name) + + if (length(match_idx) == 0) { + stop( + "No ", type, " distribution found matching: ", name, "\n", + .suggest_dist_name(name, type) + ) + } + + if (type == "delay") { + df$pdist[match_idx] + } else { + df$dprimary[match_idx] + } +} + +#' @keywords internal +.suggest_dist_name <- function(input, type = "delay") { + dist_names <- switch(type, + delay = primarycensored::pcd_distributions$name, + primary = primarycensored::pcd_primary_distributions$name + ) + + distances <- utils::adist(input, dist_names) + min_dist <- min(distances) + candidates <- dist_names[which(distances == min_dist)] + + if (min_dist <= 2 && length(candidates) > 0) { + suggestions <- paste0( + "Did you mean: ", + toString(unique(candidates)), + "?" + ) + } else { + suggestions <- paste0( + "Available distributions:", + toString(unique(dist_names)) + ) + } + + return(suggestions) +} diff --git a/data-raw/distributions.R b/data-raw/distributions.R new file mode 100644 index 00000000..30070285 --- /dev/null +++ b/data-raw/distributions.R @@ -0,0 +1,34 @@ +pcd_distributions <- data.frame( + name = c( + "lnorm", "gamma", "weibull", "exp", "gengamma", "nbinom", + "pois", "bern", "beta", "binom", "cat", "cauchy", "chisq", + "dirich", "gumbel", "invgamma", "logis" + ), + pdist = c( + "plnorm", "pgamma", "pweibull", "pexp", NA, "pnbinom", + "ppois", NA, "pbeta", "pbinom", NA, "pcauchy", "pchisq", + NA, "pgumbel", NA, "plogis" + ), + aliases = c( + "lognormal", "gamma", "weibull", "exponential", "generalized gamma", + "negative binomial", "poisson", "bernoulli", "beta", "binomial", + "categorical", "cauchy", "chi-square", "dirichlet", "gumbel", + "inverse gamma", "logistic" + ), + stan_id = 1:17, + stringsAsFactors = FALSE +) + +pcd_primary_distributions <- data.frame( + name = c("unif", "expgrowth"), + dprimary = c("dunif", "dexpgrowth"), + aliases = c("uniform", "exponential growth"), + stan_id = 1:2, + stringsAsFactors = FALSE +) + +usethis::use_data( + pcd_distributions, + pcd_primary_distributions, + overwrite = TRUE +) diff --git a/data/pcd_distributions.rda b/data/pcd_distributions.rda new file mode 100644 index 0000000000000000000000000000000000000000..6b9a4f43dcd63c22bbfb8986f024577afaa0ad16 GIT binary patch literal 458 zcmV;*0X6SR|H1$JXaGP1C_u#k7C^t}|L{Nn00F=OumPE@ zOJSyHDZfY|Se1`s9l%B7|$@w|k?43kR@;(llm63Y6U)B2gk!cm_5J79dH2Lru28 zK-dszAX@an2rdHMwnLuSY>0;b6634ijWUX~}%AmqUkZ`>W=`OAuT&WBN; zn!1JmQ-qE8%u6j(*IFxwi!@a;>4q9|3%RcxxxG?J+8NMR9Pr68CyaSa_>9gr?3eHh zZ(z6unyoL4>xM01aGu<9=qtc=4l#05OF7gWAqw$H6QnM+y1B8et=f~FVhIM5i-wLW zFQ+{D4v|=0RIO?>2PMHWXIoy-ABe7sGYMco&W%%f5HCqSRgpncBwyn0!?)IO@biLxn(R0(l)WW`cu{nX>6;j4@ zARUYIkgjO;d