diff --git a/.github/workflows/check-cmdstan.yaml b/.github/workflows/check-cmdstan.yaml index 5d9ab12..a9dd81d 100644 --- a/.github/workflows/check-cmdstan.yaml +++ b/.github/workflows/check-cmdstan.yaml @@ -69,7 +69,7 @@ jobs: # We can't use TRUE here as pendatic check return lots of false # positives related to our use of functions. stopifnot( - length(message) > 11 && + length(message) != 0 && length(message) < 12 && any(message == "Stan program is syntactically correct") ) shell: Rscript {0} diff --git a/NAMESPACE b/NAMESPACE index 6e34dac..c387291 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,3 +17,4 @@ export(rpcens) export(rprimarycensoreddist) importFrom(stats,dunif) importFrom(stats,runif) +importFrom(stats,setNames) diff --git a/NEWS.md b/NEWS.md index 0c81237..7793622 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,10 @@ interface changes in `0.3.0` for the Stan code. * Added support for `swindow = 0` to `rprimarycensoreddist` to allow for non-secondary event censored distributions. * Adapted `rprimarycensoreddist` so that truncation is based on the primary censored distribution before secondary events are censored. This better matches the generative process. * Added a new Stan interface tool to enable finding which files functions are implemented in the Stan code. +* Updated the approach to truncation to be outside the primary censored distribution integral. +* Improved tests that compare random sampling and probability mass/density functions between R and Stan. +* Improved cross testing between R and Stan implementations of the primary censored distributions. +* Worked on improving the stability of the `primary_censored_dist_lpmf` when used for NUTS based fitting (i.e. in Stan). ## Documentation diff --git a/R/dprimarycensoreddist.R b/R/dprimarycensoreddist.R index ed1ddb0..c860775 100644 --- a/R/dprimarycensoreddist.R +++ b/R/dprimarycensoreddist.R @@ -34,12 +34,29 @@ #' \deqn{ #' f_{\text{cens}}(d) = F_{\text{cens}}(d + \text{swindow}) - F_{\text{cens}}(d) #' } -#' where \eqn{F_{\text{cens}}} is the primary event censored CDF. For the -#' explanation and mathematical details of the CDF, refer to the documentation -#' of [pprimarycensoreddist()]. +#' where \eqn{F_{\text{cens}}} is the primary event censored CDF. +#' +#' The function first computes the CDFs for all unique points (including both +#' \eqn{d} and \eqn{d + \text{swindow}}) using [pprimarycensoreddist()]. It then +#' creates a lookup table for these CDFs to efficiently calculate the PMF for +#' each input value. For non-positive delays, the function returns 0. +#' +#' If a finite maximum delay \eqn{D} is specified, the PMF is normalized to +#' ensure it sums to 1 over the range \[0, D\]. This normalization can be +#' expressed as: +#' \deqn{ +#' f_{\text{cens,norm}}(d) = \frac{f_{\text{cens}}(d)}{\sum_{i=0}^{D-1} +#' f_{\text{cens}}(i)} +#' } +#' where \eqn{f_{\text{cens,norm}}(d)} is the normalized PMF and +#' \eqn{f_{\text{cens}}(d)} is the unnormalized PMF. For the explanation and +#' mathematical details of the CDF, refer to the documentation of +#' [pprimarycensoreddist()]. #' #' @family primarycensoreddist #' +#' @importFrom stats setNames +#' #' @examples #' # Example: Weibull distribution with uniform primary events #' dprimarycensoreddist(c(0.1, 0.5, 1), pweibull, shape = 1.5, scale = 2.0) @@ -65,19 +82,45 @@ dprimarycensoreddist <- function( ) } + # Compute CDFs for all unique points + unique_points <- sort(unique(c(x, x + swindow))) + unique_points <- unique_points[unique_points > 0] + if (length(unique_points) == 0) { + return(rep(0, length(x))) + } + + cdfs <- pprimarycensoreddist( + unique_points, pdist, pwindow, Inf, dprimary, dprimary_args, ... + ) + + # Create a lookup table for CDFs + cdf_lookup <- stats::setNames(cdfs, as.character(unique_points)) + result <- vapply(x, function(d) { if (d < 0) { - return(0) # Return log(0) for non-positive delays + return(0) # Return 0 for negative delays + } else if (d == 0) { + # Special case for d = 0 + cdf_upper <- cdf_lookup[as.character(swindow)] + return(cdf_upper) } else { - pprimarycensoreddist( - d + swindow, pdist, pwindow, D, dprimary, dprimary_args, ... - ) - - pprimarycensoreddist( - d, pdist, pwindow, D, dprimary, dprimary_args, ... - ) + cdf_upper <- cdf_lookup[as.character(d + swindow)] + cdf_lower <- cdf_lookup[as.character(d)] + return(cdf_upper - cdf_lower) } }, numeric(1)) + if (is.finite(D)) { + if (max(unique_points) == D) { + cdf_D <- max(cdfs) + } else { + cdf_D <- pprimarycensoreddist( + D, pdist, pwindow, Inf, dprimary, dprimary_args, ... + ) + } + result <- result / cdf_D + } + if (log) { return(log(result)) } else { diff --git a/R/pprimarycensoreddist.R b/R/pprimarycensoreddist.R index 0194885..85e3ad1 100644 --- a/R/pprimarycensoreddist.R +++ b/R/pprimarycensoreddist.R @@ -41,7 +41,7 @@ #' #' @details #' The primary event censored CDF is computed by integrating the product of -#' the primary event distribution function (CDF) and the delay distribution +#' the delay distribution function (CDF) and the primary event distribution #' function (PDF) over the primary event window. The integration is adjusted #' for truncation if a finite maximum delay (D) is specified. #' @@ -50,15 +50,16 @@ #' F_{\text{cens}}(q) = \int_{0}^{pwindow} F(q - p) \cdot f_{\text{primary}}(p) #' \, dp #' } -#' where \eqn{F} is the CDF of the primary event distribution, +#' where \eqn{F} is the CDF of the delay distribution, #' \eqn{f_{\text{primary}}} is the PDF of the primary event times, and #' \eqn{pwindow} is the primary event window. #' -#' If the maximum delay \eqn{D} is finite, the CDF is normalized by \eqn{F(D)}: +#' If the maximum delay \eqn{D} is finite, the CDF is normalized by dividing +#' by \eqn{F_{\text{cens}}(D)}: #' \deqn{ -#' F_{\text{cens}}(q) = \int_{0}^{pwindow} \frac{F(q - p)}{F(D - p)} \cdot -#' f_{\text{primary}}(p) \, dp +#' F_{\text{cens,norm}}(q) = \frac{F_{\text{cens}}(q)}{F_{\text{cens}}(D)} #' } +#' where \eqn{F_{\text{cens,norm}}(q)} is the normalized CDF. #' #' @family primarycensoreddist #' @@ -82,28 +83,28 @@ pprimarycensoreddist <- function( if (d <= 0) { return(0) # Return 0 for non-positive delays } else { - if (is.infinite(D)) { - integrand <- function(p) { - d_adj <- d - p - pdist(d_adj, ...) * - do.call( - dprimary, c(list(x = p, min = 0, max = pwindow), dprimary_args) - ) - } - } else { - integrand <- function(p) { - d_adj <- d - p - D_adj <- D - p - pdist(d_adj, ...) / pdist(D_adj, ...) * - do.call( - dprimary, c(list(x = p, min = 0, max = pwindow), dprimary_args) - ) - } + integrand <- function(p) { + d_adj <- d - p + pdist(d_adj, ...) * + do.call( + dprimary, c(list(x = p, min = 0, max = pwindow), dprimary_args) + ) } + stats::integrate(integrand, lower = 0, upper = pwindow)$value } }, numeric(1)) + if (!is.infinite(D)) { + # Compute normalization factor for finite D + normalizer <- if (max(q) == D) { + result[length(result)] + } else { + pprimarycensoreddist(D, pdist, pwindow, Inf, dprimary, dprimary_args, ...) + } + result <- result / normalizer + } + return(result) } diff --git a/inst/stan/primary_censored_dist.stan b/inst/stan/primary_censored_dist.stan index 45de07c..1d31548 100644 --- a/inst/stan/primary_censored_dist.stan +++ b/inst/stan/primary_censored_dist.stan @@ -84,7 +84,7 @@ real primary_dist_lpdf(real x, int primary_dist_id, array[] real params, real mi * @param xc A high precision version of the distance from x to the nearest * endpoint in a definite integral * @param theta Distribution parameters - * @param x_r Real data (contains pwindow) + * @param x_r Real data (contains d, pwindow, obs_time_add) * @param x_i Integer data (contains dist_id and primary_dist_id) * * @return Value of the integrand @@ -105,49 +105,50 @@ real primary_dist_lpdf(real x, int primary_dist_id, array[] real params, real mi real primary_censored_integrand(real x, real xc, array[] real theta, array[] real x_r, array[] int x_i) { // Unpack parameters - real d = theta[1]; + real d = x_r[1]; int dist_id = x_i[1]; int primary_dist_id = x_i[2]; - real pwindow = x_r[1]; - real D = x_r[2]; + real pwindow = x_r[2]; int dist_params_len = x_i[3]; int primary_params_len = x_i[4]; + real lower_bound = theta[size(theta) - 1]; + real width = theta[size(theta)]; // Extract distribution parameters array[dist_params_len] real params; if (dist_params_len) { - params = theta[2:(1 + dist_params_len)]; + params = theta[1:dist_params_len]; } array[primary_params_len] real primary_params; if (primary_params_len) { - int theta_len = size(theta); - primary_params = theta[(theta_len - dist_params_len + 1):theta_len]; - } - - // Compute adjusted delay - real d_adj = d - x; - if (d_adj <= 0) { - return 0; + int primary_loc = size(theta) - 2; + primary_params = theta[primary_loc - primary_params_len + 1:primary_loc]; } + // Compute adjusted delay and primary point + real d_adj; + real ppoint; + // This whilst theoretically correct is appears to make the integral + // very hard to solve numerically and so has a large knock on effect + // on sampling efficiency. + // if (x < (lower_bound + width * 0.01)) { + // d_adj = lower_bound - xc; + // } else if (x > (d - width * 0.01)) { + // d_adj = d - xc; + // } else { + // d_adj = x; + // } + d_adj = x; + ppoint = d - d_adj; // Compute log probabilities real log_cdf = dist_lcdf(d_adj | params, dist_id); real log_primary_pdf = primary_dist_lpdf( - x | primary_dist_id, primary_params, 0, pwindow + ppoint | primary_dist_id, primary_params, 0, pwindow ); - - if (is_inf(D)) { - // No truncation - return exp(log_cdf + log_primary_pdf); - } else { - // Truncate at D - real D_adj = D - x; - real log_cdf_D = dist_lcdf(D_adj | params, dist_id); - return exp(log_cdf - log_cdf_D + log_primary_pdf); - } + return exp(log_cdf + log_primary_pdf); } - /** +/** * Compute the primary event censored CDF for a single delay * * @param d Delay @@ -174,24 +175,38 @@ real primary_censored_integrand(real x, real xc, array[] real theta, * ); * @endcode */ -real primary_censored_dist_cdf(real d, int dist_id, array[] real params, +real primary_censored_dist_cdf(data real d, int dist_id, array[] real params, data real pwindow, data real D, int primary_dist_id, array[] real primary_params) { real result; - if (d <= 0 || d > D) { + if (d <= 0) { return 0; } - array[size(params) + size(primary_params) + 1] real theta = - append_array({d}, append_array(params, primary_params)); + real lower_bound = max({d - pwindow, 1e-6}); + real width = d - lower_bound; + + array[size(params) + size(primary_params) + 2] real theta = + append_array( + append_array( + append_array(params, primary_params), {lower_bound} + ), + {width} + ); array[4] int ids = { dist_id, primary_dist_id, size(params), size(primary_params) }; result = integrate_1d( - primary_censored_integrand, 0.0, pwindow, theta, {pwindow, D}, ids + primary_censored_integrand, lower_bound, d, theta, {d, pwindow}, ids, 1e-2 ); + if (!is_inf(D)) { + real log_cdf_D = primary_censored_dist_lcdf( + D | dist_id, params, pwindow, positive_infinity(), primary_dist_id,primary_params + ); + result = exp(log(result) - log_cdf_D); + } return result; } @@ -223,13 +238,14 @@ real primary_censored_dist_cdf(real d, int dist_id, array[] real params, * ); * @endcode */ -real primary_censored_dist_lcdf(real d, int dist_id, array[] real params, +real primary_censored_dist_lcdf(data real d, int dist_id, array[] real params, data real pwindow, data real D, int primary_dist_id, array[] real primary_params) { - if (d <= 0 || d > D) { + if (d <= 0) { return negative_infinity(); } + return log( primary_censored_dist_cdf( d | dist_id, params, pwindow, D, primary_dist_id, primary_params @@ -244,7 +260,7 @@ real primary_censored_dist_lcdf(real d, int dist_id, array[] real params, * @param dist_id Distribution identifier * @param params Array of distribution parameters * @param pwindow Primary event window - * @param swindow Secondary event window + * @param d_upper Upper bound for the delay interval * @param D Maximum delay (truncation point) * @param primary_dist_id Primary distribution identifier * @param primary_params Primary distribution parameters @@ -253,35 +269,51 @@ real primary_censored_dist_lcdf(real d, int dist_id, array[] real params, * * @code * // Example: Weibull delay distribution with uniform primary distribution - * real d = 3.0; + * int d = 3; * int dist_id = 5; // Weibull * array[2] real params = {2.0, 1.5}; // shape and scale * real pwindow = 1.0; - * real swindow = 0.1; + * real d_upper = 4.0; * real D = positive_infinity(); * int primary_dist_id = 1; // Uniform * array[0] real primary_params = {}; * real log_pmf = primary_censored_dist_lpmf( - * d, dist_id, params, pwindow, swindow, D, primary_dist_id, primary_params + * d, dist_id, params, pwindow, d_upper, D, primary_dist_id, primary_params * ); * @endcode */ -real primary_censored_dist_lpmf(int d, int dist_id, array[] real params, - data real pwindow, real swindow, data real D, - int primary_dist_id, array[] real primary_params) { - - real d_upper = d + swindow; +real primary_censored_dist_lpmf(data int d, int dist_id, array[] real params, + data real pwindow, data real d_upper, + data real D, int primary_dist_id, + array[] real primary_params) { if (d_upper > D) { reject("Upper truncation point is greater than D. It is ", d_upper, - " and D is ", D, ". Resolve this by increasing D to be greater or equal to d + swindow."); + " and D is ", D, ". Resolve this by increasing D to be greater or equal to d + swindow or decreasing swindow."); + } + if (d_upper <= d) { + reject("Upper truncation point is less than or equal to d. It is ", d_upper, + " and d is ", d, ". Resolve this by increasing d to be less than d_upper."); } real log_cdf_upper = primary_censored_dist_lcdf( - d_upper | dist_id, params, pwindow, D, primary_dist_id, primary_params + d_upper | dist_id, params, pwindow, positive_infinity(), primary_dist_id, primary_params ); real log_cdf_lower = primary_censored_dist_lcdf( - d | dist_id, params, pwindow, D, primary_dist_id, primary_params + d | dist_id, params, pwindow, positive_infinity(), primary_dist_id, primary_params ); - return log_diff_exp(log_cdf_upper, log_cdf_lower); + if (!is_inf(D)) { + real log_cdf_D; + + if (d_upper == D) { + log_cdf_D = log_cdf_upper; + } else { + log_cdf_D = primary_censored_dist_lcdf( + D | dist_id, params, pwindow, positive_infinity(), primary_dist_id, primary_params + ); + } + return log_diff_exp(log_cdf_upper, log_cdf_lower) - log_cdf_D; + } else { + return log_diff_exp(log_cdf_upper, log_cdf_lower); + } } /** @@ -291,7 +323,7 @@ real primary_censored_dist_lpmf(int d, int dist_id, array[] real params, * @param dist_id Distribution identifier * @param params Array of distribution parameters * @param pwindow Primary event window - * @param swindow Secondary event window + * @param d_upper Upper bound for the delay interval * @param D Maximum delay (truncation point) * @param primary_dist_id Primary distribution identifier * @param primary_params Primary distribution parameters @@ -300,6 +332,7 @@ real primary_censored_dist_lpmf(int d, int dist_id, array[] real params, * * @code * // Example: Weibull delay distribution with uniform primary distribution + * int d = 3; * real d = 3.0; * int dist_id = 5; // Weibull * array[2] real params = {2.0, 1.5}; // shape and scale @@ -311,12 +344,13 @@ real primary_censored_dist_lpmf(int d, int dist_id, array[] real params, * real pmf = primary_censored_dist_pmf(d, dist_id, params, pwindow, swindow, D, primary_dist_id, primary_params); * @endcode */ -real primary_censored_dist_pmf(int d, int dist_id, array[] real params, - data real pwindow, real swindow, data real D, - int primary_dist_id, array[] real primary_params) { +real primary_censored_dist_pmf(data int d, int dist_id, array[] real params, + data real pwindow, data real d_upper, + data real D, int primary_dist_id, + array[] real primary_params) { return exp( primary_censored_dist_lpmf( - d | dist_id, params, pwindow, swindow, D, primary_dist_id, primary_params + d | dist_id, params, pwindow, d_upper, D, primary_dist_id, primary_params ) ); } @@ -331,8 +365,6 @@ real primary_censored_dist_pmf(int d, int dist_id, array[] real params, * @param pwindow Primary event window * @param primary_dist_id Primary distribution identifier * @param primary_params Primary distribution parameters - * @param approx_truncation Binary; if 1, use approximate truncation method - * and if 0, use exact truncation method. * * @return Vector of primary event censored log PMFs for delays \[0, 1\] to * \[max_delay, max_delay + 1\]. @@ -343,7 +375,6 @@ real primary_censored_dist_pmf(int d, int dist_id, array[] real params, * 2. Assumes integer delays (swindow = 1) * 3. Is more computationally efficient for multiple delay calculation as it * reduces the number of integration calls. - * 4. Allows for approximate or exact truncation handling * * @code * // Example: Weibull delay distribution with uniform primary distribution @@ -354,19 +385,18 @@ real primary_censored_dist_pmf(int d, int dist_id, array[] real params, * real pwindow = 7.0; * int primary_dist_id = 1; // Uniform * array[0] real primary_params = {}; - * int approx_truncation = 1; // Use approximate truncation + * vector[max_delay] log_pmf = * primary_censored_sone_lpmf_vectorized( * max_delay, D, dist_id, params, pwindow, primary_dist_id, - * primary_params, approx_truncation + * primary_params * ); * @endcode */ vector primary_censored_sone_lpmf_vectorized( int max_delay, data real D, int dist_id, array[] real params, data real pwindow, - int primary_dist_id, array[] real primary_params, - int approx_truncation + int primary_dist_id, array[] real primary_params ) { int upper_interval = max_delay + 1; @@ -380,37 +410,25 @@ vector primary_censored_sone_lpmf_vectorized( } // Compute log CDFs - if (approx_truncation) { - for (d in 1:upper_interval) { - log_cdfs[d] = primary_censored_dist_lcdf( - d | dist_id, params, pwindow, positive_infinity(), primary_dist_id, - primary_params - ); - } - } else { - for (d in 1:upper_interval) { - log_cdfs[d] = primary_censored_dist_lcdf( - d | dist_id, params, pwindow, D, primary_dist_id, primary_params - ); - } + for (d in 1:upper_interval) { + log_cdfs[d] = primary_censored_dist_lcdf( + d | dist_id, params, pwindow, positive_infinity(), primary_dist_id, + primary_params + ); } // Compute log normalizer using upper_interval - if (approx_truncation) { - if (D > upper_interval) { - if (is_inf(D)) { - log_normalizer = 0; // No normalization needed for infinite D - } else { - log_normalizer = primary_censored_dist_lcdf( - D | dist_id, params, pwindow, positive_infinity(), - primary_dist_id, primary_params - ); - } + if (D > upper_interval) { + if (is_inf(D)) { + log_normalizer = 0; // No normalization needed for infinite D } else { - log_normalizer = log_cdfs[upper_interval]; + log_normalizer = primary_censored_dist_lcdf( + D | dist_id, params, pwindow, positive_infinity(), + primary_dist_id, primary_params + ); } } else { - log_normalizer = 0; // No external normalization for exact truncation + log_normalizer = log_cdfs[upper_interval]; } // Compute log PMFs @@ -432,16 +450,15 @@ vector primary_censored_sone_lpmf_vectorized( * @param pwindow Primary event window * @param primary_dist_id Primary distribution identifier * @param primary_params Primary distribution parameters - * @param approx_truncation Logical; if TRUE, use approximate truncation method * - * @return Vector of primary event censored PMFs for integer delays 1 to max_delay + * @return Vector of primary event censored PMFs for integer delays 1 to + * max_delay * * This function differs from primary_censored_dist_pmf in that it: * 1. Computes PMFs for all integer delays from \[0, 1\] to \[max_delay, * max_delay + 1\] in one call. * 2. Assumes integer delays (swindow = 1) * 3. Is more computationally efficient for multiple delay calculations - * 4. Allows for approximate or exact truncation handling * * @code * // Example: Weibull delay distribution with uniform primary distribution @@ -452,10 +469,9 @@ vector primary_censored_sone_lpmf_vectorized( * real pwindow = 7.0; * int primary_dist_id = 1; // Uniform * array[0] real primary_params = {}; - * int approx_truncation = 1; // Use approximate truncation * vector[max_delay] pmf = * primary_censored_sone_pmf_vectorized( - * max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, approx_truncation + * max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params * ); * @endcode */ @@ -463,12 +479,11 @@ vector primary_censored_sone_pmf_vectorized( int max_delay, data real D, int dist_id, array[] real params, data real pwindow, int primary_dist_id, - array[] real primary_params, - int approx_truncation + array[] real primary_params ) { return exp( primary_censored_sone_lpmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, approx_truncation + max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params ) ); } diff --git a/man/dprimarycensoreddist.Rd b/man/dprimarycensoreddist.Rd index f6dd398..e908eec 100644 --- a/man/dprimarycensoreddist.Rd +++ b/man/dprimarycensoreddist.Rd @@ -77,9 +77,24 @@ primary event censored cumulative distribution function (CDF) at two points, \deqn{ f_{\text{cens}}(d) = F_{\text{cens}}(d + \text{swindow}) - F_{\text{cens}}(d) } -where \eqn{F_{\text{cens}}} is the primary event censored CDF. For the -explanation and mathematical details of the CDF, refer to the documentation -of \code{\link[=pprimarycensoreddist]{pprimarycensoreddist()}}. +where \eqn{F_{\text{cens}}} is the primary event censored CDF. + +The function first computes the CDFs for all unique points (including both +\eqn{d} and \eqn{d + \text{swindow}}) using \code{\link[=pprimarycensoreddist]{pprimarycensoreddist()}}. It then +creates a lookup table for these CDFs to efficiently calculate the PMF for +each input value. For non-positive delays, the function returns 0. + +If a finite maximum delay \eqn{D} is specified, the PMF is normalized to +ensure it sums to 1 over the range [0, D]. This normalization can be +expressed as: +\deqn{ +f_{\text{cens,norm}}(d) = \frac{f_{\text{cens}}(d)}{\sum_{i=0}^{D-1} + f_{\text{cens}}(i)} +} +where \eqn{f_{\text{cens,norm}}(d)} is the normalized PMF and +\eqn{f_{\text{cens}}(d)} is the unnormalized PMF. For the explanation and +mathematical details of the CDF, refer to the documentation of +\code{\link[=pprimarycensoreddist]{pprimarycensoreddist()}}. } \examples{ # Example: Weibull distribution with uniform primary events diff --git a/man/pprimarycensoreddist.Rd b/man/pprimarycensoreddist.Rd index 8f6aa4d..ca0400e 100644 --- a/man/pprimarycensoreddist.Rd +++ b/man/pprimarycensoreddist.Rd @@ -63,7 +63,7 @@ custom primary event distributions and delay distributions. } \details{ The primary event censored CDF is computed by integrating the product of -the primary event distribution function (CDF) and the delay distribution +the delay distribution function (CDF) and the primary event distribution function (PDF) over the primary event window. The integration is adjusted for truncation if a finite maximum delay (D) is specified. @@ -72,15 +72,16 @@ The primary event censored CDF, \eqn{F_{\text{cens}}(q)}, is given by: F_{\text{cens}}(q) = \int_{0}^{pwindow} F(q - p) \cdot f_{\text{primary}}(p) \, dp } -where \eqn{F} is the CDF of the primary event distribution, +where \eqn{F} is the CDF of the delay distribution, \eqn{f_{\text{primary}}} is the PDF of the primary event times, and \eqn{pwindow} is the primary event window. -If the maximum delay \eqn{D} is finite, the CDF is normalized by \eqn{F(D)}: +If the maximum delay \eqn{D} is finite, the CDF is normalized by dividing +by \eqn{F_{\text{cens}}(D)}: \deqn{ -F_{\text{cens}}(q) = \int_{0}^{pwindow} \frac{F(q - p)}{F(D - p)} \cdot -f_{\text{primary}}(p) \, dp +F_{\text{cens,norm}}(q) = \frac{F_{\text{cens}}(q)}{F_{\text{cens}}(D)} } +where \eqn{F_{\text{cens,norm}}(q)} is the normalized CDF. } \examples{ # Example: Lognormal distribution with uniform primary events diff --git a/tests/testthat/test-dprimarycensoreddist.R b/tests/testthat/test-dprimarycensoreddist.R index ca2566c..3273aa6 100644 --- a/tests/testthat/test-dprimarycensoreddist.R +++ b/tests/testthat/test-dprimarycensoreddist.R @@ -75,7 +75,7 @@ test_that( } ) -test_that("dprimarycensoreddist throws an error for negative d", { +test_that("dprimarycensoreddist returns 0 for negative d", { d <- -1 pwindow <- 1 swindow <- 0.5 @@ -87,4 +87,10 @@ test_that("dprimarycensoreddist throws an error for negative d", { meanlog = 0, sdlog = 1 ), 0 ) + expect_identical( + dpcens( + c(8, d), plnorm, pwindow, swindow, D, + meanlog = 0, sdlog = 1 + )[2], 0 + ) }) diff --git a/tests/testthat/test-pprimarycensoreddist.R b/tests/testthat/test-pprimarycensoreddist.R new file mode 100644 index 0000000..3805860 --- /dev/null +++ b/tests/testthat/test-pprimarycensoreddist.R @@ -0,0 +1,55 @@ +test_that("pprimarycensoreddist returns 0 for non-positive quantiles", { + pwindow <- 1 + D <- 10 + cdf <- ppcens(c(-1, 0), plnorm, pwindow, D = D, meanlog = 1, sdlog = 1) + expect_identical(cdf, c(0, 0)) +}) + +test_that("pprimarycensoreddist approaches 1 for large quantiles", { + pwindow <- 1 + D <- Inf + cdf <- ppcens(1000, plnorm, pwindow, D = D, meanlog = 1, sdlog = 1) + expect_equal(cdf, 1, tolerance = 1e-6) +}) + +test_that("pprimarycensoreddist is monotonically increasing", { + pwindow <- 1 + D <- 10 + q <- seq(0, D, by = 0.5) + cdf <- ppcens(q, plnorm, pwindow, D = D, meanlog = 1, sdlog = 1) + expect_true(all(diff(cdf) >= 0)) +}) + +test_that("pprimarycensoreddist handles finite D correctly", { + pwindow <- 1 + D <- 10 + cdf <- ppcens(D, plnorm, pwindow, D = D, meanlog = 1, sdlog = 1) + expect_equal(cdf, 1, tolerance = 1e-6) +}) + +test_that("pprimarycensoreddist handles custom primary distributions", { + pwindow <- 5 + D <- 20 + cdf_uniform <- ppcens( + c(1, 5, 10), plnorm, pwindow, + D = D, + meanlog = 1, sdlog = 1 + ) + cdf_expgrowth <- ppcens( + c(1, 5, 10), plnorm, pwindow, + D = D, + dprimary = dexpgrowth, dprimary_args = list(r = 0.2), + meanlog = 1, sdlog = 1 + ) + expect_false(all(cdf_uniform == cdf_expgrowth)) +}) + +test_that("pprimarycensoreddist is consistent with dprimarycensoreddist", { + pwindow <- 1 + D <- 10 + q <- 0:9 + cdf <- ppcens(q + 1, plnorm, pwindow, D = D, meanlog = 1, sdlog = 1) + pmf <- dpcens(q, plnorm, pwindow, D = D, meanlog = 1, sdlog = 1) + cdf_from_pmf <- cumsum(pmf) + expect_equal(cdf, cdf_from_pmf, tolerance = 1e-6) +}) diff --git a/tests/testthat/test-rpd-primarycensoreddist.R b/tests/testthat/test-rpd-primarycensoreddist.R index 653cc78..4d3154a 100644 --- a/tests/testthat/test-rpd-primarycensoreddist.R +++ b/tests/testthat/test-rpd-primarycensoreddist.R @@ -2,7 +2,8 @@ # and the random number generators for primary events test_that( - "rprimarycensoreddist is consistent with dprimarycensoreddist and pprimarycensoreddist", + "rprimarycensoreddist is consistent with dprimarycensoreddist and + pprimarycensoreddist", { # nolint n <- 10000 pwindow <- 4 @@ -39,18 +40,19 @@ test_that( test_that( - "rprimarycensoreddist is consistent with dprimarycensoreddist and pprimarycensoreddist for exponential growth primary distribution", + "rprimarycensoreddist is consistent with dprimarycensoreddist and + pprimarycensoreddist for exponential growth primary distribution", { # nolint - n <- 10000 + n <- 1e6 pwindow <- 3 - D <- 10 + D <- 6 r <- 0.5 samples <- rpcens( n, rlnorm, pwindow, D = D, rprimary = rexpgrowth, rprimary_args = list(r = r), - meanlog = 1, sdlog = 0.5 + meanlog = 1.5, sdlog = 0.5 ) # Check empirical mean and pmf @@ -58,8 +60,8 @@ test_that( empirical_mean <- mean(samples) empirical_sd <- sd(samples) - expect_equal(empirical_mean, 4.3, tolerance = 0.05) - expect_equal(empirical_sd, 1.6, tolerance = 0.05) + expect_equal(empirical_mean, 4.12, tolerance = 0.01) + expect_equal(empirical_sd, 0.94, tolerance = 0.01) # Check empirical cdf against theoretical cdf x_values <- 0:(D - 1) @@ -68,12 +70,12 @@ test_that( D = D, dprimary = dexpgrowth, dprimary_args = list(r = r), - meanlog = 1, sdlog = 0.5 + meanlog = 1.5, sdlog = 0.5 ) theoretical_mean <- sum(x_values * pmf) - expect_equal(empirical_mean, theoretical_mean, tolerance = 0.05) - expect_equal(empirical_pmf, pmf, tolerance = 0.05) + expect_equal(empirical_mean, theoretical_mean, tolerance = 0.01) + expect_equal(empirical_pmf, pmf, tolerance = 0.01) # Check empirical cdf against theoretical cdf empirical_cdf <- ecdf(samples)(x_values) @@ -81,9 +83,59 @@ test_that( c(x_values[-1], D), plnorm, pwindow, D, dprimary = dexpgrowth, dprimary_args = list(r = r), - meanlog = 1, sdlog = 0.5 + meanlog = 1.5, sdlog = 0.5 ) - expect_equal(empirical_cdf, theoretical_cdf, tolerance = 0.05) + expect_equal(empirical_cdf, theoretical_cdf, tolerance = 0.01) + } +) + +test_that( + "rprimarycensoreddist with wider windows and different delay distribution + mathches p and d numerically", + { + n <- 1e6 + pwindow <- 7 # One week primary window + swindow <- 3 # Three-day secondary window + D <- 30 # Maximum delay of 30 days + + # Using Weibull distribution for delay + shape <- 2 + scale <- 10 + + samples <- rpcens( + n, rweibull, pwindow, swindow, + D = D, shape = shape, scale = scale + ) + + # Check empirical mean and standard deviation + empirical_mean <- mean(samples) + empirical_sd <- sd(samples) + + # Calculate theoretical mean and standard deviation + x_values <- seq(0, D - swindow, by = swindow) + pmf <- dpcens( + x_values, pweibull, pwindow, swindow, + D = D, shape = shape, scale = scale + ) + theoretical_mean <- sum(x_values * pmf) + theoretical_sd <- sqrt(sum((x_values - theoretical_mean)^2 * pmf)) + + # Compare empirical and theoretical statistics + expect_equal(empirical_mean, theoretical_mean, tolerance = 0.1) + expect_equal(empirical_sd, theoretical_sd, tolerance = 0.1) + + # Check empirical PMF against theoretical PMF + empirical_pmf <- as.vector(table(samples) / n) + expect_equal(empirical_pmf, pmf, tolerance = 0.01) + + # Check empirical CDF against theoretical CDF + empirical_cdf <- ecdf(samples)(x_values) + theoretical_cdf <- ppcens( + c(x_values[-1], D), pweibull, pwindow, D, + shape = shape, scale = scale + ) + expect_equal(cumsum(pmf), theoretical_cdf, tolerance = 0.01) + expect_equal(empirical_cdf, theoretical_cdf, tolerance = 0.01) } ) diff --git a/tests/testthat/test-stan-rpd-primarycensoreddist.R b/tests/testthat/test-stan-rpd-primarycensoreddist.R index ebb9ca5..0287179 100644 --- a/tests/testthat/test-stan-rpd-primarycensoreddist.R +++ b/tests/testthat/test-stan-rpd-primarycensoreddist.R @@ -3,24 +3,40 @@ skip_on_os("windows") skip_on_os("mac") test_that("Stan primary_censored_dist_cdf matches R pprimarycensoreddist", { - d <- seq(0, 10, by = 0.5) - dist_id <- 1 # Lognormal - params <- c(0, 1) # meanlog, sdlog - pwindow <- 1 - D <- Inf - primary_dist_id <- 1 # Uniform - primary_params <- array(numeric(0)) - - stan_cdf <- sapply( - d, primary_censored_dist_cdf, dist_id, params, pwindow, D, - primary_dist_id, primary_params - ) - r_cdf <- pprimarycensoreddist( - d, plnorm, - pwindow = pwindow, D = D, meanlog = params[1], sdlog = params[2] + d_values <- list( + seq(0, 10, by = 0.5), + seq(0, 5, by = 0.25), + seq(0, 15, by = 1) ) + D_values <- list(Inf, 10, 15) + + for (d in d_values) { + for (D in D_values) { + dist_id <- 1 # Lognormal + params <- c(0, 1) # meanlog, sdlog + pwindow <- 1 + primary_dist_id <- 1 # Uniform + primary_params <- array(numeric(0)) + + stan_cdf <- sapply( + d, primary_censored_dist_cdf, dist_id, params, pwindow, D, + primary_dist_id, primary_params + ) + r_cdf <- pprimarycensoreddist( + d, plnorm, + pwindow = pwindow, D = D, meanlog = params[1], sdlog = params[2] + ) - expect_equal(stan_cdf, r_cdf, tolerance = 1e-6) + expect_equal( + stan_cdf, r_cdf, + tolerance = 1e-6, + info = paste( + "Failed for d =", paste(range(d), collapse = "-"), + "and D =", D + ) + ) + } + } }) test_that( @@ -39,15 +55,13 @@ test_that( d, primary_censored_dist_lcdf, dist_id, params, pwindow, D, primary_dist_id, primary_params ) - r_lcdf <- log( - pprimarycensoreddist( - d, plnorm, - pwindow = pwindow, D = D, meanlog = params[1], - sdlog = params[2] - ) + r_cdf <- pprimarycensoreddist( + d, plnorm, + pwindow = pwindow, D = D, meanlog = params[1], + sdlog = params[2] ) - expect_equal(stan_lcdf, r_lcdf, tolerance = 1e-6) + expect_equal(exp(stan_lcdf), r_cdf, tolerance = 1e-6) } ) @@ -59,64 +73,90 @@ test_that( dist_id <- 1 # Lognormal params <- c(0, 1) # meanlog, sdlog pwindow <- 1 - swindow <- 1 + d_upper <- 11 D <- 10 primary_dist_id <- 1 # Uniform primary_params <- numeric(0) expect_error( primary_censored_dist_lpmf( - d, dist_id, params, pwindow, swindow, D, primary_dist_id, primary_params + d, dist_id, params, pwindow, d_upper, D, primary_dist_id, primary_params ), "Upper truncation point is greater than D" ) } ) - - test_that( "Stan primary_censored_dist matches R primarycensoreddist when d is the same as D - 1", { - d <- 10 dist_id <- 1 # Lognormal params <- c(0, 1) # meanlog, sdlog pwindow <- 1 primary_dist_id <- 1 # Uniform primary_params <- numeric(0) - stan_pmf <- primary_censored_dist_pmf( - d, dist_id, params, pwindow, 1, d + 1, primary_dist_id, primary_params - ) - r_pmf <- dprimarycensoreddist( - d, plnorm, - pwindow = pwindow, swindow = 1, D = d + 1, - meanlog = params[1], sdlog = params[2] - ) - - expect_equal(stan_pmf, r_pmf, tolerance = 1e-6) + d_values <- 1:10 + D_values <- c(Inf, 11) # Test without and with truncation + + for (D in D_values) { + truncation_label <- if (is.infinite(D)) { + "without truncation" + } else { + "with truncation" + } + for (d in d_values) { + stan_pmf <- primary_censored_dist_pmf( + d, dist_id, params, pwindow, d + 1, D, primary_dist_id, + primary_params + ) + r_pmf <- dprimarycensoreddist( + d, plnorm, + pwindow = pwindow, swindow = 1, D = D, + meanlog = params[1], sdlog = params[2] + ) + expect_equal( + stan_pmf, r_pmf, + tolerance = 1e-6, + info = paste( + "Mismatch for d =", d, truncation_label, "\n", + "Stan PMF:", stan_pmf, "\n", + "R PMF:", r_pmf, "\n", + "Difference:", abs(stan_pmf - r_pmf) + ) + ) + } + } } ) - test_that("Stan primary_censored_dist_pmf matches R dprimarycensoreddist", { d <- 0:10 dist_id <- 1 # Lognormal params <- c(0, 1) # meanlog, sdlog pwindow <- 1 - swindow <- 1 + d_upper <- d + 1 D <- Inf primary_dist_id <- 1 # Uniform primary_params <- numeric(0) - stan_pmf <- sapply( - d, primary_censored_dist_pmf, dist_id, params, pwindow, swindow, D, - primary_dist_id, primary_params + stan_pmf <- mapply( + primary_censored_dist_pmf, + d = d, + d_upper = d + 1, + MoreArgs = list( + dist_id = dist_id, + params = params, + pwindow = pwindow, + D = D, + primary_dist_id = primary_dist_id, + primary_params = primary_params + ) ) r_pmf <- dprimarycensoreddist( d, plnorm, - pwindow = pwindow, swindow = swindow, D = D, + pwindow = pwindow, swindow = 1, D = D, meanlog = params[1], sdlog = params[2] ) @@ -131,19 +171,28 @@ test_that( dist_id <- 1 # Lognormal params <- c(0, 1) # meanlog, sdlog pwindow <- 1 - swindow <- 1 + d_upper <- d + 1 D <- Inf primary_dist_id <- 1 # Uniform primary_params <- numeric(0) - stan_lpmf <- sapply( - d, primary_censored_dist_lpmf, dist_id, params, pwindow, swindow, D, - primary_dist_id, primary_params + stan_lpmf <- mapply( + primary_censored_dist_lpmf, + d = d, + d_upper = d_upper, + MoreArgs = list( + dist_id = dist_id, + params = params, + pwindow = pwindow, + D = D, + primary_dist_id = primary_dist_id, + primary_params = primary_params + ) ) r_lpmf <- log( dprimarycensoreddist( d, plnorm, - pwindow = pwindow, swindow = swindow, D = D, + pwindow = pwindow, swindow = 1, D = D, meanlog = params[1], sdlog = params[2] ) ) @@ -163,11 +212,8 @@ test_that( primary_dist_id <- 1 # Uniform primary_params <- numeric(0) - stan_pmf_approx <- primary_censored_sone_pmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, 1 - ) - stan_pmf_exact <- primary_censored_sone_pmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, 0 + stan_pmf <- primary_censored_sone_pmf_vectorized( + max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params ) r_pmf <- dprimarycensoreddist( 0:max_delay, plnorm, @@ -175,8 +221,7 @@ test_that( meanlog = params[1], sdlog = params[2] ) - expect_equal(stan_pmf_approx, r_pmf, tolerance = 1e-6) - expect_equal(stan_pmf_exact, r_pmf, tolerance = 1e-6) + expect_equal(stan_pmf, r_pmf, tolerance = 1e-6) } ) @@ -192,11 +237,8 @@ test_that( primary_dist_id <- 1 # Uniform primary_params <- numeric(0) - stan_pmf_approx <- primary_censored_sone_pmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, 1 - ) - stan_pmf_exact <- primary_censored_sone_pmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, 0 + stan_pmf <- primary_censored_sone_pmf_vectorized( + max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params ) r_pmf <- dprimarycensoreddist( 0:max_delay, plnorm, @@ -204,9 +246,7 @@ test_that( meanlog = params[1], sdlog = params[2] ) - expect_equal(stan_pmf_approx, r_pmf, tolerance = 1e-2) - expect_equal(stan_pmf_exact, r_pmf, tolerance = 1e-6) - expect_true(all(abs(stan_pmf_exact - r_pmf) < abs(stan_pmf_approx - r_pmf))) + expect_equal(stan_pmf, r_pmf, tolerance = 1e-6) } ) @@ -222,11 +262,8 @@ test_that( primary_dist_id <- 1 # Uniform primary_params <- numeric(0) - stan_pmf_approx <- primary_censored_sone_pmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, 1 - ) - stan_pmf_exact <- primary_censored_sone_pmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, 0 + stan_pmf <- primary_censored_sone_pmf_vectorized( + max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params ) r_pmf <- dprimarycensoreddist( 0:max_delay, plnorm, @@ -234,9 +271,7 @@ test_that( meanlog = params[1], sdlog = params[2] ) - expect_equal(stan_pmf_approx, r_pmf, tolerance = 1e-3) - expect_equal(stan_pmf_exact, r_pmf, tolerance = 1e-6) - expect_true(all(abs(stan_pmf_exact - r_pmf) < abs(stan_pmf_approx - r_pmf))) + expect_equal(stan_pmf, r_pmf, tolerance = 1e-6) } ) @@ -252,11 +287,8 @@ test_that( primary_dist_id <- 1 # Uniform primary_params <- numeric(0) - stan_lpmf_approx <- primary_censored_sone_lpmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, 1 - ) - stan_lpmf_exact <- primary_censored_sone_lpmf_vectorized( - max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params, 0 + stan_lpmf <- primary_censored_sone_lpmf_vectorized( + max_delay, D, dist_id, params, pwindow, primary_dist_id, primary_params ) r_lpmf <- dprimarycensoreddist( 0:max_delay, plnorm, @@ -264,10 +296,6 @@ test_that( meanlog = params[1], sdlog = params[2], log = TRUE ) - expect_equal(stan_lpmf_approx, r_lpmf, tolerance = 1e-6) - expect_equal(stan_lpmf_exact, r_lpmf, tolerance = 1e-8) - expect_true( - all(abs(stan_lpmf_exact - r_lpmf) <= abs(stan_lpmf_approx - r_lpmf)) - ) + expect_equal(stan_lpmf, r_lpmf, tolerance = 1e-6) } ) diff --git a/vignettes/fitting-dists-with-stan.Rmd b/vignettes/fitting-dists-with-stan.Rmd index 0d251f3..b2056c8 100644 --- a/vignettes/fitting-dists-with-stan.Rmd +++ b/vignettes/fitting-dists-with-stan.Rmd @@ -78,7 +78,8 @@ delay_data <- data.frame( pwindow = pwindows, swindow = swindows, obs_time = obs_times, - observed_delay = samples + observed_delay = samples, + observed_delay_upper = samples + swindows ) head(delay_data) @@ -86,7 +87,10 @@ head(delay_data) # Aggregate to unique combinations and count occurrences # Aggregate to unique combinations and count occurrences delay_counts <- delay_data |> - summarise(n = n(), .by = c(pwindow, swindow, obs_time, observed_delay)) + summarise( + n = n(), + .by = c(pwindow, swindow, obs_time, observed_delay, observed_delay_upper) + ) head(delay_counts) @@ -184,8 +188,8 @@ naive_fit <- naive_model$sample( ), chains = 4, parallel_chains = 4, - show_messages = FALSE, - refresh = 0 + refresh = ifelse(interactive(), 50, 0), + show_messages = interactive() ) naive_fit ``` @@ -206,9 +210,9 @@ writeLines( data { int N; // number of observations array[N] int y; // observed delays + array[N] int y_upper; // observed delays upper bound array[N] int n; // number of occurrences for each delay array[N] int pwindow; // primary censoring window - array[N] int swindow; // secondary censoring window array[N] int D; // maximum delay } transformed data { @@ -227,7 +231,7 @@ writeLines( for (i in 1:N) { target += n[i] * primary_censored_dist_lpmf( y[i] | 1, {mu, sigma}, - pwindow[i], swindow[i], D[i], + pwindow[i], y_upper[i], D[i], 1, primary_params ); } @@ -251,22 +255,16 @@ Now let's fit the compiled model. primarycensoreddist_fit <- primarycensoreddist_model$sample( data = list( y = delay_counts$observed_delay, + y_upper = delay_counts$observed_delay_upper, n = delay_counts$n, pwindow = delay_counts$pwindow, - swindow = delay_counts$swindow, D = delay_counts$obs_time, N = nrow(delay_counts) ), chains = 4, parallel_chains = 4, - init = list( # we use this to resolve initialisation issues - list(mu = 1.5, sigma = 0.6), - list(mu = 1.5, sigma = 0.4), - list(mu = 1.5, sigma = 0.3), - list(mu = 1.5, sigma = 0.55) - ), - refresh = 0, - show_messages = FALSE + refresh = ifelse(interactive(), 50, 0), + show_messages = interactive() ) primarycensoreddist_fit ``` diff --git a/vignettes/primarycensoreddist.Rmd b/vignettes/primarycensoreddist.Rmd index 1507acd..3ccc423 100644 --- a/vignettes/primarycensoreddist.Rmd +++ b/vignettes/primarycensoreddist.Rmd @@ -156,14 +156,16 @@ $$ F_{\text{cens}}(q) = \int_{0}^{pwindow} F(q - p) \cdot f_{\text{primary}}(p) \, dp $$ -where ($F$) is the CDF of the primary event distribution, ($f_{\text{primary}}$) is the PDF of the primary event times, and ($pwindow$) is the primary event window. +where $F$ is the CDF of the delay distribution, $f_{\text{primary}}$ is the PDF of the primary event times, and $pwindow$ is the primary event window. -If the maximum delay ($D$) is finite, the CDF is normalized by ($F(D)$): +If the maximum delay $D$ is finite, the CDF is normalized by dividing by $F_{\text{cens}}(D)$: $$ -F_{\text{cens}}(q) = \int_{0}^{pwindow} \frac{F(q - p)}{F(D - p)} \cdot f_{\text{primary}}(p) \, dp +F_{\text{cens,norm}}(q) = \frac{F_{\text{cens}}(q)}{F_{\text{cens}}(D)} $$ +where $F_{\text{cens,norm}}(q)$ is the normalized CDF. + Let's compare the empirical CDF of our samples without secondary censoring to the theoretical CDF computed using `pprimarycensoreddist()`: ```{r cdf-lognormal}