From 94a1d7609ff921e4a7f9901c3074d67b2cf6c723 Mon Sep 17 00:00:00 2001 From: jchristopherson Date: Mon, 25 Mar 2024 14:52:20 -0500 Subject: [PATCH] Add bootstrapping --- src/fstats.f90 | 5 + src/fstats_bootstrap.f90 | 295 +++++++++++++++++++++++++++++++++------ 2 files changed, 260 insertions(+), 40 deletions(-) diff --git a/src/fstats.f90 b/src/fstats.f90 index e73e6b7..ca8d837 100644 --- a/src/fstats.f90 +++ b/src/fstats.f90 @@ -63,6 +63,11 @@ module fstats public :: trimmed_mean public :: difference public :: factorial + public :: bootstrap_resampling_routine + public :: bootstrap_statistic_routine + public :: random_resample + public :: bootstrap_statistics + public :: bootstrap public :: bootstrap_regression_statistics public :: bootstrap_linear_least_squares public :: bootstrap_nonlinear_least_squares diff --git a/src/fstats_bootstrap.f90 b/src/fstats_bootstrap.f90 index 69c9c8d..584a72d 100644 --- a/src/fstats_bootstrap.f90 +++ b/src/fstats_bootstrap.f90 @@ -8,6 +8,11 @@ module fstats_bootstrap use fstats_regression implicit none private + public :: bootstrap_resampling_routine + public :: bootstrap_statistic_routine + public :: random_resample + public :: bootstrap_statistics + public :: bootstrap public :: bootstrap_regression_statistics public :: bootstrap_linear_least_squares public :: bootstrap_nonlinear_least_squares @@ -41,15 +46,173 @@ module fstats_bootstrap !! The lower limit of the confidence interval for the parameter. end type + type bootstrap_statistics + !! A collection of statistics resulting from the bootstrap process. + real(real64) :: statistic_value + !! The value of the statistic of interest. + real(real64) :: upper_confidence_interval + !! The upper confidence limit on the statistic. + real(real64) :: lower_confidence_interval + !! The lower confidence limit on the statistic. + real(real64) :: bias + !! The bias in the statistic. + real(real64) :: standard_error + !! The standard error of the statistic. + real(real64), allocatable, dimension(:) :: population + !! An array of the population values generated by the bootstrap + !! process. + end type + + interface + subroutine bootstrap_resampling_routine(x, xn) + !! Defines the signature of a subroutine used to compute a + !! resampling of data for bootstrapping purposes. + use iso_fortran_env, only : real64 + real(real64), intent(in), dimension(:) :: x + !! The N-element array to resample. + real(real64), intent(out), dimension(size(x)) :: xn + !! An N-element array where the resampled data set will be + !! written. + end subroutine + + function bootstrap_statistic_routine(x) result(rst) + !! Defines the signature of a function for computing the desired + !! bootstrap statistic. + use iso_fortran_env, only : real64 + real(real64), intent(in), dimension(:) :: x + !! The array of data to analyze. + real(real64) :: rst + !! The resulting statistic. + end function + end interface + contains +! ****************************************************************************** +! RESAMPLING +! ------------------------------------------------------------------------------ +subroutine random_resample(x, xn) + !! Random resampling, with replacement, based upon a normal distribution. + real(real64), intent(in), dimension(:) :: x + !! The N-element array to resample. + real(real64), intent(out), dimension(size(x)) :: xn + !! An N-element array where the resampled data set will be written. + + ! Parameters + real(real64), parameter :: half = 0.5d0 + + ! Local Variables + integer(int32) :: n + real(real64) :: eps + + ! Process + n = size(x) + eps = standard_deviation(x) / sqrt(real(n, real64)) + call random_number(xn) + xn = eps * (xn - half) + x +end subroutine + +! ****************************************************************************** +! BOOTSTRAPPING +! ------------------------------------------------------------------------------ +function bootstrap(stat, x, method, nsamples, alpha) result(rst) + !! Performs a bootstrap calculation on the supplied data set for the given + !! statistic. The default implementation utlizes a random resampling with + !! replacement. Other resampling methods may be defined by specifying an + !! appropriate routine by means of the method input. + procedure(bootstrap_statistic_routine), pointer, intent(in) :: stat + !! The routine used to compute the desired statistic. + real(real64), intent(in), dimension(:) :: x + !! The N-element data set. + procedure(bootstrap_resampling_routine), pointer, intent(in), optional :: method + !! An optional pointer to the method to use for resampling of the data. + !! If no method is supplied, a random resampling is utilized. + integer(int32), intent(in), optional :: nsamples + !! An optional input, that if supplied, specifies the number of + !! resampling runs to perform. The default is 10 000. + real(real64), intent(in), optional :: alpha + !! An optional input, that if supplied, defines the significance level + !! to use for the analysis. The default is 0.05. + type(bootstrap_statistics) :: rst + !! The resulting bootstrap_statistics type containing the confidence + !! intervals, bias, standard error, etc. for the analyzed statistic. + + ! Parameters + real(real64), parameter :: half = 0.5d0 + real(real64), parameter :: p05 = 5.0d-2 + + ! Local Variables + integer(int32) :: i, i1, i2, n, ns + real(real64) :: a + real(real64), allocatable, dimension(:) :: xn + procedure(bootstrap_resampling_routine), pointer :: resample + + ! Initialization + n = size(x) + if (present(method)) then + resample => method + else + resample => random_resample + end if + if (present(nsamples)) then + ns = nsamples + else + ns = 10000 + end if + if (present(alpha)) then + a = alpha + else + a = p05 + end if + allocate(rst%population(ns)) + i1 = floor(half * a * ns, int32) + i2 = ns - i1 + 1 + + ! Analyze the basic data set + rst%statistic_value = stat(x) + rst%population(1) = rst%statistic_value + + ! Resampling Process + call random_init(.false., .true.) +#ifdef USEOPENMP + ! Use OpenMP to run operations in parallel +!$OMP PARALLEL DO PRIVATE(xn) SHARED(rst) + do i = 2, ns + ! Resample the data + call resample(x, xn) + + ! Compute the statistic + rst%population(i) = stat(xn) + end do +!$OMP END PARALLEL DO +#else + ! OpenMP is not available - run in a serial manner + allocate(xn(n)) + do i = 2, ns + ! Resample the data + call resample(x, xn) + + ! Compute the statistic for the resampled data + rst%population(i) = stat(xn) + end do +#endif + + ! Compute the relevant quantities on the resampled statistic + rst%upper_confidence_interval = rst%population(i2) + rst%lower_confidence_interval = rst%population(i1) + rst%bias = mean(rst%population) - rst%statistic_value + rst%standard_error = standard_deviation(rst%population) +end function + ! ****************************************************************************** ! LINEAR REGRESSION ! ------------------------------------------------------------------------------ subroutine bootstrap_linear_least_squares(order, intercept, x, y, & - coeffs, ymod, resid, nsamples, stats, bias, alpha, err) + coeffs, ymod, resid, nsamples, stats, bias, alpha, method, bscoeffs, err) !! Computes a linear least-squares regression to fit a set of data. !! Bootstrapping is utilized to gain insight into the quality of - !! the fit. + !! the fit. Resampling for the bootstrap process is a random resampling + !! with replacement process with the range of values limited by the + !! standard deviation of the original data set. integer(int32), intent(in) :: order !! The order of the equation to fit. This value must be at !! least one (linear equation), but can be higher as desired, @@ -81,11 +244,20 @@ subroutine bootstrap_linear_least_squares(order, intercept, x, y, & real(real64), intent(out), optional, dimension(:) :: bias !! An ORDER+1 element array where an estimate of the bias of !! each coefficient is returned based upon the results of the - !! bootstrapping analysis. + !! bootstrapping analysis. The bias is computed as the difference + !! between the mean of the boostrap population results for the given + !! parameter and the original estimate of the given parameter. real(real64), intent(in), optional :: alpha !! The significance level at which to evaluate the confidence !! intervals. The default value is 0.05 such that a 95% !! confidence interval is calculated. + procedure(bootstrap_resampling_routine), pointer, intent(in), optional :: method + !! An optional pointer to the method to use for resampling of the data. + !! If no method is supplied, a random resampling is utilized. + real(real64), intent(out), optional, allocatable, target, dimension(:,:) :: bscoeffs + !! An optional, allocatable matrix, containing the bootstrap + !! distributions for each parameter stored in each row of the matrix + !! such that the resulting matrix is NCOEFFS -by- NSAMPLES. class(errors), intent(inout), optional, target :: err !! A mechanism for communicating errors and warnings to the !! caller. Possible warning and error codes are as follows. @@ -99,15 +271,16 @@ subroutine bootstrap_linear_least_squares(order, intercept, x, y, & ! Parameters real(real64), parameter :: zero = 0.0d0 real(real64), parameter :: p05 = 5.0d-2 - real(real64), parameter :: half = 5.0d-1 ! Local Variables - integer(int32) :: i, j, n, ns, nc, ncoeffs, flag - real(real64) :: eps, alph + integer(int32) :: i, j, n, ns, nc, ncoeffs, flag, nthreads, thread + real(real64) :: alph real(real64), allocatable, dimension(:) :: fLocal, yLocal, rLocal - real(real64), allocatable, dimension(:,:) :: allcoeffs + real(real64), allocatable, target, dimension(:,:) :: coeffstorage + real(real64), pointer, dimension(:,:) :: allcoeffs class(errors), pointer :: errmgr type(errors), target :: deferr + procedure(bootstrap_resampling_routine), pointer :: resample ! Initialization if (present(err)) then @@ -125,10 +298,16 @@ subroutine bootstrap_linear_least_squares(order, intercept, x, y, & else alph = p05 end if + if (present(method)) then + resample => method + else + resample => random_resample + end if n = size(x) ncoeffs = order + 1 nc = order if (intercept) nc = nc + 1 + nthreads = omp_get_num_threads() ! Compute the fit call linear_least_squares(order, intercept, x, y, coeffs, & @@ -136,29 +315,41 @@ subroutine bootstrap_linear_least_squares(order, intercept, x, y, & if (errmgr%has_error_occurred()) return ! Memory Allocations - allocate(allcoeffs(ncoeffs, ns), source = zero, stat = flag) - if (flag /= 0) then - call report_memory_error(errmgr, "bootstrap_linear_least_squares", flag) - return + if (present(bscoeffs)) then + allocate(bscoeffs(ncoeffs, ns), source = zero, stat = flag) + if (flag /= 0) then + call report_memory_error(errmgr, "bootstrap_linear_least_squares", & + flag) + return + end if + allcoeffs => bscoeffs + else + allocate(coeffstorage(ncoeffs, ns), source = zero, stat = flag) + if (flag /= 0) then + call report_memory_error(errmgr, "bootstrap_linear_least_squares", & + flag) + return + end if + allcoeffs => coeffstorage end if allcoeffs(:,1) = coeffs - ! Establish the epsilon term - eps = standard_deviation(y) / sqrt(real(n)) - call random_init(.false., .true.) - ! Cycle over each data set and perform the fit + call random_init(.false., .true.) #ifdef USEOPENMP !$OMP PARALLEL DO PRIVATE(fLocal, yLocal, rLocal) SHARED(allcoeffs) do i = 2, ns + ! Get the current thread number + ! The +1 is because OpenMP is zero-based for thread numbering + thread = omp_get_thread_num() + 1 + ! Allocate local arrays on a per-thread basis if (.not.allocated(fLocal)) allocate(fLocal(n)) if (.not.allocated(yLocal)) allocate(yLocal(n)) if (.not.allocated(rLocal)) allocate(rLocal(n)) ! Compute a random data set - call random_number(yLocal) - yLocal = eps * (yLocal - half) + y + call resample(y, yLocal) ! Compute the fit of the perturbed data set call linear_least_squares(order, intercept, x, yLocal, & @@ -170,8 +361,7 @@ subroutine bootstrap_linear_least_squares(order, intercept, x, y, & allocate(fLocal(n), yLocal(n), rLocal(n)) do i = 2, ns ! Compute a random data set - call random_number(yLocal) - yLocal = eps * (yLocal - half) + y + call resample(y, yLocal) ! Compute the fit of the perturbed data set call linear_least_squares(order, intercept, x, yLocal, & @@ -195,7 +385,7 @@ subroutine bootstrap_linear_least_squares(order, intercept, x, y, & ! Perform the calculations do i = 1, ncoeffs - bias(i) = coeffs(i) - mean(allcoeffs(i,:)) + bias(i) = mean(allcoeffs(i,:)) - coeffs(i) end do end if end subroutine @@ -205,10 +395,13 @@ subroutine bootstrap_linear_least_squares(order, intercept, x, y, & ! ------------------------------------------------------------------------------ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & nsamples, weights, maxp, minp, stats, alpha, controls, settings, info, & - bias, err) + bias, method, bscoeffs, err) !! Performs a nonlinear regression to fit a model using a version !! of the Levenberg-Marquardt algorithm. Bootstrapping is utilized to gain - !! insight into the quality of the fit. + !! insight into the quality of the fit. Resampling for the bootstrap + !! process is a random resampling with replacement process with the + !! range of values limited by the standard deviation of the original + !! data set. procedure(regression_function), intent(in), pointer :: fun !! A pointer to the regression_function to evaluate. real(real64), intent(in) :: x(:) @@ -258,7 +451,17 @@ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & real(real64), intent(out), optional, dimension(:) :: bias !! An optional N-element array that, if supplied, will be used to !! provide an estimate of the bias of each model parameter based upon - !! the results of the bootstrapping analysis. + !! the results of the bootstrapping analysis. The bias is computed as + !! the difference between the mean of the boostrap population results + !! for the given parameter and the original estimate of the given + !! parameter. + procedure(bootstrap_resampling_routine), pointer, intent(in), optional :: method + !! An optional pointer to the method to use for resampling of the data. + !! If no method is supplied, a random resampling is utilized. + real(real64), intent(out), optional, allocatable, target, dimension(:,:) :: bscoeffs + !! An optional, allocatable matrix, containing the bootstrap + !! distributions for each parameter stored in each row of the matrix + !! such that the resulting matrix is NCOEFFS -by- NSAMPLES. class(errors), intent(inout), optional, target :: err !! A mechanism for communicating errors and warnings to the !! caller. Possible warning and error codes are as follows. @@ -277,13 +480,14 @@ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & ! Parameters real(real64), parameter :: zero = 0.0d0 real(real64), parameter :: p05 = 5.0d-2 - real(real64), parameter :: half = 5.0d-1 ! Local Variables integer(int32) :: i, n, ns, nparams, flag - real(real64) :: eps, alph + real(real64) :: alph real(real64), allocatable, dimension(:) :: fLocal, yLocal, rLocal - real(real64), allocatable, dimension(:,:) :: allcoeffs + real(real64), allocatable, target, dimension(:,:) :: coeffstorage + real(real64), pointer, dimension(:,:) :: allcoeffs + procedure(bootstrap_resampling_routine), pointer :: resample class(errors), pointer :: errmgr type(errors), target :: deferr @@ -303,6 +507,11 @@ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & else alph = p05 end if + if (present(method)) then + resample => method + else + resample => random_resample + end if n = size(x) nparams = size(params) @@ -312,11 +521,22 @@ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & controls = controls, settings = settings, info = info, err = err) ! Memory Allocations - allocate(allcoeffs(nparams, ns), source = zero, stat = flag) - if (flag /= 0) then - call report_memory_error(errmgr, "bootstrap_nonlinear_least_squares", & - flag) - return + if (present(bscoeffs)) then + allocate(bscoeffs(nparams, ns), source = zero, stat = flag) + if (flag /= 0) then + call report_memory_error(errmgr, & + "bootstrap_nonlinear_least_squares", flag) + return + end if + allcoeffs => bscoeffs + else + allocate(coeffstorage(nparams, ns), source = zero, stat = flag) + if (flag /= 0) then + call report_memory_error(errmgr, & + "bootstrap_nonlinear_least_squares", flag) + return + end if + allcoeffs => coeffstorage end if allcoeffs(:,1) = params @@ -327,11 +547,8 @@ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & allcoeffs(i,:) = params(i) end do - ! Establish the epsilon term - eps = standard_deviation(y) / sqrt(real(n)) - call random_init(.false., .true.) - ! Cycle over each data set and perform the fit + call random_init(.false., .true.) #ifdef USEOPENMP !$OMP PARALLEL DO do i = 2, ns @@ -341,8 +558,7 @@ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & if (.not.allocated(rLocal)) allocate(rLocal(n)) ! Compute a random data set - call random_number(yLocal) - yLocal = eps * (yLocal - half) + y + call resample(y, yLocal) ! Compute the fit of the perturbed data set call nonlinear_least_squares(fun, x, yLocal, allcoeffs(:,i), fLocal, & @@ -355,8 +571,7 @@ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & allocate(fLocal(n), yLocal(n), rLocal(n)) do i = 2, ns ! Compute a random data set - call random_number(yLocal) - yLocal = eps * (yLocal - half) + y + call resample(y, yLocal) ! Compute the fit of the perturbed data set call nonlinear_least_squares(fun, x, yLocal, allcoeffs(:,i), fLocal, & @@ -391,7 +606,7 @@ subroutine bootstrap_nonlinear_least_squares(fun, x, y, params, ymod, resid, & ! Perform the calculations do i = 1, nparams - bias(i) = params(i) - mean(allcoeffs(i,:)) + bias(i) = mean(allcoeffs(i,:)) - params(i) end do end if end subroutine