Updated BCF function and added vignette demonstrating how to use feature subsets

andrewherren · andrewherren · commit 4b827b415afe · 2024-06-20T16:34:49.000-05:00
diff --git a/R/bcf.R b/R/bcf.R
@@ -189,8 +189,7 @@ bcf <- function(X_train, Z_train, y_train, pi_train = NULL, group_ids_train = NU
             }
             variable_subset_tau <- keep_vars_tau
         }
-    }
-    if ((is.null(keep_vars_tau)) && (!is.null(drop_vars_tau))) {
+    } else if ((is.null(keep_vars_tau)) && (!is.null(drop_vars_tau))) {
         if (is.character(drop_vars_tau)) {
             if (!all(drop_vars_tau %in% names(X_train))) {
                 stop("drop_vars_tau includes some variable names that are not in X_train")
@@ -304,7 +303,7 @@ bcf <- function(X_train, Z_train, y_train, pi_train = NULL, group_ids_train = NU
     variable_weights_tau <- variable_weights_mu <- variable_weights
     variable_weights_mu[!(original_var_indices %in% variable_subset_mu)] <- 0
     variable_weights_tau[!(original_var_indices %in% variable_subset_tau)] <- 0
-    
+
     # Fill in rfx basis as a vector of 1s (random intercept) if a basis not provided 
     has_basis_rfx <- F
     num_basis_rfx <- 0
@@ -375,14 +374,14 @@ bcf <- function(X_train, Z_train, y_train, pi_train = NULL, group_ids_train = NU
         feature_types <- as.integer(c(feature_types,0))
         X_train <- cbind(X_train, pi_train)
         if (propensity_covariate == "mu") {
-            variable_weights_mu <- c(variable_weights_mu, 1./num_cov_orig)
+            variable_weights_mu <- c(variable_weights_mu, rep(1./num_cov_orig, ncol(pi_train)))
             variable_weights_tau <- c(variable_weights_tau, 0)
         } else if (propensity_covariate == "tau") {
             variable_weights_mu <- c(variable_weights_mu, 0)
-            variable_weights_tau <- c(variable_weights_tau, 1./num_cov_orig)
+            variable_weights_tau <- c(variable_weights_tau, rep(1./num_cov_orig, ncol(pi_train)))
         } else if (propensity_covariate == "both") {
-            variable_weights_mu <- c(variable_weights_mu, 1./num_cov_orig)
-            variable_weights_tau <- c(variable_weights_tau, 1./num_cov_orig)
+            variable_weights_mu <- c(variable_weights_mu, rep(1./num_cov_orig, ncol(pi_train)))
+            variable_weights_tau <- c(variable_weights_tau, rep(1./num_cov_orig, ncol(pi_train)))
         }
         if (has_test) X_test <- cbind(X_test, pi_test)
     }
diff --git a/vignettes/CausalInference.Rmd b/vignettes/CausalInference.Rmd
@@ -792,6 +792,273 @@ mean(cover)
 
 It is clear that causal inference is much more difficult in the presence of **both** strong covariate-dependent prognostic effects and strong group-level random effects. In this sense, proper prior calibration for all three of the $\mu$, $\tau$ and random effects models is crucial.
 
+## Demo 6: Nonlinear Outcome Model, Heterogeneous Treatment Effect using Different Features in the Prognostic and Treatment Forests
+
+Here, we consider the case in which we might prefer to use only a subset of covariates in the treatment effect forest. Why might we want to do that?
+In many cases it is plausible that some covariates (for example, age or income) influence the outcome of interest in a causal problem but do not **moderate** the treatment effect. In this case, we need to include these variables in the prognostic forest for deconfounding, but we do not necessarily need to include them in the treatment effect forest.
+
+### Simulation
+
+We draw from a modified "demo 1" data generating process (DGP); in this version, the treatment effect depends only on $X_1$.
+
+```{r}
+mu <- function(x) {1+g(x)+x[,1]*x[,3]-x[,2]+3*x[,3]}  # prognostic function; g() is not defined in this chunk
+tau <- function(x) {1+0.5*abs(x[,1])-0.25*sin(2*x[,1])}  # treatment effect; depends on x[,1] only
+n <- 500
+snr <- 2  # the outcome noise sd below is sd(E_XZ)/snr
+x1 <- rnorm(n)
+x2 <- rnorm(n)
+x3 <- rnorm(n)
+x4 <- as.numeric(rbinom(n,1,0.5))  # binary covariate
+x5 <- as.numeric(sample(1:3,n,replace=TRUE))  # three-level covariate
+X <- cbind(x1,x2,x3,x4,x5)
+p <- ncol(X)
+mu_x <- mu(X)
+tau_x <- tau(X)
+pi_x <- 0.8*pnorm((3*mu_x/sd(mu_x)) - 0.5*X[,1]) + 0.05 + runif(n)/10  # propensity; lies in [0.05, 0.95]
+Z <- rbinom(n,1,pi_x)  # treatment drawn as Bernoulli(pi_x)
+E_XZ <- mu_x + Z*tau_x  # conditional mean of the outcome
+y <- E_XZ + rnorm(n, 0, 1)*(sd(E_XZ)/snr)
+X <- as.data.frame(X)  # data frame so x4 and x5 can be stored as factors
+X$x4 <- factor(X$x4, ordered = TRUE)
+X$x5 <- factor(X$x5, ordered = TRUE)
+
+# Split data into test and train sets
+test_set_pct <- 0.2
+n_test <- round(test_set_pct*n)  # number of held-out rows
+n_train <- n - n_test
+test_inds <- sort(sample(1:n, n_test, replace = FALSE))  # sorted row indices drawn without replacement
+train_inds <- (1:n)[!((1:n) %in% test_inds)]  # complement of test_inds
+X_test <- X[test_inds,]
+X_train <- X[train_inds,]
+pi_test <- pi_x[test_inds]
+pi_train <- pi_x[train_inds]
+Z_test <- Z[test_inds]
+Z_train <- Z[train_inds]
+y_test <- y[test_inds]
+y_train <- y[train_inds]
+mu_test <- mu_x[test_inds]
+mu_train <- mu_x[train_inds]
+tau_test <- tau_x[test_inds]
+tau_train <- tau_x[train_inds]
+```
+
+### Sampling and Analysis
+
+#### MCMC, full covariate set in $\tau(X)$
+
+Here we simulate from the model with the original MCMC sampler, using all of the covariates in both the prognostic ($\mu(X)$) and treatment effect ($\tau(X)$) forests.
+
+```{r}
+num_gfr <- 0  # 0 here vs. 10 in the warm-start runs; presumably the warm-start iteration count -- confirm in bcf() docs
+num_burnin <- 1000
+num_mcmc <- 1000
+num_samples <- num_gfr + num_burnin + num_mcmc  # total iteration count; not used elsewhere in the visible chunks
+bcf_model_mcmc <- bcf(  # no keep_vars_tau / drop_vars_tau supplied in this call
+    X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, 
+    X_test = X_test, Z_test = Z_test, pi_test = pi_test, 
+    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
+    sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F  # F is shorthand for FALSE
+)
+```
+
+Inspect the burned-in samples
+
+```{r}
+plot(rowMeans(bcf_model_mcmc$mu_hat_test), mu_test,  # row means as point estimates; assumes rows index test observations -- confirm
+     xlab = "predicted", ylab = "actual", main = "Prognostic function")
+abline(0,1,col="red",lty=3,lwd=3)  # y = x reference line
+plot(rowMeans(bcf_model_mcmc$tau_hat_test), tau_test, 
+     xlab = "predicted", ylab = "actual", main = "Treatment effect")
+abline(0,1,col="red",lty=3,lwd=3)  # y = x reference line
+sigma_observed <- var(y-E_XZ)  # variance of the simulated noise, computed over all n observations
+plot_bounds <- c(min(c(bcf_model_mcmc$sigma2_samples, sigma_observed)), 
+                 max(c(bcf_model_mcmc$sigma2_samples, sigma_observed)))  # y-limits covering both the samples and the reference value
+plot(bcf_model_mcmc$sigma2_samples, ylim = plot_bounds, 
+     ylab = "sigma^2", xlab = "Sample", main = "Global variance parameter")
+abline(h = sigma_observed, lty=3, lwd = 3, col = "blue")  # horizontal reference at sigma_observed
+```
+
+Examine test set interval coverage
+
+```{r}
+test_lb <- apply(bcf_model_mcmc$tau_hat_test, 1, quantile, 0.025)  # row-wise 2.5% quantile
+test_ub <- apply(bcf_model_mcmc$tau_hat_test, 1, quantile, 0.975)  # row-wise 97.5% quantile
+cover <- (  # TRUE where the true effect falls inside [test_lb, test_ub]
+    (test_lb <= tau_x[test_inds]) &  # tau_x[test_inds] is the same vector as tau_test
+    (test_ub >= tau_x[test_inds])
+)
+mean(cover)  # fraction of test rows whose interval contains the true effect
+```
+
+And test set RMSE
+
+```{r}
+test_mean <- rowMeans(bcf_model_mcmc$tau_hat_test)  # row-wise mean of tau_hat_test
+sqrt(mean((test_mean - tau_test)^2))  # root mean squared error against the true test-set effects
+```
+
+#### MCMC, covariate subset in $\tau(X)$
+
+Here we simulate from the model with the original MCMC sampler, using only covariate $X_1$ in the treatment effect forest.
+
+```{r}
+num_gfr <- 0  # 0 here vs. 10 in the warm-start runs; presumably the warm-start iteration count -- confirm in bcf() docs
+num_burnin <- 1000
+num_mcmc <- 1000
+num_samples <- num_gfr + num_burnin + num_mcmc  # total iteration count; not used elsewhere in the visible chunks
+bcf_model_mcmc <- bcf(  # overwrites the full-covariate fit stored under the same name
+    X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, 
+    X_test = X_test, Z_test = Z_test, pi_test = pi_test, 
+    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
+    sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F,  # F is shorthand for FALSE
+    keep_vars_tau = c("x1")  # restrict the treatment effect forest to x1
+)
+```
+
+Inspect the BART samples
+
+```{r}
+plot(rowMeans(bcf_model_mcmc$mu_hat_test), mu_test,  # row means as point estimates; assumes rows index test observations -- confirm
+     xlab = "predicted", ylab = "actual", main = "Prognostic function")
+abline(0,1,col="red",lty=3,lwd=3)  # y = x reference line
+plot(rowMeans(bcf_model_mcmc$tau_hat_test), tau_test, 
+     xlab = "predicted", ylab = "actual", main = "Treatment effect")
+abline(0,1,col="red",lty=3,lwd=3)  # y = x reference line
+sigma_observed <- var(y-E_XZ)  # variance of the simulated noise, computed over all n observations
+plot_bounds <- c(min(c(bcf_model_mcmc$sigma2_samples, sigma_observed)), 
+                 max(c(bcf_model_mcmc$sigma2_samples, sigma_observed)))  # y-limits covering both the samples and the reference value
+plot(bcf_model_mcmc$sigma2_samples, ylim = plot_bounds, 
+     ylab = "sigma^2", xlab = "Sample", main = "Global variance parameter")
+abline(h = sigma_observed, lty=3, lwd = 3, col = "blue")  # horizontal reference at sigma_observed
+```
+
+Examine test set interval coverage
+
+```{r}
+test_lb <- apply(bcf_model_mcmc$tau_hat_test, 1, quantile, 0.025)  # row-wise 2.5% quantile
+test_ub <- apply(bcf_model_mcmc$tau_hat_test, 1, quantile, 0.975)  # row-wise 97.5% quantile
+cover <- (  # TRUE where the true effect falls inside [test_lb, test_ub]
+    (test_lb <= tau_x[test_inds]) &  # tau_x[test_inds] is the same vector as tau_test
+    (test_ub >= tau_x[test_inds])
+)
+mean(cover)  # fraction of test rows whose interval contains the true effect
+```
+
+And test set RMSE
+
+```{r}
+test_mean <- rowMeans(bcf_model_mcmc$tau_hat_test)  # row-wise mean of tau_hat_test
+sqrt(mean((test_mean - tau_test)^2))  # root mean squared error against the true test-set effects
+```
+
+#### Warmstart, full covariate set in $\tau(X)$
+
+Here we simulate from the model with the warm-start sampler, using all of the covariates in both the prognostic ($\mu(X)$) and treatment effect ($\tau(X)$) forests.
+
+```{r}
+num_gfr <- 10  # nonzero here, unlike the MCMC-only runs; presumably the warm-start iteration count -- confirm in bcf() docs
+num_burnin <- 0  # no burn-in requested in the warm-start runs
+num_mcmc <- 1000
+num_samples <- num_gfr + num_burnin + num_mcmc  # total iteration count; not used elsewhere in the visible chunks
+bcf_model_warmstart <- bcf(  # no keep_vars_tau / drop_vars_tau supplied in this call
+    X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, 
+    X_test = X_test, Z_test = Z_test, pi_test = pi_test, 
+    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
+    sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F  # F is shorthand for FALSE
+)
+```
+
+Inspect the BART samples that were initialized with an XBART warm-start
+
+```{r}
+plot(rowMeans(bcf_model_warmstart$mu_hat_test), mu_test,  # row means as point estimates; assumes rows index test observations -- confirm
+     xlab = "predicted", ylab = "actual", main = "Prognostic function")
+abline(0,1,col="red",lty=3,lwd=3)  # y = x reference line
+plot(rowMeans(bcf_model_warmstart$tau_hat_test), tau_test, 
+     xlab = "predicted", ylab = "actual", main = "Treatment effect")
+abline(0,1,col="red",lty=3,lwd=3)  # y = x reference line
+sigma_observed <- var(y-E_XZ)  # variance of the simulated noise, computed over all n observations
+plot_bounds <- c(min(c(bcf_model_warmstart$sigma2_samples, sigma_observed)), 
+                 max(c(bcf_model_warmstart$sigma2_samples, sigma_observed)))  # y-limits covering both the samples and the reference value
+plot(bcf_model_warmstart$sigma2_samples, ylim = plot_bounds, 
+     ylab = "sigma^2", xlab = "Sample", main = "Global variance parameter")
+abline(h = sigma_observed, lty=3, lwd = 3, col = "blue")  # horizontal reference at sigma_observed
+```
+
+Examine test set interval coverage
+
+```{r}
+test_lb <- apply(bcf_model_warmstart$tau_hat_test, 1, quantile, 0.025)  # row-wise 2.5% quantile
+test_ub <- apply(bcf_model_warmstart$tau_hat_test, 1, quantile, 0.975)  # row-wise 97.5% quantile
+cover <- (  # TRUE where the true effect falls inside [test_lb, test_ub]
+    (test_lb <= tau_x[test_inds]) &  # tau_x[test_inds] is the same vector as tau_test
+    (test_ub >= tau_x[test_inds])
+)
+mean(cover)  # fraction of test rows whose interval contains the true effect
+```
+
+And test set RMSE
+
+```{r}
+test_mean <- apply(bcf_model_warmstart$tau_hat_test, 1, mean)  # row-wise mean; rowMeans() gives the same result on a matrix
+sqrt(mean((tau_x[test_inds] - test_mean)^2))  # RMSE; tau_x[test_inds] is the same vector as tau_test
+```
+
+#### Warmstart, covariate subset in $\tau(X)$
+
+Here we simulate from the model with the warm-start sampler, using only covariate $X_1$ in the treatment effect forest.
+
+```{r}
+num_gfr <- 10  # nonzero here, unlike the MCMC-only runs; presumably the warm-start iteration count -- confirm in bcf() docs
+num_burnin <- 0  # no burn-in requested in the warm-start runs
+num_mcmc <- 1000
+num_samples <- num_gfr + num_burnin + num_mcmc  # total iteration count; not used elsewhere in the visible chunks
+bcf_model_warmstart <- bcf(  # overwrites the full-covariate warm-start fit stored under the same name
+    X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, 
+    X_test = X_test, Z_test = Z_test, pi_test = pi_test, 
+    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
+    sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F,  # F is shorthand for FALSE
+    keep_vars_tau = c("x1")  # restrict the treatment effect forest to x1
+)
+```
+
+Inspect the BART samples that were initialized with an XBART warm-start
+
+```{r}
+plot(rowMeans(bcf_model_warmstart$mu_hat_test), mu_test,  # row means as point estimates; assumes rows index test observations -- confirm
+     xlab = "predicted", ylab = "actual", main = "Prognostic function")
+abline(0,1,col="red",lty=3,lwd=3)  # y = x reference line
+plot(rowMeans(bcf_model_warmstart$tau_hat_test), tau_test, 
+     xlab = "predicted", ylab = "actual", main = "Treatment effect")
+abline(0,1,col="red",lty=3,lwd=3)  # y = x reference line
+sigma_observed <- var(y-E_XZ)  # variance of the simulated noise, computed over all n observations
+plot_bounds <- c(min(c(bcf_model_warmstart$sigma2_samples, sigma_observed)), 
+                 max(c(bcf_model_warmstart$sigma2_samples, sigma_observed)))  # y-limits covering both the samples and the reference value
+plot(bcf_model_warmstart$sigma2_samples, ylim = plot_bounds, 
+     ylab = "sigma^2", xlab = "Sample", main = "Global variance parameter")
+abline(h = sigma_observed, lty=3, lwd = 3, col = "blue")  # horizontal reference at sigma_observed
+```
+
+Examine test set interval coverage
+
+```{r}
+test_lb <- apply(bcf_model_warmstart$tau_hat_test, 1, quantile, 0.025)  # row-wise 2.5% quantile
+test_ub <- apply(bcf_model_warmstart$tau_hat_test, 1, quantile, 0.975)  # row-wise 97.5% quantile
+cover <- (  # TRUE where the true effect falls inside [test_lb, test_ub]
+    (test_lb <= tau_x[test_inds]) &  # tau_x[test_inds] is the same vector as tau_test
+    (test_ub >= tau_x[test_inds])
+)
+mean(cover)  # fraction of test rows whose interval contains the true effect
+```
+
+And test set RMSE
+
+```{r}
+test_mean <- apply(bcf_model_warmstart$tau_hat_test, 1, mean)  # row-wise mean; rowMeans() gives the same result on a matrix
+sqrt(mean((tau_x[test_inds] - test_mean)^2))  # RMSE; tau_x[test_inds] is the same vector as tau_test
+```
+
 # Continuous Treatment
 
 ## Demo 1: Nonlinear Outcome Model, Heterogeneous Treatment Effect