From 2ca1bbe9ae0cf07c251751ede22cf71e813d94d6 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Tue, 25 Jun 2024 01:07:54 -0500
Subject: [PATCH 1/2] Updated python interface to allow the use of different feature subsets in mu and tau forests for BCF

---
 R/utils.R                                     |   2 -
 .../causal_inference_feature_subsets.ipynb    | 302 ++++++++++++++++++
 include/stochtree/tree_sampler.h              |  24 +-
 stochtree/bcf.py                              | 295 ++++++++++++-----
 stochtree/preprocessing.py                    |  15 +-
 5 files changed, 533 insertions(+), 105 deletions(-)
 create mode 100644 demo/notebooks/causal_inference_feature_subsets.ipynb

diff --git a/R/utils.R b/R/utils.R
index 7c8803c9..7afffe27 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -64,7 +64,6 @@ preprocessPredictionData <- function(input_data, metadata) {
 #' Returns a list including a matrix of preprocessed covariate values and associated tracking.
 #'
 #' @param input_matrix Covariate matrix.
-#' @param variable_weights Numeric weights reflecting the relative probability of splitting on each variable
 #'
 #' @return List with preprocessed (unmodified) data and details on the number of each type
 #' of variable, unique categories associated with categorical variables, and the
@@ -142,7 +141,6 @@ preprocessPredictionMatrix <- function(input_matrix, metadata) {
 #'
 #' @param input_df Dataframe of covariates. Users must pre-process any
 #' categorical variables as factors (ordered for ordered categorical).
-#' @param variable_weights Numeric weights reflecting the relative probability of splitting on each variable
 #'
 #' @return List with preprocessed data and details on the number of each type
 #' of variable, unique categories associated with categorical variables, and the
diff --git a/demo/notebooks/causal_inference_feature_subsets.ipynb b/demo/notebooks/causal_inference_feature_subsets.ipynb
new file mode 100644
index 00000000..1d35bb5c
--- /dev/null
+++ b/demo/notebooks/causal_inference_feature_subsets.ipynb
@@ -0,0 +1,302 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Causal Inference with Feature Subsets Demo Notebook\n",
+    "\n",
+    "This notebook is a variation on the main causal inference demo, showing how a user might use only a subset of covariates in the treatment effect forest. \n",
+    "Why might we want to do that? In many cases it is plausible that some covariates (for example, age, income, etc.) influence the outcome of interest \n",
+    "in a causal problem but do not **moderate** the treatment effect. In this case, we'd need to include these variables in the prognostic forest for deconfounding, \n",
+    "but we don't necessarily need to include them in the treatment effect forest."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from stochtree import BCFModel\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate sample data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# RNG\n", + "rng = np.random.default_rng()\n", + "\n", + "# Generate covariates and basis\n", + "n = 1000\n", + "p_X = 10\n", + "X = rng.uniform(0, 1, (n, p_X))\n", + "pi_X = 0.25 + 0.5*X[:,0]\n", + "Z = rng.binomial(1, pi_X, n).astype(float)\n", + "\n", + "# Define the outcome mean functions (prognostic and treatment effects)\n", + "mu_X = pi_X*5 + 2*X[:,2]\n", + "tau_X = 1 - 2*X[:,0] + 2*X[:,1] + 1*X[:,0]*X[:,1]\n", + "\n", + "# Generate outcome\n", + "epsilon = rng.normal(0, 1, n)\n", + "y = mu_X + tau_X*Z + epsilon" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Test-train split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_inds = np.arange(n)\n", + "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", + "X_train = X[train_inds,:]\n", + "X_test = X[test_inds,:]\n", + "Z_train = Z[train_inds]\n", + "Z_test = Z[test_inds]\n", + "y_train = y[train_inds]\n", + "y_test = y[test_inds]\n", + "pi_train = pi_X[train_inds]\n", + "pi_test = pi_X[test_inds]\n", + "mu_train = mu_X[train_inds]\n", + "mu_test = mu_X[test_inds]\n", + "tau_train = tau_X[train_inds]\n", + "tau_test = tau_X[test_inds]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run BCF without feature subsetting for $\\tau(X)$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bcf_model = BCFModel()\n", + "bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect the MCMC (BART) samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_y_mcmc = bcf_model.y_hat_test\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", + "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_tau_mcmc = bcf_model.tau_hat_test\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", + "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_mu_mcmc = bcf_model.mu_hat_test\n", + "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", + "mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples[bcf_model.num_gfr:],axis=1), np.expand_dims(bcf_model.b1_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", + "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run BCF, subsetting to the two features that show up in $\\tau(X)$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bcf_model_subset = BCFModel()\n", + "bcf_model_subset.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100, keep_vars_tau=[0,1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect the MCMC (BART) samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_y_mcmc = bcf_model_subset.y_hat_test\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", + "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_tau_mcmc = bcf_model_subset.tau_hat_test\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", + "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_mu_mcmc = bcf_model_subset.mu_hat_test\n", + "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", + "mu_df_mcmc = 
pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples - bcf_model_subset.num_gfr),axis=1), \n", + " np.expand_dims(bcf_model_subset.global_var_samples[bcf_model_subset.num_gfr:],axis=1)), axis = 1), \n", + " columns=[\"Sample\", \"Sigma\"])\n", + "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples - bcf_model_subset.num_gfr),axis=1), \n", + " np.expand_dims(bcf_model_subset.b0_samples[bcf_model_subset.num_gfr:],axis=1), \n", + " np.expand_dims(bcf_model_subset.b1_samples[bcf_model_subset.num_gfr:],axis=1)), axis = 1), \n", + " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", + "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "stochtree-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/include/stochtree/tree_sampler.h b/include/stochtree/tree_sampler.h index 243ec39d..e9ef65d9 100644 --- a/include/stochtree/tree_sampler.h +++ b/include/stochtree/tree_sampler.h @@ -113,15 +113,9 @@ static inline bool NodeNonConstant(ForestDataset& dataset, ForestTracker& tracke static inline void AddSplitToModel(ForestTracker& tracker, ForestDataset& dataset, TreePrior& tree_prior, TreeSplit& split, std::mt19937& gen, Tree* tree, int tree_num, int leaf_node, int feature_split, bool keep_sorted = false) { // Use zeros as a "temporary" leaf values since we draw leaf parameters after tree sampling is complete - int basis_dim = dataset.NumBasis(); - if (dataset.HasBasis()) { - if (basis_dim > 1) { - std::vector temp_leaf_values(basis_dim, 0.); - tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_values, temp_leaf_values); - } else { - double temp_leaf_value = 0.; - tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_value, temp_leaf_value); - } + if (tree->OutputDimension() > 1) { + std::vector temp_leaf_values(tree->OutputDimension(), 0.); + tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_values, temp_leaf_values); } else { double temp_leaf_value = 0.; tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_value, temp_leaf_value); @@ -135,15 +129,9 @@ static inline void AddSplitToModel(ForestTracker& tracker, ForestDataset& datase static inline void RemoveSplitFromModel(ForestTracker& tracker, ForestDataset& dataset, TreePrior& tree_prior, std::mt19937& gen, Tree* tree, int tree_num, int leaf_node, int left_node, int right_node, bool keep_sorted = false) { // Use zeros as a "temporary" leaf values since we 
draw leaf parameters after tree sampling is complete - int basis_dim = dataset.NumBasis(); - if (dataset.HasBasis()) { - if (basis_dim > 1) { - std::vector temp_leaf_values(basis_dim, 0.); - tree->CollapseToLeaf(leaf_node, temp_leaf_values); - } else { - double temp_leaf_value = 0.; - tree->CollapseToLeaf(leaf_node, temp_leaf_value); - } + if (tree->OutputDimension() > 1) { + std::vector temp_leaf_values(tree->OutputDimension(), 0.); + tree->CollapseToLeaf(leaf_node, temp_leaf_values); } else { double temp_leaf_value = 0.; tree->CollapseToLeaf(leaf_node, temp_leaf_value); diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 6509ebc8..5b6d2351 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -6,7 +6,7 @@ from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor from sklearn.model_selection import GridSearchCV, KFold from sklearn.utils import check_scalar -from typing import Optional +from typing import Optional, Union from scipy.linalg import lstsq from scipy.stats import gamma from .bart import BARTModel @@ -28,17 +28,19 @@ def __init__(self) -> None: def is_sampled(self) -> bool: return self.sampled - def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_train: np.array = None, - X_test: np.array = None, Z_test: np.array = None, pi_test: np.array = None, + def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_train: np.array, pi_train: np.array = None, + X_test: Union[pd.DataFrame, np.array] = None, Z_test: np.array = None, pi_test: np.array = None, cutpoint_grid_size = 100, sigma_leaf_mu: float = None, sigma_leaf_tau: float = None, alpha_mu: float = 0.95, alpha_tau: float = 0.25, beta_mu: float = 2.0, beta_tau: float = 3.0, min_samples_leaf_mu: int = 5, min_samples_leaf_tau: int = 5, nu: float = 3, lamb: float = None, a_leaf_mu: float = 3, a_leaf_tau: float = 3, b_leaf_mu: float = None, b_leaf_tau: float = None, - q: float = 0.9, sigma2: float = None, num_trees_mu: int = 200, num_trees_tau: int = 50, - num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, sample_sigma_global: bool = True, - sample_sigma_leaf_mu: bool = True, sample_sigma_leaf_tau: bool = False, propensity_covariate: str = "mu", - adaptive_coding: bool = True, b_0: float = -0.5, b_1: float = 0.5, random_seed: int = -1, - keep_burnin: bool = False, keep_gfr: bool = False) -> None: + q: float = 0.9, sigma2: float = None, variable_weights: np.array = None, + keep_vars_mu: Union[list, np.array] = None, drop_vars_mu: Union[list, np.array] = None, + keep_vars_tau: Union[list, np.array] = None, drop_vars_tau: Union[list, np.array] = None, + num_trees_mu: int = 200, num_trees_tau: int = 50, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, + sample_sigma_global: bool = True, sample_sigma_leaf_mu: bool = True, sample_sigma_leaf_tau: bool = False, + propensity_covariate: str = "mu", adaptive_coding: bool = True, b_0: float = -0.5, b_1: float = 0.5, + random_seed: int = -1, keep_burnin: bool = False, keep_gfr: bool = False) -> None: """Runs a BCF sampler on provided training set. Outcome predictions and estimates of the prognostic and treatment effect functions will be cached for the training set and (if provided) the test set. @@ -102,6 +104,16 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra Quantile used to calibrated ``lamb`` as in Sparapani et al (2021). Defaults to ``0.9``. sigma2 : :obj:`float`, optional Starting value of global variance parameter. 
Calibrated internally as in Sparapani et al (2021) if not set here.
+        variable_weights : :obj:`np.array`, optional
+            Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. Defaults to ``np.repeat(1/X_train.shape[1], X_train.shape[1])`` if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to ``1/X_train.shape[1]``. If you wish to provide a custom weight for the propensity score, include it as a column in ``X_train``, set ``propensity_covariate`` to ``'none'``, and adjust ``keep_vars_mu`` and ``keep_vars_tau`` accordingly.
+        keep_vars_mu : :obj:`list` or :obj:`np.array`, optional
+            Vector of variable names or column indices denoting variables that should be included in the prognostic (``mu(X)``) forest. Defaults to ``None``.
+        drop_vars_mu : :obj:`list` or :obj:`np.array`, optional
+            Vector of variable names or column indices denoting variables that should be excluded from the prognostic (``mu(X)``) forest. Defaults to ``None``. If both ``drop_vars_mu`` and ``keep_vars_mu`` are set, ``drop_vars_mu`` will be ignored.
+        keep_vars_tau : :obj:`list` or :obj:`np.array`, optional
+            Vector of variable names or column indices denoting variables that should be included in the treatment effect (``tau(X)``) forest. Defaults to ``None``.
+        drop_vars_tau : :obj:`list` or :obj:`np.array`, optional
+            Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (``tau(X)``) forest. Defaults to ``None``. If both ``drop_vars_tau`` and ``keep_vars_tau`` are set, ``drop_vars_tau`` will be ignored.
         num_trees_mu : :obj:`int`, optional
             Number of trees in the prognostic forest. Defaults to ``200``.
         num_trees_tau : :obj:`int`, optional
@@ -142,6 +154,15 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
         self : BCFModel
             Sampled BCF Model.
""" + # Variable weight preprocessing (and initialization if necessary) + if variable_weights is None: + if X_train.ndim > 1: + variable_weights = np.repeat(1/X_train.shape[1], X_train.shape[1]) + else: + variable_weights = np.repeat(1., 1) + if np.any(variable_weights < 0): + raise ValueError("variable_weights cannot have any negative weights") + # Check data inputs if not isinstance(X_train, pd.DataFrame) and not isinstance(X_train, np.ndarray): raise ValueError("X_train must be a pandas dataframe or numpy array") @@ -185,6 +206,9 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra if pi_test is not None: if pi_test.ndim == 1: pi_test = np.expand_dims(pi_test, 1) + + # Original number of covariates + num_cov_orig = X_train.shape[1] # Data checks if X_test is not None: @@ -311,6 +335,124 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra if not isinstance(keep_gfr, bool): raise ValueError("keep_gfr must be a bool") + # Standardize the keep variable lists to numeric indices + if keep_vars_mu is not None: + if isinstance(keep_vars_mu, list): + if all(isinstance(i, str) for i in keep_vars_mu): + if not np.all(np.isin(keep_vars_mu, X_train.columns)): + raise ValueError("keep_vars_mu includes some variable names that are not in X_train") + variable_subset_mu = [i for i in X_train.shape[1] if keep_vars_mu.count(X_train.columns.array[i]) > 0] + elif all(isinstance(i, int) for i in keep_vars_mu): + if any(i >= X_train.shape[1] for i in keep_vars_mu): + raise ValueError("keep_vars_mu includes some variable indices that exceed the number of columns in X_train") + if any(i < 0 for i in keep_vars_mu): + raise ValueError("keep_vars_mu includes some negative variable indices") + variable_subset_mu = keep_vars_mu + else: + raise ValueError("keep_vars_mu must be a list of variable names (str) or column indices (int)") + elif isinstance(keep_vars_mu, np.ndarray): + if keep_vars_mu.dtype == np.str_: + if not np.all(np.isin(keep_vars_mu, X_train.columns)): + raise ValueError("keep_vars_mu includes some variable names that are not in X_train") + variable_subset_mu = [i for i in X_train.shape[1] if keep_vars_mu.count(X_train.columns.array[i]) > 0] + else: + if np.any(keep_vars_mu >= X_train.shape[1]): + raise ValueError("keep_vars_mu includes some variable indices that exceed the number of columns in X_train") + if np.any(keep_vars_mu < 0): + raise ValueError("keep_vars_mu includes some negative variable indices") + variable_subset_mu = [i for i in keep_vars_mu] + else: + raise ValueError("keep_vars_mu must be a list or np.array") + elif keep_vars_mu is None and drop_vars_mu is not None: + if isinstance(drop_vars_mu, list): + if all(isinstance(i, str) for i in drop_vars_mu): + if not np.all(np.isin(drop_vars_mu, X_train.columns)): + raise ValueError("drop_vars_mu includes some variable names that are not in X_train") + variable_subset_mu = [i for i in X_train.shape[1] if drop_vars_mu.count(X_train.columns.array[i]) == 0] + elif all(isinstance(i, int) for i in drop_vars_mu): + if any(i >= X_train.shape[1] for i in drop_vars_mu): + raise ValueError("drop_vars_mu includes some variable indices that exceed the number of columns in X_train") + if any(i < 0 for i in drop_vars_mu): + raise ValueError("drop_vars_mu includes some negative variable indices") + variable_subset_mu = [i for i in X_train.shape[1] if drop_vars_mu.count(i) == 0] + else: + raise ValueError("drop_vars_mu must be a list of variable names (str) or column indices (int)") + elif 
+                if drop_vars_mu.dtype == np.str_:
+                    if not np.all(np.isin(drop_vars_mu, X_train.columns)):
+                        raise ValueError("drop_vars_mu includes some variable names that are not in X_train")
+                    keep_inds = ~np.isin(X_train.columns.array, drop_vars_mu)
+                    variable_subset_mu = np.where(keep_inds)[0].tolist()
+                else:
+                    if np.any(drop_vars_mu >= X_train.shape[1]):
+                        raise ValueError("drop_vars_mu includes some variable indices that exceed the number of columns in X_train")
+                    if np.any(drop_vars_mu < 0):
+                        raise ValueError("drop_vars_mu includes some negative variable indices")
+                    keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_mu)
+                    variable_subset_mu = np.where(keep_inds)[0].tolist()
+            else:
+                raise ValueError("drop_vars_mu must be a list or np.array")
+        else:
+            variable_subset_mu = [i for i in range(X_train.shape[1])]
+        if keep_vars_tau is not None:
+            if isinstance(keep_vars_tau, list):
+                if all(isinstance(i, str) for i in keep_vars_tau):
+                    if not np.all(np.isin(keep_vars_tau, X_train.columns)):
+                        raise ValueError("keep_vars_tau includes some variable names that are not in X_train")
+                    variable_subset_tau = [i for i in range(X_train.shape[1]) if keep_vars_tau.count(X_train.columns.array[i]) > 0]
+                elif all(isinstance(i, int) for i in keep_vars_tau):
+                    if any(i >= X_train.shape[1] for i in keep_vars_tau):
+                        raise ValueError("keep_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                    if any(i < 0 for i in keep_vars_tau):
+                        raise ValueError("keep_vars_tau includes some negative variable indices")
+                    variable_subset_tau = keep_vars_tau
+                else:
+                    raise ValueError("keep_vars_tau must be a list of variable names (str) or column indices (int)")
+            elif isinstance(keep_vars_tau, np.ndarray):
+                if keep_vars_tau.dtype == np.str_:
+                    if not np.all(np.isin(keep_vars_tau, X_train.columns)):
+                        raise ValueError("keep_vars_tau includes some variable names that are not in X_train")
+                    variable_subset_tau = [i for i in range(X_train.shape[1]) if X_train.columns.array[i] in keep_vars_tau]
+                else:
+                    if np.any(keep_vars_tau >= X_train.shape[1]):
+                        raise ValueError("keep_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                    if np.any(keep_vars_tau < 0):
+                        raise ValueError("keep_vars_tau includes some negative variable indices")
+                    variable_subset_tau = [i for i in keep_vars_tau]
+            else:
+                raise ValueError("keep_vars_tau must be a list or np.array")
+        elif keep_vars_tau is None and drop_vars_tau is not None:
+            if isinstance(drop_vars_tau, list):
+                if all(isinstance(i, str) for i in drop_vars_tau):
+                    if not np.all(np.isin(drop_vars_tau, X_train.columns)):
+                        raise ValueError("drop_vars_tau includes some variable names that are not in X_train")
+                    variable_subset_tau = [i for i in range(X_train.shape[1]) if drop_vars_tau.count(X_train.columns.array[i]) == 0]
+                elif all(isinstance(i, int) for i in drop_vars_tau):
+                    if any(i >= X_train.shape[1] for i in drop_vars_tau):
+                        raise ValueError("drop_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                    if any(i < 0 for i in drop_vars_tau):
+                        raise ValueError("drop_vars_tau includes some negative variable indices")
+                    variable_subset_tau = [i for i in range(X_train.shape[1]) if drop_vars_tau.count(i) == 0]
+                else:
+                    raise ValueError("drop_vars_tau must be a list of variable names (str) or column indices (int)")
+            elif isinstance(drop_vars_tau, np.ndarray):
+                if drop_vars_tau.dtype == np.str_:
+                    if not np.all(np.isin(drop_vars_tau, X_train.columns)):
+                        raise ValueError("drop_vars_tau includes some variable names that are not in X_train")
+                    keep_inds = ~np.isin(X_train.columns.array, drop_vars_tau)
+                    variable_subset_tau = np.where(keep_inds)[0].tolist()
+                else:
+                    if np.any(drop_vars_tau >= X_train.shape[1]):
+                        raise ValueError("drop_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                    if np.any(drop_vars_tau < 0):
+                        raise ValueError("drop_vars_tau includes some negative variable indices")
+                    keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_tau)
+                    variable_subset_tau = np.where(keep_inds)[0].tolist()
+            else:
+                raise ValueError("drop_vars_tau must be a list or np.array")
+        else:
+            variable_subset_tau = [i for i in range(X_train.shape[1])]
+
         # Covariate preprocessing
         self._covariate_transformer = CovariateTransformer()
         self._covariate_transformer.fit(X_train)
@@ -318,6 +460,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
         if X_test is not None:
             X_test_processed = self._covariate_transformer.transform(X_test)
         feature_types = np.asarray(self._covariate_transformer._processed_feature_types)
+        original_var_indices = self._covariate_transformer.fetch_original_feature_indices()

         # Determine whether a test set is provided
         self.has_test = X_test is not None
@@ -355,47 +498,6 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
             self.internal_propensity_model = True
         else:
             self.internal_propensity_model = False
-
-        # Update covariates to include propensities if requested
-        if propensity_covariate == "mu":
-            feature_types_mu = np.append(feature_types, 0).astype('int')
-            feature_types_tau = feature_types.astype('int')
-            X_train_mu = np.c_[X_train_processed, pi_train]
-            X_train_tau = X_train_processed
-            if self.has_test:
-                X_test_mu = np.c_[X_test, pi_test]
-                X_test_tau = X_test
-        elif propensity_covariate == "tau":
-            feature_types_tau = np.append(feature_types, 0).astype('int')
-            feature_types_mu = feature_types.astype('int')
-            X_train_tau = np.c_[X_train_processed, pi_train]
-            X_train_mu = X_train_processed
-            if self.has_test:
-                X_test_tau = np.c_[X_test, pi_test]
-                X_test_mu = X_test
-        elif propensity_covariate == "both":
-            feature_types_tau = np.append(feature_types, 0).astype('int')
-            feature_types_mu = np.append(feature_types, 0).astype('int')
-            X_train_tau = np.c_[X_train_processed, pi_train]
-            X_train_mu = np.c_[X_train_processed, pi_train]
-            if self.has_test:
-                X_test_tau = np.c_[X_test, pi_test]
-                X_test_mu = np.c_[X_test, pi_test]
-        elif propensity_covariate == "none":
-            feature_types_tau = feature_types.astype('int')
-            feature_types_mu = feature_types.astype('int')
-            X_train_tau = X_train_processed
-            X_train_mu = X_train_processed
-            if self.has_test:
-                X_test_tau = X_test
-                X_test_mu = X_test
-
-        # Store propensity score requirements of the BCF forests
-        self.propensity_covariate = propensity_covariate
-
-        # Set variable weights for the prognostic and treatment effect forests
-        variable_weights_mu = np.repeat(1.0/X_train_mu.shape[1], X_train_mu.shape[1])
-        variable_weights_tau = np.repeat(1.0/X_train_tau.shape[1], X_train_tau.shape[1])

         # Scale outcome
         self.y_bar = np.squeeze(np.mean(y_train))
@@ -424,6 +526,42 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
         else:
             current_leaf_scale_tau = sigma_leaf_tau

+        # Update variable weights
+        variable_counts = [original_var_indices.count(i) for i in original_var_indices]
+        variable_weights_adj = [1/i for i in variable_counts]
+        variable_weights = variable_weights[original_var_indices]*variable_weights_adj
+
+        # Create mu and tau specific variable weights with weights zeroed out for excluded variables
+        variable_weights_tau = variable_weights.copy()
+        variable_weights_mu = variable_weights.copy()
+        variable_weights_mu[[variable_subset_mu.count(i) == 0 for i in original_var_indices]] = 0
+        variable_weights_tau[[variable_subset_tau.count(i) == 0 for i in original_var_indices]] = 0
+
+        # Update covariates to include propensities if requested
+        if propensity_covariate not in ["none", "mu", "tau", "both"]:
+            raise ValueError("propensity_covariate must equal one of 'none', 'mu', 'tau', or 'both'")
+        if propensity_covariate != "none":
+            feature_types = np.append(feature_types, 0).astype('int')
+            X_train_processed = np.c_[X_train_processed, pi_train]
+            if self.has_test:
+                X_test_processed = np.c_[X_test_processed, pi_test]
+            if propensity_covariate == "mu":
+                variable_weights_mu = np.append(variable_weights_mu, np.repeat(1/num_cov_orig, pi_train.shape[1]))
+                variable_weights_tau = np.append(variable_weights_tau, np.repeat(0., pi_train.shape[1]))
+            elif propensity_covariate == "tau":
+                variable_weights_mu = np.append(variable_weights_mu, np.repeat(0., pi_train.shape[1]))
+                variable_weights_tau = np.append(variable_weights_tau, np.repeat(1/num_cov_orig, pi_train.shape[1]))
+            elif propensity_covariate == "both":
+                variable_weights_mu = np.append(variable_weights_mu, np.repeat(1/num_cov_orig, pi_train.shape[1]))
+                variable_weights_tau = np.append(variable_weights_tau, np.repeat(1/num_cov_orig, pi_train.shape[1]))
+
+        # Renormalize variable weights
+        variable_weights_mu = variable_weights_mu / np.sum(variable_weights_mu)
+        variable_weights_tau = variable_weights_tau / np.sum(variable_weights_tau)
+
+        # Store propensity score requirements of the BCF forests
+        self.propensity_covariate = propensity_covariate
+
         # Container of variance parameter samples
         self.num_gfr = num_gfr
         self.num_burnin = num_burnin
@@ -458,20 +596,13 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
             tau_basis_test = Z_test

         # Prognostic Forest Dataset (covariates)
-        forest_dataset_mu_train = Dataset()
-        forest_dataset_mu_train.add_covariates(X_train_mu)
-        if self.has_test:
-            forest_dataset_mu_test = Dataset()
-            forest_dataset_mu_test.add_covariates(X_test_mu)
-
-        # Treatment Forest Dataset (covariates and treatment variable)
-        forest_dataset_tau_train = Dataset()
-        forest_dataset_tau_train.add_covariates(X_train_tau)
-        forest_dataset_tau_train.add_basis(tau_basis_train)
+        forest_dataset_train = Dataset()
+        forest_dataset_train.add_covariates(X_train_processed)
+        forest_dataset_train.add_basis(tau_basis_train)
         if self.has_test:
-            forest_dataset_tau_test = Dataset()
-            forest_dataset_tau_test.add_covariates(X_test_tau)
-            forest_dataset_tau_test.add_basis(tau_basis_test)
+            forest_dataset_test = Dataset()
+            forest_dataset_test.add_covariates(X_test_processed)
+            forest_dataset_test.add_basis(tau_basis_test)

         # Residual
         residual_train = Residual(resid_train)
@@ -483,8 +614,8 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
         cpp_rng = RNG(random_seed)

         # Sampling data structures
-        forest_sampler_mu = ForestSampler(forest_dataset_mu_train, feature_types_mu, num_trees_mu, self.n_train, alpha_mu, beta_mu, min_samples_leaf_mu)
-        forest_sampler_tau = ForestSampler(forest_dataset_tau_train, feature_types_tau, num_trees_tau, self.n_train, alpha_tau, beta_tau, min_samples_leaf_tau)
+        forest_sampler_mu = ForestSampler(forest_dataset_train, feature_types, num_trees_mu, self.n_train, alpha_mu, beta_mu, 
min_samples_leaf_mu) + forest_sampler_tau = ForestSampler(forest_dataset_train, feature_types, num_trees_tau, self.n_train, alpha_tau, beta_tau, min_samples_leaf_tau) # Container of forest samples self.forest_container_mu = ForestContainer(num_trees_mu, 1, True) @@ -501,14 +632,14 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Initialize the leaves of each tree in the prognostic forest init_mu = np.squeeze(np.mean(resid_train)) / num_trees_mu self.forest_container_mu.set_root_leaves(0, init_mu) - forest_sampler_mu.update_residual(forest_dataset_mu_train, residual_train, self.forest_container_mu, False, 0, True) + forest_sampler_mu.update_residual(forest_dataset_train, residual_train, self.forest_container_mu, False, 0, True) # Initialize the leaves of each tree in the treatment forest if self.multivariate_treatment: self.forest_container_tau.set_root_leaves(0, np.zeros(self.treatment_dim)) else: self.forest_container_tau.set_root_leaves(0, 0.) - forest_sampler_tau.update_residual(forest_dataset_tau_train, residual_train, self.forest_container_tau, True, 0, True) + forest_sampler_tau.update_residual(forest_dataset_train, residual_train, self.forest_container_tau, True, 0, True) # Run GFR (warm start) if specified if self.num_gfr > 0: @@ -516,7 +647,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra for i in range(self.num_gfr): # Sample the prognostic forest forest_sampler_mu.sample_one_iteration( - self.forest_container_mu, forest_dataset_mu_train, residual_train, cpp_rng, feature_types_mu, + self.forest_container_mu, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_mu, variable_weights_mu, current_sigma2, 0, True, True ) @@ -530,7 +661,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Sample the treatment forest forest_sampler_tau.sample_one_iteration( - self.forest_container_tau, forest_dataset_tau_train, residual_train, cpp_rng, feature_types_tau, + self.forest_container_tau, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_tau, variable_weights_tau, current_sigma2, treatment_leaf_model, True, True ) @@ -544,8 +675,8 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Sample coding parameters (if requested) if self.adaptive_coding: - mu_x = self.forest_container_mu.predict_raw_single_forest(forest_dataset_mu_train, i) - tau_x = np.squeeze(self.forest_container_tau.predict_raw_single_forest(forest_dataset_tau_train, i)) + mu_x = self.forest_container_mu.predict_raw_single_forest(forest_dataset_train, i) + tau_x = np.squeeze(self.forest_container_tau.predict_raw_single_forest(forest_dataset_train, i)) s_tt0 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==0)) s_tt1 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==1)) partial_resid_mu = np.squeeze(resid_train - mu_x) @@ -556,10 +687,10 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra current_b_1 = self.rng.normal(loc = (s_ty1/(s_tt1 + 2*current_sigma2)), scale = np.sqrt(current_sigma2/(s_tt1 + 2*current_sigma2)), size = 1) tau_basis_train = (1-np.squeeze(Z_train))*current_b_0 + np.squeeze(Z_train)*current_b_1 - forest_dataset_tau_train.update_basis(tau_basis_train) + forest_dataset_train.update_basis(tau_basis_train) if self.has_test: tau_basis_test = (1-np.squeeze(Z_test))*current_b_0 + np.squeeze(Z_test)*current_b_1 - forest_dataset_tau_test.update_basis(tau_basis_test) 
+ forest_dataset_test.update_basis(tau_basis_test) self.b0_samples[i] = current_b_0 self.b1_samples[i] = current_b_1 @@ -572,7 +703,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra for i in range(self.num_gfr, self.num_samples): # Sample the prognostic forest forest_sampler_mu.sample_one_iteration( - self.forest_container_mu, forest_dataset_mu_train, residual_train, cpp_rng, feature_types_mu, + self.forest_container_mu, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_mu, variable_weights_mu, current_sigma2, 0, False, True ) @@ -586,7 +717,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Sample the treatment forest forest_sampler_tau.sample_one_iteration( - self.forest_container_tau, forest_dataset_tau_train, residual_train, cpp_rng, feature_types_tau, + self.forest_container_tau, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_tau, variable_weights_tau, current_sigma2, treatment_leaf_model, False, True ) @@ -600,8 +731,8 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Sample coding parameters (if requested) if self.adaptive_coding: - mu_x = self.forest_container_mu.predict_raw_single_forest(forest_dataset_mu_train, i) - tau_x = np.squeeze(self.forest_container_tau.predict_raw_single_forest(forest_dataset_tau_train, i)) + mu_x = self.forest_container_mu.predict_raw_single_forest(forest_dataset_train, i) + tau_x = np.squeeze(self.forest_container_tau.predict_raw_single_forest(forest_dataset_train, i)) s_tt0 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==0)) s_tt1 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==1)) partial_resid_mu = np.squeeze(resid_train - mu_x) @@ -612,10 +743,10 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra current_b_1 = self.rng.normal(loc = (s_ty1/(s_tt1 + 2*current_sigma2)), scale = np.sqrt(current_sigma2/(s_tt1 + 2*current_sigma2)), size = 1) tau_basis_train = (1-np.squeeze(Z_train))*current_b_0 + np.squeeze(Z_train)*current_b_1 - forest_dataset_tau_train.update_basis(tau_basis_train) + forest_dataset_train.update_basis(tau_basis_train) if self.has_test: tau_basis_test = (1-np.squeeze(Z_test))*current_b_0 + np.squeeze(Z_test)*current_b_1 - forest_dataset_tau_test.update_basis(tau_basis_test) + forest_dataset_test.update_basis(tau_basis_test) self.b0_samples[i] = current_b_0 self.b1_samples[i] = current_b_1 @@ -644,9 +775,9 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra raise RuntimeError("There are no samples to retain!") # Store predictions - mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_mu_train.dataset_cpp) + mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) self.mu_hat_train = mu_raw[:,self.keep_indices]*self.y_std + self.y_bar - tau_raw_train = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau_train.dataset_cpp) + tau_raw_train = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_train.dataset_cpp) self.tau_hat_train = tau_raw_train[:,self.keep_indices] if self.adaptive_coding: adaptive_coding_weights = np.expand_dims(self.b1_samples[self.keep_indices] - self.b0_samples[self.keep_indices], axis=(0,2)) @@ -658,9 +789,9 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra treatment_term_train = 
Z_train*np.squeeze(self.tau_hat_train) self.y_hat_train = self.mu_hat_train + treatment_term_train if self.has_test: - mu_raw_test = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_mu_test.dataset_cpp) + mu_raw_test = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) self.mu_hat_test = mu_raw_test[:,self.keep_indices]*self.y_std + self.y_bar - tau_raw_test = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau_test.dataset_cpp) + tau_raw_test = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_test.dataset_cpp) self.tau_hat_test = tau_raw_test[:,self.keep_indices] if self.adaptive_coding: adaptive_coding_weights_test = np.expand_dims(self.b1_samples[self.keep_indices] - self.b0_samples[self.keep_indices], axis=(0,2)) diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 905cab88..4f62da99 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -22,6 +22,7 @@ def __init__(self) -> None: self._onehot_feature_index = [] self._processed_feature_types = [] self._original_feature_types = [] + self._original_feature_indices = [] def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool: if dtype.kind == "b" or dtype.kind == "i" or dtype.kind == "u" or dtype.kind == "f": @@ -169,7 +170,8 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") output_array = np.empty((covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64) - output_iter = 0 + output_iter = 0 + self._original_feature_indices = [] for i in range(covariates.shape[1]): covariate = covariates.iloc[:,i] if self._original_feature_types[i] == "category" or self._original_feature_types[i] == "string": @@ -178,20 +180,24 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate)) output_array[:,output_iter] = np.squeeze(covariate_transformed) output_iter += 1 + self._original_feature_indices.extend(i) else: onehot_ind = self._onehot_feature_index[i] covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate)) output_dim = covariate_transformed.shape[1] output_array[:,np.arange(output_iter, output_iter + output_dim)] = np.squeeze(covariate_transformed) output_iter += output_dim + self._original_feature_indices.extend([i for _ in range(output_dim)]) elif self._original_feature_types[i] == "boolean": output_array[:,output_iter] = (covariate*1.0).to_numpy() output_iter += 1 + self._original_feature_indices.extend(i) elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float": output_array[:,output_iter] = (covariate).to_numpy() output_iter += 1 + self._original_feature_indices.extend(i) return output_array @@ -202,7 +208,7 @@ def _transform_numpy(self, covariates: np.array) -> np.array: raise ValueError("Covariates passed as a numpy array must be 1d or 2d") if self._num_original_features != covariates.shape[1]: raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") - + self._original_feature_indices = [i for i in range(covariates.shape[1])] return covariates def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: @@ -247,7 +253,7 @@ def fit(self, covariates: 
Union[pd.DataFrame, np.array]) -> None: Parameters ---------- - covariates : np.array or pd.DataFrame + covariates : :obj:`np.array` or :obj:`pd.DataFrame` Covariates to be preprocessed. Returns @@ -264,3 +270,6 @@ def transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: def fit_transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: self._fit(covariates) return self._transform(covariates) + + def fetch_original_feature_indices(self) -> list: + return self._original_feature_indices From 31312b99d3d615248248074575338043ea408258 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 25 Jun 2024 01:20:02 -0500 Subject: [PATCH 2/2] Fixed list append vs extend issue --- stochtree/preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 4f62da99..b9c6ceb8 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -180,7 +180,7 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate)) output_array[:,output_iter] = np.squeeze(covariate_transformed) output_iter += 1 - self._original_feature_indices.extend(i) + self._original_feature_indices.append(i) else: onehot_ind = self._onehot_feature_index[i] covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate)) @@ -192,12 +192,12 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: elif self._original_feature_types[i] == "boolean": output_array[:,output_iter] = (covariate*1.0).to_numpy() output_iter += 1 - self._original_feature_indices.extend(i) + self._original_feature_indices.append(i) elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float": output_array[:,output_iter] = (covariate).to_numpy() output_iter += 1 - self._original_feature_indices.extend(i) + self._original_feature_indices.append(i) return output_array
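
A note on the preprocessing change: `CovariateTransformer` now records, for each processed output column, the index of the original column that produced it (one-hot encoding can fan a single categorical column out into several columns), exposed via `fetch_original_feature_indices()`. Below is a minimal sketch of how this could be exercised, assuming `CovariateTransformer` is importable from the package root (it lives in `stochtree/preprocessing.py`); the column names, and the exact encoding chosen for the categorical column, are illustrative.

```python
import numpy as np
import pandas as pd
from stochtree import CovariateTransformer  # assumed importable from the package root

# Two original columns: one numeric, one categorical
df = pd.DataFrame({
    "x1": np.linspace(0.0, 1.0, 6),
    "x2": pd.Categorical(["a", "b", "c", "a", "b", "c"]),
})

ct = CovariateTransformer()
X_processed = ct.fit_transform(df)

# One entry per processed column, pointing back to the originating column of df.
# If "x2" is one-hot encoded into three columns, this would print, e.g., [0, 1, 1, 1].
print(ct.fetch_original_feature_indices())
```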
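The variable-weight bookkeeping in `bcf.py` is the crux of the patch: per-original-variable weights are mapped onto processed columns, divided by each variable's fan-out, zeroed for columns excluded from a given forest, and renormalized. Here is a self-contained numpy sketch of that arithmetic, with all values invented for illustration:

```python
import numpy as np

# Map from processed column -> original column; here original column 1
# one-hot expanded into three processed columns
original_var_indices = [0, 1, 1, 1, 2]

# Uniform weights over the three original variables
variable_weights = np.repeat(1 / 3, 3)

# Spread each original variable's weight evenly over the columns it produced
counts = np.bincount(original_var_indices)
weights = variable_weights[original_var_indices] / counts[original_var_indices]

# Zero out columns whose original variable is excluded from the tau forest,
# copying first so the mu forest's weights are unaffected
variable_subset_tau = [0, 2]  # e.g. keep_vars_tau=[0, 2]
weights_tau = weights.copy()
weights_tau[~np.isin(original_var_indices, variable_subset_tau)] = 0.0
weights_tau = weights_tau / np.sum(weights_tau)

print(weights_tau)  # [0.5, 0., 0., 0., 0.5]: splits only on columns from variables 0 and 2
```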
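Finally, an end-to-end usage sketch of the new interface, condensed from the demo notebook added in this patch; it assumes a build that includes these changes, and the simulated data is illustrative:

```python
import numpy as np
from stochtree import BCFModel

rng = np.random.default_rng(2024)
n, p = 500, 10
X = rng.uniform(0, 1, (n, p))
pi_X = 0.25 + 0.5 * X[:, 0]                # true propensity
Z = rng.binomial(1, pi_X, n).astype(float)
mu_X = pi_X * 5 + 2 * X[:, 2]              # prognostic function
tau_X = 1 - 2 * X[:, 0] + 2 * X[:, 1]      # treatment effect, moderated by X0 and X1 only
y = mu_X + tau_X * Z + rng.normal(0, 1, n)

# The mu forest sees every covariate (plus the propensity score by default),
# while the tau forest is restricted to the two moderators
bcf = BCFModel()
bcf.sample(X, Z, y, pi_X, num_gfr=10, num_mcmc=100, keep_vars_tau=[0, 1])

# Equivalent exclusion-based call:
# bcf.sample(X, Z, y, pi_X, num_gfr=10, num_mcmc=100, drop_vars_tau=list(range(2, p)))
```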