From 2ca1bbe9ae0cf07c251751ede22cf71e813d94d6 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Tue, 25 Jun 2024 01:07:54 -0500
Subject: [PATCH 1/2] Updated python interface to allow the use of different feature subsets in mu and tau forests for BCF

---
 R/utils.R                                     |   2 -
 .../causal_inference_feature_subsets.ipynb    | 302 ++++++++++++++++++
 include/stochtree/tree_sampler.h              |  24 +-
 stochtree/bcf.py                              | 295 ++++++++++++-----
 stochtree/preprocessing.py                    |  15 +-
 5 files changed, 533 insertions(+), 105 deletions(-)
 create mode 100644 demo/notebooks/causal_inference_feature_subsets.ipynb

diff --git a/R/utils.R b/R/utils.R
index 7c8803c9..7afffe27 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -64,7 +64,6 @@ preprocessPredictionData <- function(input_data, metadata) {
 #' Returns a list including a matrix of preprocessed covariate values and associated tracking.
 #'
 #' @param input_matrix Covariate matrix.
-#' @param variable_weights Numeric weights reflecting the relative probability of splitting on each variable
 #'
 #' @return List with preprocessed (unmodified) data and details on the number of each type
 #' of variable, unique categories associated with categorical variables, and the
@@ -142,7 +141,6 @@ preprocessPredictionMatrix <- function(input_matrix, metadata) {
 #'
 #' @param input_df Dataframe of covariates. Users must pre-process any
 #' categorical variables as factors (ordered for ordered categorical).
-#' @param variable_weights Numeric weights reflecting the relative probability of splitting on each variable
 #'
 #' @return List with preprocessed data and details on the number of each type
 #' of variable, unique categories associated with categorical variables, and the
diff --git a/demo/notebooks/causal_inference_feature_subsets.ipynb b/demo/notebooks/causal_inference_feature_subsets.ipynb
new file mode 100644
index 00000000..1d35bb5c
--- /dev/null
+++ b/demo/notebooks/causal_inference_feature_subsets.ipynb
@@ -0,0 +1,302 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Causal Inference with Feature Subsets Demo Notebook\n",
+    "\n",
+    "This notebook is a variation on the main causal inference demo, showing how a user might use only a subset of covariates in the treatment effect forest. \n",
+    "Why might we want to do that? In many cases it is plausible that some covariates (for example, age, income, etc.) influence the outcome of interest \n",
+    "in a causal problem but do not **moderate** the treatment effect. In this case, we'd need to include these variables in the prognostic forest for deconfounding, \n",
+    "but we don't necessarily need to include them in the treatment effect forest."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from stochtree import BCFModel\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate sample data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# RNG\n", + "rng = np.random.default_rng()\n", + "\n", + "# Generate covariates and basis\n", + "n = 1000\n", + "p_X = 10\n", + "X = rng.uniform(0, 1, (n, p_X))\n", + "pi_X = 0.25 + 0.5*X[:,0]\n", + "Z = rng.binomial(1, pi_X, n).astype(float)\n", + "\n", + "# Define the outcome mean functions (prognostic and treatment effects)\n", + "mu_X = pi_X*5 + 2*X[:,2]\n", + "tau_X = 1 - 2*X[:,0] + 2*X[:,1] + 1*X[:,0]*X[:,1]\n", + "\n", + "# Generate outcome\n", + "epsilon = rng.normal(0, 1, n)\n", + "y = mu_X + tau_X*Z + epsilon" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Test-train split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_inds = np.arange(n)\n", + "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", + "X_train = X[train_inds,:]\n", + "X_test = X[test_inds,:]\n", + "Z_train = Z[train_inds]\n", + "Z_test = Z[test_inds]\n", + "y_train = y[train_inds]\n", + "y_test = y[test_inds]\n", + "pi_train = pi_X[train_inds]\n", + "pi_test = pi_X[test_inds]\n", + "mu_train = mu_X[train_inds]\n", + "mu_test = mu_X[test_inds]\n", + "tau_train = tau_X[train_inds]\n", + "tau_test = tau_X[test_inds]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run BCF without feature subsetting for $\\tau(X)$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bcf_model = BCFModel()\n", + "bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect the MCMC (BART) samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_y_mcmc = bcf_model.y_hat_test\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", + "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_tau_mcmc = bcf_model.tau_hat_test\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", + "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_mu_mcmc = bcf_model.mu_hat_test\n", + "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", + "mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples[bcf_model.num_gfr:],axis=1), np.expand_dims(bcf_model.b1_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", + "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run BCF, subsetting to the two features that show up in $\\tau(X)$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bcf_model_subset = BCFModel()\n", + "bcf_model_subset.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100, keep_vars_tau=[0,1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect the MCMC (BART) samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_y_mcmc = bcf_model_subset.y_hat_test\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", + "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_tau_mcmc = bcf_model_subset.tau_hat_test\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", + "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "forest_preds_mu_mcmc = bcf_model_subset.mu_hat_test\n", + "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", + "mu_df_mcmc = 
pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples - bcf_model_subset.num_gfr),axis=1), \n", + " np.expand_dims(bcf_model_subset.global_var_samples[bcf_model_subset.num_gfr:],axis=1)), axis = 1), \n", + " columns=[\"Sample\", \"Sigma\"])\n", + "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples - bcf_model_subset.num_gfr),axis=1), \n", + " np.expand_dims(bcf_model_subset.b0_samples[bcf_model_subset.num_gfr:],axis=1), \n", + " np.expand_dims(bcf_model_subset.b1_samples[bcf_model_subset.num_gfr:],axis=1)), axis = 1), \n", + " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", + "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "stochtree-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/include/stochtree/tree_sampler.h b/include/stochtree/tree_sampler.h index 243ec39d..e9ef65d9 100644 --- a/include/stochtree/tree_sampler.h +++ b/include/stochtree/tree_sampler.h @@ -113,15 +113,9 @@ static inline bool NodeNonConstant(ForestDataset& dataset, ForestTracker& tracke static inline void AddSplitToModel(ForestTracker& tracker, ForestDataset& dataset, TreePrior& tree_prior, TreeSplit& split, std::mt19937& gen, Tree* tree, int tree_num, int leaf_node, int feature_split, bool keep_sorted = false) { // Use zeros as a "temporary" leaf values since we draw leaf parameters after tree sampling is complete - int basis_dim = dataset.NumBasis(); - if (dataset.HasBasis()) { - if (basis_dim > 1) { - std::vector temp_leaf_values(basis_dim, 0.); - tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_values, temp_leaf_values); - } else { - double temp_leaf_value = 0.; - tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_value, temp_leaf_value); - } + if (tree->OutputDimension() > 1) { + std::vector temp_leaf_values(tree->OutputDimension(), 0.); + tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_values, temp_leaf_values); } else { double temp_leaf_value = 0.; tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_value, temp_leaf_value); @@ -135,15 +129,9 @@ static inline void AddSplitToModel(ForestTracker& tracker, ForestDataset& datase static inline void RemoveSplitFromModel(ForestTracker& tracker, ForestDataset& dataset, TreePrior& tree_prior, std::mt19937& gen, Tree* tree, int tree_num, int leaf_node, int left_node, int right_node, bool keep_sorted = false) { // Use zeros as a "temporary" leaf values since we 
draw leaf parameters after tree sampling is complete - int basis_dim = dataset.NumBasis(); - if (dataset.HasBasis()) { - if (basis_dim > 1) { - std::vector temp_leaf_values(basis_dim, 0.); - tree->CollapseToLeaf(leaf_node, temp_leaf_values); - } else { - double temp_leaf_value = 0.; - tree->CollapseToLeaf(leaf_node, temp_leaf_value); - } + if (tree->OutputDimension() > 1) { + std::vector temp_leaf_values(tree->OutputDimension(), 0.); + tree->CollapseToLeaf(leaf_node, temp_leaf_values); } else { double temp_leaf_value = 0.; tree->CollapseToLeaf(leaf_node, temp_leaf_value); diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 6509ebc8..5b6d2351 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -6,7 +6,7 @@ from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor from sklearn.model_selection import GridSearchCV, KFold from sklearn.utils import check_scalar -from typing import Optional +from typing import Optional, Union from scipy.linalg import lstsq from scipy.stats import gamma from .bart import BARTModel @@ -28,17 +28,19 @@ def __init__(self) -> None: def is_sampled(self) -> bool: return self.sampled - def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_train: np.array = None, - X_test: np.array = None, Z_test: np.array = None, pi_test: np.array = None, + def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_train: np.array, pi_train: np.array = None, + X_test: Union[pd.DataFrame, np.array] = None, Z_test: np.array = None, pi_test: np.array = None, cutpoint_grid_size = 100, sigma_leaf_mu: float = None, sigma_leaf_tau: float = None, alpha_mu: float = 0.95, alpha_tau: float = 0.25, beta_mu: float = 2.0, beta_tau: float = 3.0, min_samples_leaf_mu: int = 5, min_samples_leaf_tau: int = 5, nu: float = 3, lamb: float = None, a_leaf_mu: float = 3, a_leaf_tau: float = 3, b_leaf_mu: float = None, b_leaf_tau: float = None, - q: float = 0.9, sigma2: float = None, num_trees_mu: int = 200, num_trees_tau: int = 50, - num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, sample_sigma_global: bool = True, - sample_sigma_leaf_mu: bool = True, sample_sigma_leaf_tau: bool = False, propensity_covariate: str = "mu", - adaptive_coding: bool = True, b_0: float = -0.5, b_1: float = 0.5, random_seed: int = -1, - keep_burnin: bool = False, keep_gfr: bool = False) -> None: + q: float = 0.9, sigma2: float = None, variable_weights: np.array = None, + keep_vars_mu: Union[list, np.array] = None, drop_vars_mu: Union[list, np.array] = None, + keep_vars_tau: Union[list, np.array] = None, drop_vars_tau: Union[list, np.array] = None, + num_trees_mu: int = 200, num_trees_tau: int = 50, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, + sample_sigma_global: bool = True, sample_sigma_leaf_mu: bool = True, sample_sigma_leaf_tau: bool = False, + propensity_covariate: str = "mu", adaptive_coding: bool = True, b_0: float = -0.5, b_1: float = 0.5, + random_seed: int = -1, keep_burnin: bool = False, keep_gfr: bool = False) -> None: """Runs a BCF sampler on provided training set. Outcome predictions and estimates of the prognostic and treatment effect functions will be cached for the training set and (if provided) the test set. @@ -102,6 +104,16 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra Quantile used to calibrated ``lamb`` as in Sparapani et al (2021). Defaults to ``0.9``. sigma2 : :obj:`float`, optional Starting value of global variance parameter. 
Calibrated internally as in Sparapani et al (2021) if not set here.
+        variable_weights : :obj:`np.array`, optional
+            Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. Defaults to ``np.repeat(1/X_train.shape[1], X_train.shape[1])`` if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to ``1/X_train.shape[1]``. If you wish to provide a custom weight for the propensity score, include it as a column in ``X_train``, set ``propensity_covariate`` to ``'none'``, and adjust ``keep_vars_mu`` and ``keep_vars_tau`` accordingly.
+        keep_vars_mu : :obj:`list` or :obj:`np.array`, optional
+            Vector of variable names or column indices denoting variables that should be included in the prognostic (``mu(X)``) forest. Defaults to ``None``.
+        drop_vars_mu : :obj:`list` or :obj:`np.array`, optional
+            Vector of variable names or column indices denoting variables that should be excluded from the prognostic (``mu(X)``) forest. Defaults to ``None``. If both ``drop_vars_mu`` and ``keep_vars_mu`` are set, ``drop_vars_mu`` will be ignored.
+        keep_vars_tau : :obj:`list` or :obj:`np.array`, optional
+            Vector of variable names or column indices denoting variables that should be included in the treatment effect (``tau(X)``) forest. Defaults to ``None``.
+        drop_vars_tau : :obj:`list` or :obj:`np.array`, optional
+            Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (``tau(X)``) forest. Defaults to ``None``. If both ``drop_vars_tau`` and ``keep_vars_tau`` are set, ``drop_vars_tau`` will be ignored.
         num_trees_mu : :obj:`int`, optional
             Number of trees in the prognostic forest. Defaults to ``200``.
         num_trees_tau : :obj:`int`, optional
@@ -142,6 +154,15 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
         self : BCFModel
             Sampled BCF Model.
""" + # Variable weight preprocessing (and initialization if necessary) + if variable_weights is None: + if X_train.ndim > 1: + variable_weights = np.repeat(1/X_train.shape[1], X_train.shape[1]) + else: + variable_weights = np.repeat(1., 1) + if np.any(variable_weights < 0): + raise ValueError("variable_weights cannot have any negative weights") + # Check data inputs if not isinstance(X_train, pd.DataFrame) and not isinstance(X_train, np.ndarray): raise ValueError("X_train must be a pandas dataframe or numpy array") @@ -185,6 +206,9 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra if pi_test is not None: if pi_test.ndim == 1: pi_test = np.expand_dims(pi_test, 1) + + # Original number of covariates + num_cov_orig = X_train.shape[1] # Data checks if X_test is not None: @@ -311,6 +335,124 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra if not isinstance(keep_gfr, bool): raise ValueError("keep_gfr must be a bool") + # Standardize the keep variable lists to numeric indices + if keep_vars_mu is not None: + if isinstance(keep_vars_mu, list): + if all(isinstance(i, str) for i in keep_vars_mu): + if not np.all(np.isin(keep_vars_mu, X_train.columns)): + raise ValueError("keep_vars_mu includes some variable names that are not in X_train") + variable_subset_mu = [i for i in X_train.shape[1] if keep_vars_mu.count(X_train.columns.array[i]) > 0] + elif all(isinstance(i, int) for i in keep_vars_mu): + if any(i >= X_train.shape[1] for i in keep_vars_mu): + raise ValueError("keep_vars_mu includes some variable indices that exceed the number of columns in X_train") + if any(i < 0 for i in keep_vars_mu): + raise ValueError("keep_vars_mu includes some negative variable indices") + variable_subset_mu = keep_vars_mu + else: + raise ValueError("keep_vars_mu must be a list of variable names (str) or column indices (int)") + elif isinstance(keep_vars_mu, np.ndarray): + if keep_vars_mu.dtype == np.str_: + if not np.all(np.isin(keep_vars_mu, X_train.columns)): + raise ValueError("keep_vars_mu includes some variable names that are not in X_train") + variable_subset_mu = [i for i in X_train.shape[1] if keep_vars_mu.count(X_train.columns.array[i]) > 0] + else: + if np.any(keep_vars_mu >= X_train.shape[1]): + raise ValueError("keep_vars_mu includes some variable indices that exceed the number of columns in X_train") + if np.any(keep_vars_mu < 0): + raise ValueError("keep_vars_mu includes some negative variable indices") + variable_subset_mu = [i for i in keep_vars_mu] + else: + raise ValueError("keep_vars_mu must be a list or np.array") + elif keep_vars_mu is None and drop_vars_mu is not None: + if isinstance(drop_vars_mu, list): + if all(isinstance(i, str) for i in drop_vars_mu): + if not np.all(np.isin(drop_vars_mu, X_train.columns)): + raise ValueError("drop_vars_mu includes some variable names that are not in X_train") + variable_subset_mu = [i for i in X_train.shape[1] if drop_vars_mu.count(X_train.columns.array[i]) == 0] + elif all(isinstance(i, int) for i in drop_vars_mu): + if any(i >= X_train.shape[1] for i in drop_vars_mu): + raise ValueError("drop_vars_mu includes some variable indices that exceed the number of columns in X_train") + if any(i < 0 for i in drop_vars_mu): + raise ValueError("drop_vars_mu includes some negative variable indices") + variable_subset_mu = [i for i in X_train.shape[1] if drop_vars_mu.count(i) == 0] + else: + raise ValueError("drop_vars_mu must be a list of variable names (str) or column indices (int)") + elif 
+                if drop_vars_mu.dtype == np.str_:
+                    if not np.all(np.isin(drop_vars_mu, X_train.columns)):
+                        raise ValueError("drop_vars_mu includes some variable names that are not in X_train")
+                    keep_inds = ~np.isin(X_train.columns.array, drop_vars_mu)
+                    variable_subset_mu = np.where(keep_inds)[0].tolist()
+                else:
+                    if np.any(drop_vars_mu >= X_train.shape[1]):
+                        raise ValueError("drop_vars_mu includes some variable indices that exceed the number of columns in X_train")
+                    if np.any(drop_vars_mu < 0):
+                        raise ValueError("drop_vars_mu includes some negative variable indices")
+                    keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_mu)
+                    variable_subset_mu = np.where(keep_inds)[0].tolist()
+            else:
+                raise ValueError("drop_vars_mu must be a list or np.array")
+        else:
+            variable_subset_mu = [i for i in range(X_train.shape[1])]
+        if keep_vars_tau is not None:
+            if isinstance(keep_vars_tau, list):
+                if all(isinstance(i, str) for i in keep_vars_tau):
+                    if not np.all(np.isin(keep_vars_tau, X_train.columns)):
+                        raise ValueError("keep_vars_tau includes some variable names that are not in X_train")
+                    variable_subset_tau = [i for i in range(X_train.shape[1]) if keep_vars_tau.count(X_train.columns.array[i]) > 0]
+                elif all(isinstance(i, int) for i in keep_vars_tau):
+                    if any(i >= X_train.shape[1] for i in keep_vars_tau):
+                        raise ValueError("keep_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                    if any(i < 0 for i in keep_vars_tau):
+                        raise ValueError("keep_vars_tau includes some negative variable indices")
+                    variable_subset_tau = keep_vars_tau
+                else:
+                    raise ValueError("keep_vars_tau must be a list of variable names (str) or column indices (int)")
+            elif isinstance(keep_vars_tau, np.ndarray):
+                if keep_vars_tau.dtype == np.str_:
+                    if not np.all(np.isin(keep_vars_tau, X_train.columns)):
+                        raise ValueError("keep_vars_tau includes some variable names that are not in X_train")
+                    variable_subset_tau = [i for i in range(X_train.shape[1]) if X_train.columns.array[i] in keep_vars_tau]
+                else:
+                    if np.any(keep_vars_tau >= X_train.shape[1]):
+                        raise ValueError("keep_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                    if np.any(keep_vars_tau < 0):
+                        raise ValueError("keep_vars_tau includes some negative variable indices")
+                    variable_subset_tau = [i for i in keep_vars_tau]
+            else:
+                raise ValueError("keep_vars_tau must be a list or np.array")
+        elif keep_vars_tau is None and drop_vars_tau is not None:
+            if isinstance(drop_vars_tau, list):
+                if all(isinstance(i, str) for i in drop_vars_tau):
+                    if not np.all(np.isin(drop_vars_tau, X_train.columns)):
+                        raise ValueError("drop_vars_tau includes some variable names that are not in X_train")
+                    variable_subset_tau = [i for i in range(X_train.shape[1]) if drop_vars_tau.count(X_train.columns.array[i]) == 0]
+                elif all(isinstance(i, int) for i in drop_vars_tau):
+                    if any(i >= X_train.shape[1] for i in drop_vars_tau):
+                        raise ValueError("drop_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                    if any(i < 0 for i in drop_vars_tau):
+                        raise ValueError("drop_vars_tau includes some negative variable indices")
+                    variable_subset_tau = [i for i in range(X_train.shape[1]) if drop_vars_tau.count(i) == 0]
+                else:
+                    raise ValueError("drop_vars_tau must be a list of variable names (str) or column indices (int)")
+            elif isinstance(drop_vars_tau, np.ndarray):
+                if drop_vars_tau.dtype == np.str_:
+                    if not np.all(np.isin(drop_vars_tau, X_train.columns)):
+                        raise ValueError("drop_vars_tau includes some variable names that are not in X_train")
+                    keep_inds = ~np.isin(X_train.columns.array, drop_vars_tau)
+                    variable_subset_tau = np.where(keep_inds)[0].tolist()
+                else:
+                    if np.any(drop_vars_tau >= X_train.shape[1]):
+                        raise ValueError("drop_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                    if np.any(drop_vars_tau < 0):
+                        raise ValueError("drop_vars_tau includes some negative variable indices")
+                    keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_tau)
+                    variable_subset_tau = np.where(keep_inds)[0].tolist()
+            else:
+                raise ValueError("drop_vars_tau must be a list or np.array")
+        else:
+            variable_subset_tau = [i for i in range(X_train.shape[1])]
+
         # Covariate preprocessing
         self._covariate_transformer = CovariateTransformer()
         self._covariate_transformer.fit(X_train)
@@ -318,6 +460,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
         if X_test is not None:
             X_test_processed = self._covariate_transformer.transform(X_test)
         feature_types = np.asarray(self._covariate_transformer._processed_feature_types)
+        original_var_indices = self._covariate_transformer.fetch_original_feature_indices()

         # Determine whether a test set is provided
         self.has_test = X_test is not None
@@ -355,47 +498,6 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
             self.internal_propensity_model = True
         else:
             self.internal_propensity_model = False
-
-        # Update covariates to include propensities if requested
-        if propensity_covariate == "mu":
-            feature_types_mu = np.append(feature_types, 0).astype('int')
-            feature_types_tau = feature_types.astype('int')
-            X_train_mu = np.c_[X_train_processed, pi_train]
-            X_train_tau = X_train_processed
-            if self.has_test:
-                X_test_mu = np.c_[X_test, pi_test]
-                X_test_tau = X_test
-        elif propensity_covariate == "tau":
-            feature_types_tau = np.append(feature_types, 0).astype('int')
-            feature_types_mu = feature_types.astype('int')
-            X_train_tau = np.c_[X_train_processed, pi_train]
-            X_train_mu = X_train_processed
-            if self.has_test:
-                X_test_tau = np.c_[X_test, pi_test]
-                X_test_mu = X_test
-        elif propensity_covariate == "both":
-            feature_types_tau = np.append(feature_types, 0).astype('int')
-            feature_types_mu = np.append(feature_types, 0).astype('int')
-            X_train_tau = np.c_[X_train_processed, pi_train]
-            X_train_mu = np.c_[X_train_processed, pi_train]
-            if self.has_test:
-                X_test_tau = np.c_[X_test, pi_test]
-                X_test_mu = np.c_[X_test, pi_test]
-        elif propensity_covariate == "none":
-            feature_types_tau = feature_types.astype('int')
-            feature_types_mu = feature_types.astype('int')
-            X_train_tau = X_train_processed
-            X_train_mu = X_train_processed
-            if self.has_test:
-                X_test_tau = X_test
-                X_test_mu = X_test
-
-        # Store propensity score requirements of the BCF forests
-        self.propensity_covariate = propensity_covariate
-
-        # Set variable weights for the prognostic and treatment effect forests
-        variable_weights_mu = np.repeat(1.0/X_train_mu.shape[1], X_train_mu.shape[1])
-        variable_weights_tau = np.repeat(1.0/X_train_tau.shape[1], X_train_tau.shape[1])

         # Scale outcome
         self.y_bar = np.squeeze(np.mean(y_train))
@@ -424,6 +526,42 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
         else:
             current_leaf_scale_tau = sigma_leaf_tau

+        # Update variable weights
+        variable_counts = [original_var_indices.count(i) for i in original_var_indices]
+        variable_weights_adj = [1/i for i in variable_counts]
+        variable_weights = variable_weights[original_var_indices]*variable_weights_adj
+
+        # Create mu and tau specific variable weights with weights zeroed out for excluded variables
+        variable_weights_tau = variable_weights.copy()
+        variable_weights_mu = variable_weights.copy()
+        variable_weights_mu[[variable_subset_mu.count(i) == 0 for i in original_var_indices]] = 0
+        variable_weights_tau[[variable_subset_tau.count(i) == 0 for i in original_var_indices]] = 0
+
+        # Update covariates to include propensities if requested
+        if propensity_covariate not in ["none", "mu", "tau", "both"]:
+            raise ValueError("propensity_covariate must equal one of 'none', 'mu', 'tau', or 'both'")
+        if propensity_covariate != "none":
+            feature_types = np.append(feature_types, 0).astype('int')
+            X_train_processed = np.c_[X_train_processed, pi_train]
+            if self.has_test:
+                X_test_processed = np.c_[X_test_processed, pi_test]
+            if propensity_covariate == "mu":
+                variable_weights_mu = np.append(variable_weights_mu, np.repeat(1/num_cov_orig, pi_train.shape[1]))
+                variable_weights_tau = np.append(variable_weights_tau, np.repeat(0., pi_train.shape[1]))
+            elif propensity_covariate == "tau":
+                variable_weights_mu = np.append(variable_weights_mu, np.repeat(0., pi_train.shape[1]))
+                variable_weights_tau = np.append(variable_weights_tau, np.repeat(1/num_cov_orig, pi_train.shape[1]))
+            elif propensity_covariate == "both":
+                variable_weights_mu = np.append(variable_weights_mu, np.repeat(1/num_cov_orig, pi_train.shape[1]))
+                variable_weights_tau = np.append(variable_weights_tau, np.repeat(1/num_cov_orig, pi_train.shape[1]))
+
+        # Renormalize variable weights
+        variable_weights_mu = variable_weights_mu / np.sum(variable_weights_mu)
+        variable_weights_tau = variable_weights_tau / np.sum(variable_weights_tau)
+
+        # Store propensity score requirements of the BCF forests
+        self.propensity_covariate = propensity_covariate
+
         # Container of variance parameter samples
         self.num_gfr = num_gfr
         self.num_burnin = num_burnin
@@ -458,20 +596,13 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
             tau_basis_test = Z_test

         # Prognostic Forest Dataset (covariates)
-        forest_dataset_mu_train = Dataset()
-        forest_dataset_mu_train.add_covariates(X_train_mu)
-        if self.has_test:
-            forest_dataset_mu_test = Dataset()
-            forest_dataset_mu_test.add_covariates(X_test_mu)
-
-        # Treatment Forest Dataset (covariates and treatment variable)
-        forest_dataset_tau_train = Dataset()
-        forest_dataset_tau_train.add_covariates(X_train_tau)
-        forest_dataset_tau_train.add_basis(tau_basis_train)
+        forest_dataset_train = Dataset()
+        forest_dataset_train.add_covariates(X_train_processed)
+        forest_dataset_train.add_basis(tau_basis_train)
         if self.has_test:
-            forest_dataset_tau_test = Dataset()
-            forest_dataset_tau_test.add_covariates(X_test_tau)
-            forest_dataset_tau_test.add_basis(tau_basis_test)
+            forest_dataset_test = Dataset()
+            forest_dataset_test.add_covariates(X_test_processed)
+            forest_dataset_test.add_basis(tau_basis_test)

         # Residual
         residual_train = Residual(resid_train)
@@ -483,8 +614,8 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra
         cpp_rng = RNG(random_seed)

         # Sampling data structures
-        forest_sampler_mu = ForestSampler(forest_dataset_mu_train, feature_types_mu, num_trees_mu, self.n_train, alpha_mu, beta_mu, min_samples_leaf_mu)
-        forest_sampler_tau = ForestSampler(forest_dataset_tau_train, feature_types_tau, num_trees_tau, self.n_train, alpha_tau, beta_tau, min_samples_leaf_tau)
+        forest_sampler_mu = ForestSampler(forest_dataset_train, feature_types, num_trees_mu, self.n_train, alpha_mu, beta_mu, 
min_samples_leaf_mu) + forest_sampler_tau = ForestSampler(forest_dataset_train, feature_types, num_trees_tau, self.n_train, alpha_tau, beta_tau, min_samples_leaf_tau) # Container of forest samples self.forest_container_mu = ForestContainer(num_trees_mu, 1, True) @@ -501,14 +632,14 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Initialize the leaves of each tree in the prognostic forest init_mu = np.squeeze(np.mean(resid_train)) / num_trees_mu self.forest_container_mu.set_root_leaves(0, init_mu) - forest_sampler_mu.update_residual(forest_dataset_mu_train, residual_train, self.forest_container_mu, False, 0, True) + forest_sampler_mu.update_residual(forest_dataset_train, residual_train, self.forest_container_mu, False, 0, True) # Initialize the leaves of each tree in the treatment forest if self.multivariate_treatment: self.forest_container_tau.set_root_leaves(0, np.zeros(self.treatment_dim)) else: self.forest_container_tau.set_root_leaves(0, 0.) - forest_sampler_tau.update_residual(forest_dataset_tau_train, residual_train, self.forest_container_tau, True, 0, True) + forest_sampler_tau.update_residual(forest_dataset_train, residual_train, self.forest_container_tau, True, 0, True) # Run GFR (warm start) if specified if self.num_gfr > 0: @@ -516,7 +647,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra for i in range(self.num_gfr): # Sample the prognostic forest forest_sampler_mu.sample_one_iteration( - self.forest_container_mu, forest_dataset_mu_train, residual_train, cpp_rng, feature_types_mu, + self.forest_container_mu, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_mu, variable_weights_mu, current_sigma2, 0, True, True ) @@ -530,7 +661,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Sample the treatment forest forest_sampler_tau.sample_one_iteration( - self.forest_container_tau, forest_dataset_tau_train, residual_train, cpp_rng, feature_types_tau, + self.forest_container_tau, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_tau, variable_weights_tau, current_sigma2, treatment_leaf_model, True, True ) @@ -544,8 +675,8 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Sample coding parameters (if requested) if self.adaptive_coding: - mu_x = self.forest_container_mu.predict_raw_single_forest(forest_dataset_mu_train, i) - tau_x = np.squeeze(self.forest_container_tau.predict_raw_single_forest(forest_dataset_tau_train, i)) + mu_x = self.forest_container_mu.predict_raw_single_forest(forest_dataset_train, i) + tau_x = np.squeeze(self.forest_container_tau.predict_raw_single_forest(forest_dataset_train, i)) s_tt0 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==0)) s_tt1 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==1)) partial_resid_mu = np.squeeze(resid_train - mu_x) @@ -556,10 +687,10 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra current_b_1 = self.rng.normal(loc = (s_ty1/(s_tt1 + 2*current_sigma2)), scale = np.sqrt(current_sigma2/(s_tt1 + 2*current_sigma2)), size = 1) tau_basis_train = (1-np.squeeze(Z_train))*current_b_0 + np.squeeze(Z_train)*current_b_1 - forest_dataset_tau_train.update_basis(tau_basis_train) + forest_dataset_train.update_basis(tau_basis_train) if self.has_test: tau_basis_test = (1-np.squeeze(Z_test))*current_b_0 + np.squeeze(Z_test)*current_b_1 - forest_dataset_tau_test.update_basis(tau_basis_test) 
+ forest_dataset_test.update_basis(tau_basis_test) self.b0_samples[i] = current_b_0 self.b1_samples[i] = current_b_1 @@ -572,7 +703,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra for i in range(self.num_gfr, self.num_samples): # Sample the prognostic forest forest_sampler_mu.sample_one_iteration( - self.forest_container_mu, forest_dataset_mu_train, residual_train, cpp_rng, feature_types_mu, + self.forest_container_mu, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_mu, variable_weights_mu, current_sigma2, 0, False, True ) @@ -586,7 +717,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Sample the treatment forest forest_sampler_tau.sample_one_iteration( - self.forest_container_tau, forest_dataset_tau_train, residual_train, cpp_rng, feature_types_tau, + self.forest_container_tau, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_tau, variable_weights_tau, current_sigma2, treatment_leaf_model, False, True ) @@ -600,8 +731,8 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Sample coding parameters (if requested) if self.adaptive_coding: - mu_x = self.forest_container_mu.predict_raw_single_forest(forest_dataset_mu_train, i) - tau_x = np.squeeze(self.forest_container_tau.predict_raw_single_forest(forest_dataset_tau_train, i)) + mu_x = self.forest_container_mu.predict_raw_single_forest(forest_dataset_train, i) + tau_x = np.squeeze(self.forest_container_tau.predict_raw_single_forest(forest_dataset_train, i)) s_tt0 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==0)) s_tt1 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==1)) partial_resid_mu = np.squeeze(resid_train - mu_x) @@ -612,10 +743,10 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra current_b_1 = self.rng.normal(loc = (s_ty1/(s_tt1 + 2*current_sigma2)), scale = np.sqrt(current_sigma2/(s_tt1 + 2*current_sigma2)), size = 1) tau_basis_train = (1-np.squeeze(Z_train))*current_b_0 + np.squeeze(Z_train)*current_b_1 - forest_dataset_tau_train.update_basis(tau_basis_train) + forest_dataset_train.update_basis(tau_basis_train) if self.has_test: tau_basis_test = (1-np.squeeze(Z_test))*current_b_0 + np.squeeze(Z_test)*current_b_1 - forest_dataset_tau_test.update_basis(tau_basis_test) + forest_dataset_test.update_basis(tau_basis_test) self.b0_samples[i] = current_b_0 self.b1_samples[i] = current_b_1 @@ -644,9 +775,9 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra raise RuntimeError("There are no samples to retain!") # Store predictions - mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_mu_train.dataset_cpp) + mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) self.mu_hat_train = mu_raw[:,self.keep_indices]*self.y_std + self.y_bar - tau_raw_train = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau_train.dataset_cpp) + tau_raw_train = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_train.dataset_cpp) self.tau_hat_train = tau_raw_train[:,self.keep_indices] if self.adaptive_coding: adaptive_coding_weights = np.expand_dims(self.b1_samples[self.keep_indices] - self.b0_samples[self.keep_indices], axis=(0,2)) @@ -658,9 +789,9 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra treatment_term_train = 
Z_train*np.squeeze(self.tau_hat_train) self.y_hat_train = self.mu_hat_train + treatment_term_train if self.has_test: - mu_raw_test = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_mu_test.dataset_cpp) + mu_raw_test = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) self.mu_hat_test = mu_raw_test[:,self.keep_indices]*self.y_std + self.y_bar - tau_raw_test = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau_test.dataset_cpp) + tau_raw_test = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_test.dataset_cpp) self.tau_hat_test = tau_raw_test[:,self.keep_indices] if self.adaptive_coding: adaptive_coding_weights_test = np.expand_dims(self.b1_samples[self.keep_indices] - self.b0_samples[self.keep_indices], axis=(0,2)) diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 905cab88..4f62da99 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -22,6 +22,7 @@ def __init__(self) -> None: self._onehot_feature_index = [] self._processed_feature_types = [] self._original_feature_types = [] + self._original_feature_indices = [] def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool: if dtype.kind == "b" or dtype.kind == "i" or dtype.kind == "u" or dtype.kind == "f": @@ -169,7 +170,8 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") output_array = np.empty((covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64) - output_iter = 0 + output_iter = 0 + self._original_feature_indices = [] for i in range(covariates.shape[1]): covariate = covariates.iloc[:,i] if self._original_feature_types[i] == "category" or self._original_feature_types[i] == "string": @@ -178,20 +180,24 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate)) output_array[:,output_iter] = np.squeeze(covariate_transformed) output_iter += 1 + self._original_feature_indices.extend(i) else: onehot_ind = self._onehot_feature_index[i] covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate)) output_dim = covariate_transformed.shape[1] output_array[:,np.arange(output_iter, output_iter + output_dim)] = np.squeeze(covariate_transformed) output_iter += output_dim + self._original_feature_indices.extend([i for _ in range(output_dim)]) elif self._original_feature_types[i] == "boolean": output_array[:,output_iter] = (covariate*1.0).to_numpy() output_iter += 1 + self._original_feature_indices.extend(i) elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float": output_array[:,output_iter] = (covariate).to_numpy() output_iter += 1 + self._original_feature_indices.extend(i) return output_array @@ -202,7 +208,7 @@ def _transform_numpy(self, covariates: np.array) -> np.array: raise ValueError("Covariates passed as a numpy array must be 1d or 2d") if self._num_original_features != covariates.shape[1]: raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") - + self._original_feature_indices = [i for i in range(covariates.shape[1])] return covariates def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: @@ -247,7 +253,7 @@ def fit(self, covariates: 
Union[pd.DataFrame, np.array]) -> None: Parameters ---------- - covariates : np.array or pd.DataFrame + covariates : :obj:`np.array` or :obj:`pd.DataFrame` Covariates to be preprocessed. Returns @@ -264,3 +270,6 @@ def transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: def fit_transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: self._fit(covariates) return self._transform(covariates) + + def fetch_original_feature_indices(self) -> list: + return self._original_feature_indices From 31312b99d3d615248248074575338043ea408258 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 25 Jun 2024 01:20:02 -0500 Subject: [PATCH 2/2] Fixed list append vs extend issue --- stochtree/preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 4f62da99..b9c6ceb8 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -180,7 +180,7 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate)) output_array[:,output_iter] = np.squeeze(covariate_transformed) output_iter += 1 - self._original_feature_indices.extend(i) + self._original_feature_indices.append(i) else: onehot_ind = self._onehot_feature_index[i] covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate)) @@ -192,12 +192,12 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: elif self._original_feature_types[i] == "boolean": output_array[:,output_iter] = (covariate*1.0).to_numpy() output_iter += 1 - self._original_feature_indices.extend(i) + self._original_feature_indices.append(i) elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float": output_array[:,output_iter] = (covariate).to_numpy() output_iter += 1 - self._original_feature_indices.extend(i) + self._original_feature_indices.append(i) return output_array
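
A note on the preprocessing change: `CovariateTransformer` now records, for each processed output column, the index of the original column that produced it (one-hot encoding can fan a single categorical column out into several columns), exposed via `fetch_original_feature_indices()`. Below is a minimal sketch of how this could be exercised, assuming `CovariateTransformer` is importable from the package root (it lives in `stochtree/preprocessing.py`); the column names, and the exact encoding chosen for the categorical column, are illustrative.

```python
import numpy as np
import pandas as pd
from stochtree import CovariateTransformer  # assumed importable from the package root

# Two original columns: one numeric, one categorical
df = pd.DataFrame({
    "x1": np.linspace(0.0, 1.0, 6),
    "x2": pd.Categorical(["a", "b", "c", "a", "b", "c"]),
})

ct = CovariateTransformer()
X_processed = ct.fit_transform(df)

# One entry per processed column, pointing back to the originating column of df.
# If "x2" is one-hot encoded into three columns, this would print, e.g., [0, 1, 1, 1].
print(ct.fetch_original_feature_indices())
```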
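The variable-weight bookkeeping in `bcf.py` is the crux of the patch: per-original-variable weights are mapped onto processed columns, divided by each variable's fan-out, zeroed for columns excluded from a given forest, and renormalized. Here is a self-contained numpy sketch of that arithmetic, with all values invented for illustration:

```python
import numpy as np

# Map from processed column -> original column; here original column 1
# one-hot expanded into three processed columns
original_var_indices = [0, 1, 1, 1, 2]

# Uniform weights over the three original variables
variable_weights = np.repeat(1 / 3, 3)

# Spread each original variable's weight evenly over the columns it produced
counts = np.bincount(original_var_indices)
weights = variable_weights[original_var_indices] / counts[original_var_indices]

# Zero out columns whose original variable is excluded from the tau forest,
# copying first so the mu forest's weights are unaffected
variable_subset_tau = [0, 2]  # e.g. keep_vars_tau=[0, 2]
weights_tau = weights.copy()
weights_tau[~np.isin(original_var_indices, variable_subset_tau)] = 0.0
weights_tau = weights_tau / np.sum(weights_tau)

print(weights_tau)  # [0.5, 0., 0., 0., 0.5]: splits only on columns from variables 0 and 2
```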
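Finally, an end-to-end usage sketch of the new interface, condensed from the demo notebook added in this patch; it assumes a build that includes these changes, and the simulated data is illustrative:

```python
import numpy as np
from stochtree import BCFModel

rng = np.random.default_rng(2024)
n, p = 500, 10
X = rng.uniform(0, 1, (n, p))
pi_X = 0.25 + 0.5 * X[:, 0]                # true propensity
Z = rng.binomial(1, pi_X, n).astype(float)
mu_X = pi_X * 5 + 2 * X[:, 2]              # prognostic function
tau_X = 1 - 2 * X[:, 0] + 2 * X[:, 1]      # treatment effect, moderated by X0 and X1 only
y = mu_X + tau_X * Z + rng.normal(0, 1, n)

# The mu forest sees every covariate (plus the propensity score by default),
# while the tau forest is restricted to the two moderators
bcf = BCFModel()
bcf.sample(X, Z, y, pi_X, num_gfr=10, num_mcmc=100, keep_vars_tau=[0, 1])

# Equivalent exclusion-based call:
# bcf.sample(X, Z, y, pi_X, num_gfr=10, num_mcmc=100, drop_vars_tau=list(range(2, p)))
```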