Added the same collapse functionality and tests to the Python interface

andrewherren · andrewherren · commit a22ca1cbaa72 · 2025-05-30T16:47:10.000-04:00
diff --git a/stochtree/forest.py b/stochtree/forest.py
@@ -159,6 +159,42 @@ def set_root_leaves(
         else:
             self.forest_container_cpp.SetRootValue(forest_num, leaf_value)
 
+    def collapse(self, batch_size: int) -> None:
+        """
+        Collapse forests in this container by a pre-specified batch size.
+        For example, if we have a container of twenty 10-tree forests, and we
+        specify a `batch_size` of 5, then this method will yield four 50-tree
+        forests, each rescaled by `1 / batch_size`. "Excess" forests (the
+        remainder of the container size divided by `batch_size`) are pruned
+        from the beginning of the container (i.e. earlier sampled forests are
+        deleted). This method has no effect if `batch_size` is larger than the
+        number of forests in the container or if `batch_size` is at most 1.
+
+        Parameters
+        ----------
+        batch_size : int
+            Number of forests to be collapsed into a single forest
+        """
+        container_size = self.num_samples()
+        if batch_size <= container_size and batch_size > 1:
+            reverse_container_inds = np.linspace(start=container_size, stop=1, num=container_size, dtype=int)
+            # Excess forests are exactly the remainder left after forming full batches
+            num_excess = container_size % batch_size
+            # Batches are numbered from the front of the container; excess forests get index -1
+            batch_inds = (reverse_container_inds - num_excess - 1) // batch_size
+            # Work from the last batch backwards so deletions only touch indices already handled
+            for batch_ind in np.flip(np.unique(batch_inds[batch_inds >= 0])):
+                merge_forest_inds = np.sort(reverse_container_inds[batch_inds == batch_ind] - 1)
+                num_merge_forests = len(merge_forest_inds)
+                self.combine_forests(merge_forest_inds)
+                for i in range(num_merge_forests - 1, 0, -1):
+                    self.delete_sample(merge_forest_inds[i])
+                # Scale the merged forest by 1 / (number of forests merged into it)
+                self.multiply_forest(merge_forest_inds[0], 1.0 / num_merge_forests)
+            if np.min(batch_inds) < 0:
+                delete_forest_inds = np.sort(reverse_container_inds[batch_inds < 0] - 1)
+                for i in range(len(delete_forest_inds) - 1, -1, -1):
+                    self.delete_sample(delete_forest_inds[i])
+    
     def combine_forests(
         self, forest_inds: np.array
     ) -> None:
diff --git a/test/python/test_forest_container.py b/test/python/test_forest_container.py
@@ -1,10 +1,11 @@
 import numpy as np
 
-from stochtree import Dataset, ForestContainer
+from stochtree import Dataset, ForestContainer, BARTModel
+from sklearn.model_selection import train_test_split
 
 
 class TestPredict:
-    def test_constant_leaf_prediction(self):
+    def test_constant_leaf_forest_container(self):
         # Create dataset
         X = np.array(
             [[1.5, 8.7, 1.2],
@@ -75,3 +76,180 @@ def test_constant_leaf_prediction(self):
         
         # Assertion
         np.testing.assert_almost_equal(pred, pred_expected_new)
+    
+    def test_collapse_forest_container(self):
+        """
+        Check that `collapse` averages predictions within each batch, prunes
+        excess forests, and leaves too-small containers unchanged.
+        """
+        # Seeded RNG so the simulated data is reproducible
+        rng = np.random.default_rng(1234)
+
+        # Generate covariates
+        n = 100
+        p_X = 10
+        X = rng.uniform(0, 1, (n, p_X))
+
+        # Define the outcome mean function
+        def outcome_mean(X):
+            return np.where(
+                (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),
+                -7.5,
+                np.where(
+                    (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),
+                    -2.5,
+                    np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5, 7.5),
+                ),
+            )
+
+        # Generate outcome
+        epsilon = rng.normal(0, 1, n)
+        y = outcome_mean(X) + epsilon
+
+        # Test-train split (fixed random_state so the split is reproducible)
+        sample_inds = np.arange(n)
+        train_inds, test_inds = train_test_split(sample_inds, test_size=0.5, random_state=1234)
+        X_train = X[train_inds, :]
+        X_test = X[test_inds, :]
+        y_train = y[train_inds]
+        n_test = X_test.shape[0]
+
+        # Create forest dataset
+        forest_dataset_test = Dataset()
+        forest_dataset_test.add_covariates(X_test)
+
+        # Case 1: 50 MCMC samples (an exact multiple of the batch size)
+        num_mcmc = 50
+        bart_model = BARTModel()
+        bart_model.sample(
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            num_gfr=0,
+            num_burnin=0,
+            num_mcmc=num_mcmc,
+        )
+
+        # Extract the mean forest container
+        mean_forest_container = bart_model.forest_container_mean
+
+        # Predict from the original container
+        pred_orig = mean_forest_container.predict(forest_dataset_test)
+
+        # Collapse the container in batches of 5
+        batch_size = 5
+        mean_forest_container.collapse(batch_size)
+
+        # Predict from the modified container
+        pred_new = mean_forest_container.predict(forest_dataset_test)
+
+        # Average original predictions per batch; the first `num_mcmc % batch_size` samples get index -1 and are skipped
+        container_inds = np.arange(1, num_mcmc + 1)
+        batch_inds = (container_inds - (num_mcmc % batch_size) - 1) // batch_size
+        num_batches = np.max(batch_inds) + 1
+        pred_orig_collapsed = np.empty((n_test, num_batches))
+        for i in range(num_batches):
+            pred_orig_collapsed[:,i] = np.sum(pred_orig[:,batch_inds == i], axis=1) / np.sum(batch_inds == i)
+
+        # Assertion
+        np.testing.assert_almost_equal(pred_orig_collapsed, pred_new)
+
+        # Case 2: 52 MCMC samples (two excess forests to be pruned)
+        num_mcmc = 52
+        bart_model = BARTModel()
+        bart_model.sample(
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            num_gfr=0,
+            num_burnin=0,
+            num_mcmc=num_mcmc,
+        )
+
+        # Extract the mean forest container
+        mean_forest_container = bart_model.forest_container_mean
+
+        # Predict from the original container
+        pred_orig = mean_forest_container.predict(forest_dataset_test)
+
+        # Collapse the container in batches of 5
+        batch_size = 5
+        mean_forest_container.collapse(batch_size)
+
+        # Predict from the modified container
+        pred_new = mean_forest_container.predict(forest_dataset_test)
+
+        # Average original predictions per batch; the first `num_mcmc % batch_size` samples get index -1 and are skipped
+        container_inds = np.arange(1, num_mcmc + 1)
+        batch_inds = (container_inds - (num_mcmc % batch_size) - 1) // batch_size
+        num_batches = np.max(batch_inds) + 1
+        pred_orig_collapsed = np.empty((n_test, num_batches))
+        for i in range(num_batches):
+            pred_orig_collapsed[:,i] = np.sum(pred_orig[:,batch_inds == i], axis=1) / np.sum(batch_inds == i)
+
+        # Assertion
+        np.testing.assert_almost_equal(pred_orig_collapsed, pred_new)
+
+        # Case 3: 5 MCMC samples (the whole container is a single batch)
+        num_mcmc = 5
+        bart_model = BARTModel()
+        bart_model.sample(
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            num_gfr=0,
+            num_burnin=0,
+            num_mcmc=num_mcmc,
+        )
+
+        # Extract the mean forest container
+        mean_forest_container = bart_model.forest_container_mean
+
+        # Predict from the original container
+        pred_orig = mean_forest_container.predict(forest_dataset_test)
+
+        # Collapse the container in batches of 5
+        batch_size = 5
+        mean_forest_container.collapse(batch_size)
+
+        # Predict from the modified container
+        pred_new = mean_forest_container.predict(forest_dataset_test)
+
+        # All five samples form one batch, so the expected prediction is their average
+        num_batches = 1
+        pred_orig_collapsed = np.empty((n_test, num_batches))
+        pred_orig_collapsed[:,0] = np.sum(pred_orig, axis=1) / batch_size
+
+        # Assertion
+        np.testing.assert_almost_equal(pred_orig_collapsed, pred_new)
+
+        # Case 4: 4 MCMC samples (fewer than the batch size, so collapse is a no-op)
+        num_mcmc = 4
+        bart_model = BARTModel()
+        bart_model.sample(
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            num_gfr=0,
+            num_burnin=0,
+            num_mcmc=num_mcmc,
+        )
+
+        # Extract the mean forest container
+        mean_forest_container = bart_model.forest_container_mean
+
+        # Predict from the original container
+        pred_orig = mean_forest_container.predict(forest_dataset_test)
+
+        # Collapse the container in batches of 5
+        batch_size = 5
+        mean_forest_container.collapse(batch_size)
+
+        # Predict from the modified container
+        pred_new = mean_forest_container.predict(forest_dataset_test)
+
+        # Collapse should leave the container untouched, so predictions are compared directly
+        pred_orig_collapsed = pred_orig
+
+        # Assertion
+        np.testing.assert_almost_equal(pred_orig_collapsed, pred_new)