tommyod · tommyod · Feb 4, 2024 · Feb 4, 2024
diff --git a/generalized_additive_models/gam.py b/generalized_additive_models/gam.py
@@ -83,14 +83,14 @@ class GAM(BaseEstimator):
     >>> for term in gam.terms:
     ...     print(term, term.coef_) # doctest: +SKIP
     >>> gam.score(df, y)
-    0.4412081401019129
+    0.4412081...
     >>> from sklearn.metrics import r2_score
     >>> r2_score(y_true=y, y_pred=predictions)
-    0.4412081401019129
+    0.4412081...
     >>> gam.terms["age"]
     Spline(feature='age')
     >>> gam.terms["age"].coef_[:3]
-    array([  0.        , -11.86887791, -23.59686477])
+    array([  0.        , -11.86..., -23.59...])
 
     """
 

diff --git a/generalized_additive_models/terms.py b/generalized_additive_models/terms.py
@@ -1065,7 +1065,8 @@ class Tensor(TransformerMixin, Term, BaseEstimator):
            [1., 0., 0.],
            [0., 0., 1.],
            [0., 0., 1.]])
-    >>> te = Tensor(Categorical('color') + Categorical('grade', penalty=4))
+    >>> te = Tensor(Categorical('color', sum_to_zero=False) +
+    ...             Categorical('grade', penalty=4, sum_to_zero=False))
     >>> te.fit_transform(df).astype(int)
     array([[0, 0, 0, 0, 0, 0, 1, 0, 0],
            [0, 0, 0, 1, 0, 0, 0, 0, 0],
@@ -1208,7 +1209,7 @@ def _build_marginal_penalties(self, i):
         >>> df = pd.DataFrame({'cat':list('AAAABBBB'),
         ...                    'x1': [1, 2, 3, 4, 1, 2, 3, 4]})
         >>> spline = Spline('x1', num_splines=3, degree=0)
-        >>> cat = Categorical('cat')
+        >>> cat = Categorical('cat', sum_to_zero=False)
         >>> tensor = Tensor([spline, cat]).fit(df)
         >>> tensor._build_marginal_penalties(0).astype(int)
         array([[ 0,  0,  0,  0,  0,  0],
@@ -1527,7 +1528,7 @@ class Categorical(TransformerMixin, Term, BaseEstimator):
 
     >>> import pandas as pd
     >>> df = pd.DataFrame({"colors": ["red", "red", "blue", "yellow", "red"]})
-    >>> categorical = Categorical("colors")
+    >>> categorical = Categorical("colors", sum_to_zero=False)
     >>> categorical.fit_transform(df)
     array([[0., 1., 0.],
            [0., 1., 0.],
@@ -1559,13 +1560,16 @@ class Categorical(TransformerMixin, Term, BaseEstimator):
         "feature": [Interval(Integral, 0, None, closed="left"), str, None],
         "penalty": [Interval(Real, 0, None, closed="left")],
         "by": [Interval(Integral, 0, None, closed="left"), str, None],
+        "sum_to_zero": ["boolean"],
     }
 
     def __init__(
         self,
         feature=None,
+        *,
         penalty=1,
         by=None,
+        sum_to_zero=True,
         handle_unknown="error",
         min_frequency=None,
         max_categories=None,
@@ -1609,6 +1613,7 @@ def __init__(
         self.handle_unknown = handle_unknown
         self.min_frequency = min_frequency
         self.max_categories = max_categories
+        self.sum_to_zero = sum_to_zero
 
     def _validate_params(self, X):
         # Validate using BaseEsimator._validate_params, which in turn calls
@@ -1635,7 +1640,25 @@ def num_coefficients(self):
     def penalty_matrix(self):
         """Return the penalty matrix for the term."""
         super()._validate_params()  # Validate the 'penalty' parameter
-        return np.sqrt(self.penalty) * np.eye(self.num_coefficients)
+
+        # Only l2 penalties
+        if not self.sum_to_zero:
+            return np.sqrt(self.penalty) * np.eye(self.num_coefficients)
+
+        # The penalty for e.g. three coefficients will be
+        # |coef_1 + coef_2 + coef_3|^2 +
+        # |coef_1|^2 + |coef_2|^2 + |coef_3|^2
+
+        # Sum to zero penalties.
+        sum_to_zero_penalty = self.counts_ * 1e3
+
+        # L2 penalties
+        l2_penalty = np.sqrt(self.penalty) * np.eye(self.num_coefficients)
+
+        P = np.vstack((sum_to_zero_penalty, l2_penalty))
+        assert P.shape[1] == self.num_coefficients
+
+        return P
 
     def fit(self, X):
         """Fit to data.
@@ -1670,6 +1693,8 @@ def fit(self, X):
 
         self.categories_ = list(self.onehotencoder_.categories_[0])
         self.means_ = basis_matrix.mean(axis=0) * 0  # Do not shift means for Categorical
+        self.counts_ = (basis_matrix > 0).sum(axis=0)
+        assert len(self.counts_) == self.num_coefficients
 
         # Set the bounds
         self._lower_bound = np.array([-np.inf for _ in range(self.num_coefficients)])

diff --git a/generalized_additive_models/tests/test_against_other_software.py b/generalized_additive_models/tests/test_against_other_software.py
@@ -72,8 +72,8 @@ def test_that_inference_of_factors_equals_Rs_lm_function(self):
             }
         )
 
-        gender_cat = Categorical("gender", penalty=0)
-        country_cat = Categorical("country", penalty=0)
+        gender_cat = Categorical("gender", penalty=0, sum_to_zero=False)
+        country_cat = Categorical("country", penalty=0, sum_to_zero=False)
 
         gam = GAM(gender_cat + country_cat, fit_intercept=False).fit(df, df.height)
 

diff --git a/generalized_additive_models/tests/test_gam.py b/generalized_additive_models/tests/test_gam.py
@@ -82,6 +82,23 @@ def test_that_underscore_results_are_present(self):
         assert isinstance(gam._distribution.scale, Real)
 
 
+class TestCategoricalModels:
+    def test_that_unseen_category_works(self):
+        """User must set 'handle_unknown'. Prediction will be the grand mean."""
+
+        # Train a model on two categories
+        df_train = pd.DataFrame({"cat": list("abaab"), "y": [1, 3, 2, 1, 2]})
+        model = GAM(terms=Categorical("cat", handle_unknown="ignore")).fit(df_train, df_train.y)
+
+        # Predict on new data
+        df_test = pd.DataFrame({"cat": list("abcc")})
+        predictions = model.predict(df_test)
+
+        # The unknown category should be predicted as the grand mean
+        assert np.isclose(df_train.y.mean(), predictions[-1])
+        assert np.isclose(df_train.y.mean(), predictions[-2])
+
+
 class TestNonCanonicalLinks:
     @pytest.mark.parametrize("num_splines", [4, 8, 12, 16, 20, 24, 28])
     def test_that_log_link_optimization_works_california_housing(self, num_splines):
@@ -1089,6 +1106,6 @@ def test_that_quantile_fitting_finds_empirical_quantile(self, quantile):
             "--capture=sys",
             "--doctest-modules",
             "--maxfail=1",
-            "-k test_that_increasing_l2_spline_penalty_pulls_coefs_to_zero",
+            "-k TestCategoricalModels",
         ]
     )