Skip to content

Categoricals #36

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions generalized_additive_models/gam.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@ class GAM(BaseEstimator):
>>> for term in gam.terms:
... print(term, term.coef_) # doctest: +SKIP
>>> gam.score(df, y)
0.4412081401019129
0.4412081...
>>> from sklearn.metrics import r2_score
>>> r2_score(y_true=y, y_pred=predictions)
0.4412081401019129
0.4412081...
>>> gam.terms["age"]
Spline(feature='age')
>>> gam.terms["age"].coef_[:3]
array([ 0. , -11.86887791, -23.59686477])
array([ 0. , -11.86..., -23.59...])

"""

Expand Down
33 changes: 29 additions & 4 deletions generalized_additive_models/terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,8 @@ class Tensor(TransformerMixin, Term, BaseEstimator):
[1., 0., 0.],
[0., 0., 1.],
[0., 0., 1.]])
>>> te = Tensor(Categorical('color') + Categorical('grade', penalty=4))
>>> te = Tensor(Categorical('color', sum_to_zero=False) +
... Categorical('grade', penalty=4, sum_to_zero=False))
>>> te.fit_transform(df).astype(int)
array([[0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0],
Expand Down Expand Up @@ -1208,7 +1209,7 @@ def _build_marginal_penalties(self, i):
>>> df = pd.DataFrame({'cat':list('AAAABBBB'),
... 'x1': [1, 2, 3, 4, 1, 2, 3, 4]})
>>> spline = Spline('x1', num_splines=3, degree=0)
>>> cat = Categorical('cat')
>>> cat = Categorical('cat', sum_to_zero=False)
>>> tensor = Tensor([spline, cat]).fit(df)
>>> tensor._build_marginal_penalties(0).astype(int)
array([[ 0, 0, 0, 0, 0, 0],
Expand Down Expand Up @@ -1527,7 +1528,7 @@ class Categorical(TransformerMixin, Term, BaseEstimator):

>>> import pandas as pd
>>> df = pd.DataFrame({"colors": ["red", "red", "blue", "yellow", "red"]})
>>> categorical = Categorical("colors")
>>> categorical = Categorical("colors", sum_to_zero=False)
>>> categorical.fit_transform(df)
array([[0., 1., 0.],
[0., 1., 0.],
Expand Down Expand Up @@ -1559,13 +1560,16 @@ class Categorical(TransformerMixin, Term, BaseEstimator):
"feature": [Interval(Integral, 0, None, closed="left"), str, None],
"penalty": [Interval(Real, 0, None, closed="left")],
"by": [Interval(Integral, 0, None, closed="left"), str, None],
"sum_to_zero": ["boolean"],
}

def __init__(
self,
feature=None,
*,
penalty=1,
by=None,
sum_to_zero=True,
handle_unknown="error",
min_frequency=None,
max_categories=None,
Expand Down Expand Up @@ -1609,6 +1613,7 @@ def __init__(
self.handle_unknown = handle_unknown
self.min_frequency = min_frequency
self.max_categories = max_categories
self.sum_to_zero = sum_to_zero

def _validate_params(self, X):
# Validate using BaseEstimator._validate_params, which in turn calls
Expand All @@ -1635,7 +1640,25 @@ def num_coefficients(self):
def penalty_matrix(self):
    """Return the penalty matrix for the term.

    With ``sum_to_zero=False`` the result is the plain l2 (ridge) matrix
    ``sqrt(penalty) * I`` with ``num_coefficients`` rows and columns.

    With ``sum_to_zero=True`` one extra row, built from the per-category
    counts stored in ``self.counts_`` during ``fit``, is stacked on top of
    that l2 matrix. The result then has one more row than it has columns.

    Returns
    -------
    np.ndarray
        Penalty matrix with ``num_coefficients`` columns.
    """
    super()._validate_params()  # Validate the 'penalty' parameter

    # L2 penalties: sqrt(penalty) on the diagonal, shared by both modes.
    l2_penalty = np.sqrt(self.penalty) * np.eye(self.num_coefficients)

    # Only l2 penalties
    if not self.sum_to_zero:
        return l2_penalty

    # Sum to zero penalty row. Each coefficient is weighted by how many
    # observations fell in its category (self.counts_), scaled by a large
    # constant so that this row dominates the l2 rows.
    # NOTE(review): assuming the solver penalizes ||P @ coef||^2, for three
    # coefficients the total penalty is
    #   (weight * (n_1 * coef_1 + n_2 * coef_2 + n_3 * coef_3))^2 +
    #   penalty * (|coef_1|^2 + |coef_2|^2 + |coef_3|^2)
    # i.e. the *count-weighted* sum of coefficients is pushed to zero,
    # not the plain sum. Confirm against the solver.
    sum_to_zero_weight = 1e3
    sum_to_zero_penalty = self.counts_ * sum_to_zero_weight

    P = np.vstack((sum_to_zero_penalty, l2_penalty))

    # Internal sanity check: stacking must not change the number of columns.
    assert P.shape[1] == self.num_coefficients

    return P

def fit(self, X):
"""Fit to data.
Expand Down Expand Up @@ -1670,6 +1693,8 @@ def fit(self, X):

self.categories_ = list(self.onehotencoder_.categories_[0])
self.means_ = basis_matrix.mean(axis=0) * 0 # Do not shift means for Categorical
self.counts_ = (basis_matrix > 0).sum(axis=0)
assert len(self.counts_) == self.num_coefficients

# Set the bounds
self._lower_bound = np.array([-np.inf for _ in range(self.num_coefficients)])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ def test_that_inference_of_factors_equals_Rs_lm_function(self):
}
)

gender_cat = Categorical("gender", penalty=0)
country_cat = Categorical("country", penalty=0)
gender_cat = Categorical("gender", penalty=0, sum_to_zero=False)
country_cat = Categorical("country", penalty=0, sum_to_zero=False)

gam = GAM(gender_cat + country_cat, fit_intercept=False).fit(df, df.height)

Expand Down
19 changes: 18 additions & 1 deletion generalized_additive_models/tests/test_gam.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,23 @@ def test_that_underscore_results_are_present(self):
assert isinstance(gam._distribution.scale, Real)


class TestCategoricalModels:
    def test_that_unseen_category_works(self):
        """A category unseen in training is predicted as the training mean.

        The Categorical term is created with handle_unknown="ignore" so that
        predicting on a category absent from the training data is allowed.
        """

        # Fit on data containing only the categories 'a' and 'b'
        training = pd.DataFrame({"cat": list("abaab"), "y": [1, 3, 2, 1, 2]})
        term = Categorical("cat", handle_unknown="ignore")
        model = GAM(terms=term).fit(training, training.y)

        # The last two rows hold the category 'c', never seen while fitting
        unseen = pd.DataFrame({"cat": list("abcc")})
        predicted = model.predict(unseen)

        # Both rows of the unknown category should equal the grand mean
        grand_mean = training.y.mean()
        for position in (-1, -2):
            assert np.isclose(grand_mean, predicted[position])


class TestNonCanonicalLinks:
@pytest.mark.parametrize("num_splines", [4, 8, 12, 16, 20, 24, 28])
def test_that_log_link_optimization_works_california_housing(self, num_splines):
Expand Down Expand Up @@ -1089,6 +1106,6 @@ def test_that_quantile_fitting_finds_empirical_quantile(self, quantile):
"--capture=sys",
"--doctest-modules",
"--maxfail=1",
"-k test_that_increasing_l2_spline_penalty_pulls_coefs_to_zero",
"-k TestCategoricalModels",
]
)