Commit 5496c5d

Merge pull request #941 from mindsdb/staging

Release 22.7.3.0

2 parents a590bec + be93122

23 files changed: +481 −412

docssrc/source/tutorials/tutorial_time_series/tutorial_time_series.ipynb  (+270 −270)

Large diffs are not rendered by default.

lightwood/__about__.py  (+1 −1)

@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '22.7.2.2'
+__version__ = '22.7.3.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "community@mindsdb.com"
 __author__ = 'MindsDB Inc'

lightwood/analysis/explain.py  (+3 −6)

@@ -54,12 +54,9 @@ def explain(data: pd.DataFrame,
         for col in timeseries_settings.group_by:
             row_insights[f'group_{col}'] = data[col]
 
-        for col in timeseries_settings.order_by:
-            row_insights[f'order_{col}'] = data[col]
-
-        for col in timeseries_settings.order_by:
-            row_insights[f'order_{col}'] = get_inferred_timestamps(
-                row_insights, col, ts_analysis['deltas'], timeseries_settings)
+        row_insights[f'order_{timeseries_settings.order_by}'] = data[timeseries_settings.order_by]
+        row_insights[f'order_{timeseries_settings.order_by}'] = get_inferred_timestamps(
+            row_insights, timeseries_settings.order_by, ts_analysis['deltas'], timeseries_settings)
 
     kwargs = {
         'data': data,

lightwood/analysis/helpers/feature_importance.py  (+1 −1)

@@ -38,7 +38,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         else:
             empty_input_accuracy = {}
         ignorable_input_cols = [x for x in ns.input_cols if (not ns.tss.is_timeseries or
-                                                             (x not in ns.tss.order_by and
+                                                             (x != ns.tss.order_by and
                                                               x not in ns.tss.historical_columns))]
         for col in ignorable_input_cols:
             partial_data = deepcopy(ns.encoded_val_data)
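Note: with order_by now a single string rather than a list, the old membership test `x not in ns.tss.order_by` would silently become a substring check, hence the switch to an exact comparison. A minimal illustration (column names are hypothetical):

    order_by = "timestamp"   # order_by is a plain string now
    col = "time"             # hypothetical input column

    # Substring semantics: "time" occurs inside "timestamp", so the old
    # list-style test would wrongly treat this column as the order column.
    print(col in order_by)   # True  (substring containment)
    print(col == order_by)   # False (exact match, the intended test)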

lightwood/analysis/nc/calibrate.py  (+8 −5)

@@ -9,7 +9,7 @@
 
 from lightwood.api.dtype import dtype
 from lightwood.api.types import PredictionArguments
-from lightwood.helpers.ts import add_tn_conf_bounds
+from lightwood.helpers.ts import add_tn_num_conf_bounds, add_tn_cat_conf_bounds
 
 from lightwood.data import EncodedDs
 from lightwood.analysis.base import BaseAnalysisBlock

@@ -120,7 +120,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         # fit additional ICPs in time series tasks with grouped columns
         if ns.tss.is_timeseries and ns.tss.group_by:
             # generate a multiindex
-            midx = pd.MultiIndex.from_frame(icp_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
+            midx = pd.MultiIndex.from_frame(icp_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by}']])
             icp_df.index = midx
 
             # create an ICP for each possible group

@@ -157,7 +157,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
 
         # add all predictions to DF
         icps_df = deepcopy(ns.data)
-        midx = pd.MultiIndex.from_frame(icps_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
+        midx = pd.MultiIndex.from_frame(icps_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by}']])
         icps_df.index = midx
         if ns.is_multi_ts or pred_is_list:
             icps_df[f'__predicted_{ns.target}'] = np.array([p[0] for p in ns.normal_predictions['prediction']])

@@ -380,8 +380,11 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
                                               cooldown=ns.pred_args.anomaly_cooldown)
             row_insights['anomaly'] = anomalies
 
-        if ns.tss.is_timeseries and ns.tss.horizon > 1 and is_numerical:
-            row_insights = add_tn_conf_bounds(row_insights, ns.tss)
+        if ns.tss.is_timeseries and ns.tss.horizon > 1:
+            if is_numerical:
+                row_insights = add_tn_num_conf_bounds(row_insights, ns.tss)
+            else:
+                row_insights = add_tn_cat_conf_bounds(row_insights, ns.tss)
 
         # clip bounds if necessary
         if is_numerical:
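The (group, original-timestamp) MultiIndex built in analyze() lets each group's rows be sliced out directly when fitting per-group ICPs, and with order_by now a single column the order_by[0] indexing disappears. A small self-contained sketch of the pattern (column names are hypothetical):

    import pandas as pd

    icp_df = pd.DataFrame({'store': ['a', 'a', 'b'],
                           '__mdb_original_saledate': [1, 2, 1],
                           'prediction': [10.0, 11.0, 3.0]})

    group_by, order_by = ['store'], 'saledate'   # order_by is a plain string now
    midx = pd.MultiIndex.from_frame(icp_df[[*group_by, f'__mdb_original_{order_by}']])
    icp_df.index = midx

    print(icp_df.loc['a'])   # all rows belonging to group 'a'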

lightwood/api/json_ai.py  (+22 −19)

@@ -135,7 +135,7 @@ def lookup_encoder(
     # Time-series representations require more advanced flags
     if tss.is_timeseries:
         gby = tss.group_by if tss.group_by is not None else []
-        if col_name in tss.order_by:
+        if col_name == tss.order_by:
             encoder_dict["module"] = "ArrayEncoder"
             encoder_dict["args"]["original_type"] = f'"{tss.target_type}"'
             encoder_dict["args"]["window"] = f"{tss.window}"

@@ -575,6 +575,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             mixers[i]["args"]["tss"] = mixers[i]["args"].get("tss", "$problem_definition.timeseries_settings")
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")
             mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
+            mixers[i]["args"]["use_stl"] = mixers[i]["args"].get("use_stl", "False")
 
         elif mixers[i]["module"] == "NHitsMixer":
             mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")

@@ -752,27 +753,29 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
         encoder_dict[col_name] = call(encoder)
 
     # Populate time-series specific details
+    # TODO: consider moving this to a `JsonAI override` phase
     tss = json_ai.problem_definition.timeseries_settings
-    if tss.is_timeseries and tss.use_previous_target:
-        col_name = f"__mdb_ts_previous_{json_ai.problem_definition.target}"
-        target_type = json_ai.dtype_dict[json_ai.problem_definition.target]
-        json_ai.problem_definition.timeseries_settings.target_type = target_type
-        encoder_dict[col_name] = call(
-            lookup_encoder(
-                target_type,
-                col_name,
-                False,
-                json_ai.problem_definition,
-                False,
-                None,
+    if tss.is_timeseries:
+        if tss.use_previous_target:
+            col_name = f"__mdb_ts_previous_{json_ai.problem_definition.target}"
+            target_type = json_ai.dtype_dict[json_ai.problem_definition.target]
+            json_ai.problem_definition.timeseries_settings.target_type = target_type
+            encoder_dict[col_name] = call(
+                lookup_encoder(
+                    target_type,
+                    col_name,
+                    False,
+                    json_ai.problem_definition,
+                    False,
+                    None,
+                )
             )
-        )
 
-        dtype_dict[col_name] = target_type
-        # @TODO: Is populating the json_ai at this stage even necessary?
-        json_ai.encoders[col_name] = encoder_dict[col_name]
-        json_ai.dtype_dict[col_name] = target_type
-        json_ai.dependency_dict[col_name] = []
+            dtype_dict[col_name] = target_type
+            # @TODO: Is populating the json_ai at this stage even necessary?
+            json_ai.encoders[col_name] = encoder_dict[col_name]
+            json_ai.dtype_dict[col_name] = target_type
+            json_ai.dependency_dict[col_name] = []
 
     # ----------------- #
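Since use_stl is now injected as a default mixer argument ("False"), it can be flipped on a generated JsonAI before code generation. A hedged sketch assuming lightwood's usual high-level flow and that mixer submodels live under model['args']['submodels']; the dataset, target, and column names are hypothetical:

    import pandas as pd
    from lightwood.api.types import ProblemDefinition
    from lightwood.api.high_level import json_ai_from_problem, code_from_json_ai

    df = pd.read_csv('sales.csv')  # hypothetical dataset
    pdef = ProblemDefinition.from_dict({
        'target': 'sales',
        'timeseries_settings': {'order_by': 'saledate', 'window': 8, 'horizon': 4},
    })
    json_ai = json_ai_from_problem(df, problem_definition=pdef)

    # Enable the new flag on the first mixer before generating predictor code.
    json_ai.model['args']['submodels'][0]['args']['use_stl'] = 'True'
    code = code_from_json_ai(json_ai)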

lightwood/api/types.py  (+8 −4)

@@ -110,11 +110,11 @@ class TimeseriesSettings:
 
     :param is_timeseries: Whether the input data should be treated as time series; if true, this flag is checked in \
     subsequent internal steps to ensure processing is appropriate for time-series data.
-    :param order_by: A list of columns by which the data should be ordered.
+    :param order_by: Column by which the data should be ordered.
     :param group_by: Optional list of columns by which the data should be grouped. Each different combination of values\
     for these columns will yield a different series.
    :param window: The temporal horizon (number of rows) that a model intakes to "look back" into when making a\
-    prediction, after the rows are ordered by order_by columns and split into groups if applicable.
+    prediction, after the rows are ordered by the order_by column and split into groups if applicable.
     :param horizon: The number of points in the future that predictions should be made for, defaults to 1. Once \
     trained, the model will be able to predict up to this many points into the future.
     :param historical_columns: The temporal dynamics of these columns will be used as additional context to train the \

@@ -128,7 +128,7 @@ class TimeseriesSettings:
     """  # noqa
 
     is_timeseries: bool
-    order_by: List[str] = None
+    order_by: str = None
     window: int = None
     group_by: List[str] = None
     use_previous_target: bool = True

@@ -152,11 +152,15 @@ def from_dict(obj: Dict):
        :returns: A populated ``TimeseriesSettings`` object.
        """  # noqa
        if len(obj) > 0:
-            for mandatory_setting in ["order_by", "window"]:
+            for mandatory_setting, etype in zip(["order_by", "window"], [str, int]):
                if mandatory_setting not in obj:
                    err = f"Missing mandatory timeseries setting: {mandatory_setting}"
                    log.error(err)
                    raise Exception(err)
+                if obj[mandatory_setting] and not isinstance(obj[mandatory_setting], etype):
+                    err = f"Wrong type for mandatory timeseries setting '{mandatory_setting}': found '{type(obj[mandatory_setting])}', expected '{etype}'"  # noqa
+                    log.error(err)
+                    raise Exception(err)
 
        timeseries_settings = TimeseriesSettings(
            is_timeseries=True,
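With order_by now a single string, and from_dict type-checking the mandatory settings, the old list spelling fails fast. A minimal sketch; 'saledate' is a hypothetical column name:

    from lightwood.api.types import TimeseriesSettings

    # Old list form now raises: 'order_by' must be a str, 'window' an int.
    try:
        TimeseriesSettings.from_dict({'order_by': ['saledate'], 'window': 8})
    except Exception as e:
        print(e)  # Wrong type for mandatory timeseries setting 'order_by' ...

    # The single-column string form is the supported spelling.
    tss = TimeseriesSettings.from_dict({'order_by': 'saledate', 'window': 8})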

lightwood/data/cleaner.py  (+1 −1)

@@ -394,7 +394,7 @@ def clean_timeseries(df: pd.DataFrame, tss: TimeseriesSettings) -> pd.DataFrame:
     invalid_rows = []
 
     for idx, row in df.iterrows():
-        if pd.isna(row[tss.order_by[0]]):
+        if pd.isna(row[tss.order_by]):
             invalid_rows.append(idx)
 
     df = df.drop(invalid_rows)

lightwood/data/splitter.py  (+10 −5)

@@ -93,7 +93,8 @@ def stratify(data: pd.DataFrame,
              pct_test: float,
              stratify_on: List[str],
              seed: int,
-             reshuffle: bool) -> List[pd.DataFrame]:
+             reshuffle: bool,
+             atol: float = 0.05) -> List[pd.DataFrame]:
     """
     Stratified data splitter.
 

@@ -109,6 +110,7 @@ def stratify(data: pd.DataFrame,
     :param stratify_on: Columns to consider when stratifying
     :param seed: Random state for pandas data-frame shuffling
     :param reshuffle: specify if reshuffling should be done post-split
+    :param atol: absolute tolerance for difference in stratification percentages. If violated, reverts to a non-stratified split.
 
     :returns Stratified train, dev, test dataframes
     """  # noqa

@@ -136,10 +138,13 @@ def stratify(data: pd.DataFrame,
                 for df in [train_st, dev_st, test_st]]
 
     # check that stratified lengths conform to expected percentages
-    if not np.isclose(len(train_st) / len(data), pct_train, atol=0.01) or \
-            not np.isclose(len(dev_st) / len(data), pct_dev, atol=0.01) or \
-            not np.isclose(len(test_st) / len(data), pct_test, atol=0.01):
-        log.info("Could not stratify; reverting to simple split")
+    emp_tr = len(train_st) / len(data)
+    emp_dev = len(dev_st) / len(data)
+    emp_te = len(test_st) / len(data)
+    if not np.isclose(emp_tr, pct_train, atol=atol) or \
+            not np.isclose(emp_dev, pct_dev, atol=atol) or \
+            not np.isclose(emp_te, pct_test, atol=atol):
+        log.warning(f"Stratification is outside of imposed tolerance ({atol}) ({emp_tr} train - {emp_dev} dev - {emp_te} test), reverting to a simple split.")  # noqa
         train_st, dev_st, test_st = simple_split(data, pct_train, pct_dev, pct_test)
 
     return [train_st, dev_st, test_st]
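The proportion check that decides whether to keep the stratified split is now tunable via atol (default 0.05, up from the hard-coded 0.01). A usage sketch with a synthetic dataframe; the keyword names follow the hunk above, and the leading pct_* parameters are assumed from the docstring:

    import numpy as np
    import pandas as pd
    from lightwood.data.splitter import stratify

    df = pd.DataFrame({'label': np.random.choice(['a', 'b', 'c'], size=300),
                       'x': np.random.randn(300)})

    # With small strata, hitting 80/10/10 exactly is unlikely; a looser
    # tolerance avoids the fallback to a simple, non-stratified split.
    train, dev, test = stratify(df, pct_train=0.8, pct_dev=0.1, pct_test=0.1,
                                stratify_on=['label'], seed=1, reshuffle=False,
                                atol=0.1)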

lightwood/data/timeseries_analyzer.py  (+26 −19)

@@ -10,7 +10,7 @@
 
 from lightwood.api.types import TimeseriesSettings
 from lightwood.api.dtype import dtype
-from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer
+from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer, max_pacf
 from lightwood.helpers.log import log
 from lightwood.encoder.time_series.helpers.common import generate_target_group_normalizers
 

@@ -41,6 +41,7 @@ def timeseries_analyzer(data: Dict[str, pd.DataFrame], dtype_dict: Dict[str, str
     normalizers = generate_target_group_normalizers(data['train'], target, dtype_dict, groups, tss)
 
     if dtype_dict[target] in (dtype.integer, dtype.float, dtype.num_tsarray):
+        periods = max_pacf(data['train'], groups, target, tss)  # override with PACF output
         naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(data['dev'],
                                                                              target,
                                                                              tss,

@@ -96,9 +97,10 @@ def get_grouped_naive_residuals(
     group_scale_factors = {}
     for group in group_combinations:
         idxs, subset = get_group_matches(info, group, tss.group_by)
-        residuals, scale_factor = get_naive_residuals(subset[target])  # @TODO: pass m once we handle seasonality
-        group_residuals[group] = residuals
-        group_scale_factors[group] = scale_factor
+        if subset.shape[0] > 1:
+            residuals, scale_factor = get_naive_residuals(subset[target])  # @TODO: pass m once we handle seasonality
+            group_residuals[group] = residuals
+            group_scale_factors[group] = scale_factor
     return group_residuals, group_scale_factors
 
 
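The new max_pacf helper picks candidate seasonal periods from each group's partial autocorrelation function before the naive-residual step. A toy stand-in for the idea (not the actual implementation), using statsmodels on a synthetic period-12 series:

    import numpy as np
    from statsmodels.tsa.stattools import pacf

    rng = np.random.default_rng(0)
    series = np.sin(np.arange(120) * 2 * np.pi / 12) + rng.normal(scale=0.1, size=120)

    # Rank lags by absolute partial autocorrelation (excluding lag 0) and
    # keep the strongest few as candidate seasonal periods.
    coeffs = pacf(series, nlags=24)
    candidates = (np.argsort(np.abs(coeffs[1:]))[::-1][:3] + 1).tolist()
    print(candidates)  # candidate periods to feed into the STL search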

@@ -119,34 +121,38 @@ def get_stls(train_df: pd.DataFrame,
              groups: list,
              tss: TimeseriesSettings
              ) -> Dict[str, object]:
-    stls = {}
+    stls = {'__default': None}
     for group in groups:
-        _, tr_subset = get_group_matches(train_df, group, tss.group_by)
-        _, dev_subset = get_group_matches(dev_df, group, tss.group_by)
-        group_freq = tr_subset['__mdb_inferred_freq'].iloc[0]
-        tr_subset = deepcopy(tr_subset)[target]
-        dev_subset = deepcopy(dev_subset)[target]
-        tr_subset.index = pd.date_range(start=tr_subset.iloc[0], freq=group_freq, periods=len(tr_subset)).to_period()
-        dev_subset.index = pd.date_range(start=dev_subset.iloc[0], freq=group_freq, periods=len(dev_subset)).to_period()
-        stl = _pick_ST(tr_subset, dev_subset, sps[group])
-        log.info(f'Best STL decomposition params for group {group} are: {stl["best_params"]}')
-        stls[group] = stl
+        if group != '__default':
+            _, tr_subset = get_group_matches(train_df, group, tss.group_by)
+            _, dev_subset = get_group_matches(dev_df, group, tss.group_by)
+            if tr_subset.shape[0] > 0 and dev_subset.shape[0] > 0 and sps.get(group, False):
+                group_freq = tr_subset['__mdb_inferred_freq'].iloc[0]
+                tr_subset = deepcopy(tr_subset)[target]
+                dev_subset = deepcopy(dev_subset)[target]
+                tr_subset.index = pd.date_range(start=tr_subset.iloc[0], freq=group_freq,
+                                                periods=len(tr_subset)).to_period()
+                dev_subset.index = pd.date_range(start=dev_subset.iloc[0], freq=group_freq,
+                                                 periods=len(dev_subset)).to_period()
+                stl = _pick_ST(tr_subset, dev_subset, sps[group])
+                log.info(f'Best STL decomposition params for group {group} are: {stl["best_params"]}')
+                stls[group] = stl
     return stls
 
 
-def _pick_ST(tr_subset: pd.Series, dev_subset: pd.Series, sp: int):
+def _pick_ST(tr_subset: pd.Series, dev_subset: pd.Series, sp: list):
     """
     Perform hyperparam search with optuna to find best combination of ST transforms for a time series.
 
     :param tr_subset: training series used for fitting blocks. Index should be datetime, and values are the actual time series.
     :param dev_subset: dev series used for computing loss. Index should be datetime, and values are the actual time series.
-    :param sp: seasonal period
+    :param sp: list of candidate seasonal periods
     :return: best deseasonalizer and detrender combination based on dev_loss
     """  # noqa
 
     def _ST_objective(trial: optuna.Trial):
         trend_degree = trial.suggest_categorical("trend_degree", [1, 2])
-        ds_sp = trial.suggest_categorical("ds_sp", [sp])  # seasonality period to use in deseasonalizer
+        ds_sp = trial.suggest_categorical("ds_sp", sp)  # seasonality period to use in deseasonalizer
         if min(min(tr_subset), min(dev_subset)) <= 0:
             decomp_type = trial.suggest_categorical("decomp_type", ['additive'])
         else:

@@ -161,7 +167,8 @@ def _ST_objective(trial: optuna.Trial):
         trial.set_user_attr("transformer", transformer)
         return np.power(residuals, 2).sum()
 
-    study = optuna.create_study()
+    space = {"trend_degree": [1, 2], "ds_sp": sp, "decomp_type": ['additive', 'multiplicative']}
+    study = optuna.create_study(sampler=optuna.samplers.GridSampler(space))
     study.optimize(_ST_objective, n_trials=8)
 
     return {
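Swapping the default TPE sampler for GridSampler turns the fixed 8-trial budget into an exhaustive sweep whenever the space has at most eight combinations (2 trend degrees x candidate periods x decomposition types). A self-contained sketch of the pattern; the objective is a stand-in for the STL residual loss, and the candidate periods are hypothetical:

    import optuna

    space = {"trend_degree": [1, 2], "ds_sp": [7, 12], "decomp_type": ["additive", "multiplicative"]}

    def objective(trial):
        d = trial.suggest_categorical("trend_degree", space["trend_degree"])
        sp = trial.suggest_categorical("ds_sp", space["ds_sp"])
        kind = trial.suggest_categorical("decomp_type", space["decomp_type"])
        return d + sp + (0.5 if kind == "multiplicative" else 0.0)  # stand-in loss

    study = optuna.create_study(sampler=optuna.samplers.GridSampler(space))
    study.optimize(objective, n_trials=8)  # 2 * 2 * 2 = the full grid
    print(study.best_params)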
