Skip to content

Commit 8fa4521

Browse files
authored
Merge pull request #929 from mindsdb/staging
Release 22.7.2.0
2 parents eee7234 + 75fa8cf commit 8fa4521

33 files changed

+1304
-560
lines changed

lightwood/__about__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
__title__ = 'lightwood'
22
__package_name__ = 'lightwood'
3-
__version__ = '22.6.1.2'
3+
__version__ = '22.7.2.0'
44
__description__ = "Lightwood is a toolkit for automatic machine learning model building"
55
__email__ = "community@mindsdb.com"
66
__author__ = 'MindsDB Inc'

lightwood/analysis/explain.py

+1
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ def explain(data: pd.DataFrame,
3737
# Setup base insights
3838
# ------------------------- #
3939
data = data.reset_index(drop=True)
40+
predictions = predictions.reset_index(drop=True)
4041

4142
row_insights = pd.DataFrame()
4243
global_insights = {}

lightwood/analysis/helpers/conf_stats.py

+4-3
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from copy import deepcopy
12
from typing import Dict
23
from types import SimpleNamespace
34

@@ -55,10 +56,10 @@ def _get_stats(self, confs, preds, data, target, task_type='categorical'):
5556
mce: maximum value in `bins`.
5657
global_score: 1.0 minus absolute difference between accuracy and confidence over the entire validation set.
5758
"""
58-
59+
confs = deepcopy(confs).reset_index(drop=True)
60+
sorted_preds = deepcopy(preds).reset_index(drop=True)
61+
sorted_inp = deepcopy(data).reset_index(drop=True)
5962
sorted_val = confs.sort_values(by='confidence', kind='stable')
60-
sorted_preds = preds.reindex(sorted_val.index)
61-
sorted_inp = data.reindex(sorted_val.index)
6263
sorted_inp['__mdb_confidence'] = sorted_val['confidence']
6364

6465
if task_type == 'categorical':

lightwood/analysis/nc/calibrate.py

+25-8
Original file line number | Diff line number | Diff line change
@@ -90,8 +90,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
9090
icp = icp_class(nc, cal_size=self.validation_size)
9191

9292
output['icp']['__default'] = icp
93+
icp_df = deepcopy(ns.data)
9394

9495
# setup prediction cache to avoid additional .predict() calls
96+
pred_is_list = isinstance(ns.normal_predictions['prediction'], list) and \
97+
isinstance(ns.normal_predictions['prediction'][0], list)
9598
if ns.is_classification:
9699
if ns.predictor.supports_proba:
97100
icp.nc_function.model.prediction_cache = ns.normal_predictions[all_cat_cols].values
@@ -105,7 +108,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
105108
predicted_classes = pd.get_dummies(preds).values # inflate to one-hot enc
106109
icp.nc_function.model.prediction_cache = predicted_classes
107110

108-
elif ns.is_multi_ts:
111+
elif ns.is_multi_ts or pred_is_list:
109112
# we fit ICPs for time series confidence bounds only at t+1 forecast
110113
icp.nc_function.model.prediction_cache = np.array([p[0] for p in ns.normal_predictions['prediction']])
111114
else:
@@ -116,6 +119,9 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
116119

117120
# fit additional ICPs in time series tasks with grouped columns
118121
if ns.tss.is_timeseries and ns.tss.group_by:
122+
# generate a multiindex
123+
midx = pd.MultiIndex.from_frame(icp_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
124+
icp_df.index = midx
119125

120126
# create an ICP for each possible group
121127
group_info = ns.data[ns.tss.group_by].to_dict('list')
@@ -127,7 +133,6 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
127133
output['icp'][tuple(combination)] = deepcopy(icp)
128134

129135
# calibrate ICP
130-
icp_df = deepcopy(ns.data)
131136
icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None))
132137
output['icp']['__default'].index = icp_df.columns
133138
output['icp']['__default'].calibrate(icp_df.values, y)
@@ -137,11 +142,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
137142
icp_df, icp, ns.dtype_dict[ns.target],
138143
output, positive_domain=self.positive_domain, significance=self.fixed_significance)
139144
if not ns.is_classification:
140-
result_df = pd.DataFrame(index=ns.data.index, columns=['confidence', 'lower', 'upper'], dtype=float)
145+
result_df = pd.DataFrame(index=icp_df.index, columns=['confidence', 'lower', 'upper'], dtype=float)
141146
result_df.loc[icp_df.index, 'lower'] = ranges[:, 0]
142147
result_df.loc[icp_df.index, 'upper'] = ranges[:, 1]
143148
else:
144-
result_df = pd.DataFrame(index=ns.data.index, columns=['confidence'], dtype=float)
149+
result_df = pd.DataFrame(index=icp_df.index, columns=['confidence'], dtype=float)
145150

146151
result_df.loc[icp_df.index, 'confidence'] = conf
147152

@@ -152,10 +157,12 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
152157

153158
# add all predictions to DF
154159
icps_df = deepcopy(ns.data)
155-
if ns.is_multi_ts:
156-
icps_df[f'__predicted_{ns.target}'] = [p[0] for p in ns.normal_predictions['prediction']]
160+
midx = pd.MultiIndex.from_frame(icps_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
161+
icps_df.index = midx
162+
if ns.is_multi_ts or pred_is_list:
163+
icps_df[f'__predicted_{ns.target}'] = np.array([p[0] for p in ns.normal_predictions['prediction']])
157164
else:
158-
icps_df[f'__predicted_{ns.target}'] = ns.normal_predictions['prediction']
165+
icps_df[f'__predicted_{ns.target}'] = np.array(ns.normal_predictions['prediction'])
159166

160167
for group in icps['__mdb_groups']:
161168
icp_df = icps_df
@@ -207,6 +214,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
207214
# consolidate all groups here
208215
output['icp']['__mdb_active'] = True
209216

217+
result_df.index = ns.data.index
210218
output['result_df'] = result_df
211219

212220
info = {**info, **output}
@@ -216,12 +224,21 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
216224
**kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]:
217225
ns = SimpleNamespace(**kwargs)
218226

227+
if 'confidence' in ns.predictions.columns:
228+
# bypass calibrator if model already outputs confidence
229+
row_insights['prediction'] = ns.predictions['prediction']
230+
row_insights['confidence'] = ns.predictions['confidence']
231+
if 'upper' in ns.predictions.columns and 'lower' in ns.predictions.columns:
232+
row_insights['upper'] = ns.predictions['upper']
233+
row_insights['lower'] = ns.predictions['lower']
234+
return row_insights, global_insights
235+
219236
if ns.analysis['icp']['__mdb_active']:
220237
icp_X = deepcopy(ns.data)
221238

222239
# replace observed data w/predictions
223240
preds = ns.predictions['prediction']
224-
if ns.tss.is_timeseries and ns.tss.horizon > 1:
241+
if ns.tss.is_timeseries and (ns.tss.horizon > 1 or isinstance(preds[0], list)):
225242
preds = [p[0] for p in preds]
226243

227244
for col in [f'timestep_{i}' for i in range(1, ns.tss.horizon)]:

lightwood/api/json_ai.py

+67-33
Original file line number | Diff line number | Diff line change
@@ -220,27 +220,46 @@ def generate_json_ai(
220220
):
221221
is_target_predicting_encoder = True
222222

223+
submodels = []
223224
if is_target_predicting_encoder:
224-
submodels = [
225-
{
226-
"module": "Unit",
227-
"args": {
228-
"target_encoder": "$encoders[self.target]",
229-
"stop_after": "$problem_definition.seconds_per_mixer",
230-
},
231-
}
232-
]
225+
submodels.extend(
226+
[
227+
{
228+
"module": "Unit",
229+
"args": {
230+
"target_encoder": "$encoders[self.target]",
231+
"stop_after": "$problem_definition.seconds_per_mixer",
232+
},
233+
}
234+
]
235+
)
233236
else:
234-
submodels = [
235-
{
236-
"module": "Neural",
237-
"args": {
238-
"fit_on_dev": True,
239-
"stop_after": "$problem_definition.seconds_per_mixer",
240-
"search_hyperparameters": True,
241-
},
242-
}
243-
]
237+
if not tss.is_timeseries:
238+
submodels.extend(
239+
[
240+
{
241+
"module": "Neural",
242+
"args": {
243+
"fit_on_dev": True,
244+
"stop_after": "$problem_definition.seconds_per_mixer",
245+
"search_hyperparameters": True,
246+
},
247+
}
248+
]
249+
)
250+
else:
251+
submodels.extend(
252+
[
253+
{
254+
"module": "NeuralTs",
255+
"args": {
256+
"fit_on_dev": True,
257+
"stop_after": "$problem_definition.seconds_per_mixer",
258+
"search_hyperparameters": True,
259+
},
260+
}
261+
]
262+
)
244263

245264
if (not tss.is_timeseries or tss.horizon == 1) and dtype_dict[target] not in (dtype.num_array, dtype.cat_array):
246265
submodels.extend(
@@ -268,7 +287,8 @@ def generate_json_ai(
268287
"args": {
269288
"fit_on_dev": True,
270289
"stop_after": "$problem_definition.seconds_per_mixer",
271-
"horizon": "$problem_definition.timeseries_settings.horizon",
290+
"ts_analysis": "$ts_analysis",
291+
"tss": "$problem_definition.timeseries_settings",
272292
},
273293
}
274294
]
@@ -494,29 +514,30 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
494514
is_ts = tss.is_timeseries
495515

496516
# Add implicit arguments
497-
# @TODO: Consider removing once we have a proper editor in studio
498517
mixers = json_ai.model['args']['submodels']
499518
for i in range(len(mixers)):
500519
if mixers[i]["module"] == "Unit":
501520
pass
502521

503-
elif mixers[i]["module"] == "Neural":
522+
elif mixers[i]["module"] in ("Neural", "NeuralTs"):
504523
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
505524
"target_encoder", "$encoders[self.target]"
506525
)
507526
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
508527
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
509528
"dtype_dict", "$dtype_dict"
510529
)
511-
mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get(
512-
"timeseries_settings", "$problem_definition.timeseries_settings"
513-
)
514530
mixers[i]["args"]["net"] = mixers[i]["args"].get(
515531
"net",
516532
'"DefaultNet"'
517533
if not tss.is_timeseries or not tss.use_previous_target
518534
else '"ArNet"',
519535
)
536+
if mixers[i]["module"] == "NeuralTs":
537+
mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get(
538+
"timeseries_settings", "$problem_definition.timeseries_settings"
539+
)
540+
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")
520541

521542
elif mixers[i]["module"] == "LightGBM":
522543
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
@@ -551,8 +572,17 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
551572
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
552573
"target_encoder", "$encoders[self.target]"
553574
)
554-
if "horizon" not in mixers[i]["args"]:
555-
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
575+
mixers[i]["args"]["tss"] = mixers[i]["args"].get("tss", "$problem_definition.timeseries_settings")
576+
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")
577+
mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
578+
579+
elif mixers[i]["module"] == "NHitsMixer":
580+
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
581+
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
582+
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
583+
"ts_analysis", "$ts_analysis"
584+
)
585+
problem_definition.fit_on_all = False # takes too long otherwise
556586

557587
elif mixers[i]["module"] in ("SkTime", "ProphetMixer"):
558588
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
@@ -666,6 +696,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
666696
"dtype_dict": "$dtype_dict",
667697
"target": "$target",
668698
"mode": "$mode",
699+
"ts_analysis": "$ts_analysis"
669700
},
670701
},
671702
"timeseries_analyzer": {
@@ -807,11 +838,6 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
807838
808839
# Time-series blocks
809840
{ts_transform_code}
810-
"""
811-
if ts_analyze_code is not None:
812-
clean_body += f"""
813-
if self.mode != 'predict':
814-
{align(ts_analyze_code,1)}
815841
"""
816842

817843
clean_body += '\nreturn data'
@@ -835,12 +861,19 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
835861
# Prepare features Body
836862
# ----------------- #
837863

838-
prepare_body = f"""
864+
prepare_body = """
839865
self.mode = 'train'
840866
841867
if self.statistical_analysis is None:
842868
raise Exception("Please run analyze_data first")
869+
"""
870+
if ts_analyze_code is not None:
871+
prepare_body += f"""
872+
if self.mode != 'predict':
873+
{align(ts_analyze_code, 1)}
874+
"""
843875

876+
prepare_body += f"""
844877
# Column to encoder mapping
845878
self.encoders = {inline_dict(encoder_dict)}
846879
@@ -1133,6 +1166,7 @@ def __init__(self):
11331166
11341167
# Initial stats analysis
11351168
self.statistical_analysis = None
1169+
self.ts_analysis = None
11361170
self.runtime_log = dict()
11371171
11381172
@timed

lightwood/api/types.py

+2
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,7 @@ class StatisticalAnalysis:
7373
:param bias:
7474
:param avg_words_per_sentence:
7575
:param positive_domain:
76+
:param ts_stats:
7677
""" # noqa
7778

7879
nr_rows: int
@@ -87,6 +88,7 @@ class StatisticalAnalysis:
8788
bias: object
8889
avg_words_per_sentence: object
8990
positive_domain: bool
91+
ts_stats: dict
9092

9193

9294
@dataclass_json

lightwood/data/cleaner.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -147,13 +147,15 @@ def _standardize_datetime(element: object) -> Optional[float]:
147147
"""
148148
Parses an expected date-time element. Intakes an element that can in theory be anything.
149149
"""
150+
if element is None or pd.isna(element):
151+
return 0.0 # correct? TODO: Remove if the TS encoder can handle `None`
150152
try:
151153
date = parse_dt(str(element))
152154
except Exception:
153155
try:
154156
date = datetime.datetime.utcfromtimestamp(element)
155157
except Exception:
156-
return None
158+
return 0.0
157159

158160
return date.timestamp()
159161

lightwood/data/encoded_ds.py

+2-11
Original file line number | Diff line number | Diff line change
@@ -140,13 +140,15 @@ class ConcatedEncodedDs(EncodedDs):
140140
"""
141141
`ConcatedEncodedDs` abstracts over multiple encoded datasources (`EncodedDs`) as if they were a single entity.
142142
""" # noqa
143+
# TODO: We should probably delete this abstraction, it's not really useful and it adds complexity/overhead
143144
def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None:
144145
# @TODO: missing super() call here?
145146
self.encoded_ds_arr = encoded_ds_arr
146147
self.encoded_ds_lenghts = [len(x) for x in self.encoded_ds_arr]
147148
self.encoders = self.encoded_ds_arr[0].encoders
148149
self.encoder_spans = self.encoded_ds_arr[0].encoder_spans
149150
self.target = self.encoded_ds_arr[0].target
151+
self.data_frame = pd.concat([x.data_frame for x in self.encoded_ds_arr])
150152

151153
def __len__(self):
152154
"""
@@ -166,17 +168,6 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
166168
idx -= length
167169
raise StopIteration()
168170

169-
@property
170-
def data_frame(self) -> pd.DataFrame:
171-
"""
172-
Property that concatenates all underlying `EncodedDs`'s dataframes and returns them.
173-
174-
Note: be careful to not modify a `ConcatedEncodedDs`, as you can see in the source, it will not have an effect.
175-
176-
:return: Dataframe with all original data.
177-
""" # noqa
178-
return pd.concat([x.data_frame for x in self.encoded_ds_arr])
179-
180171
def get_column_original_data(self, column_name: str) -> pd.Series:
181172
"""
182173
See `lightwood.data.encoded_ds.EncodedDs.get_column_original_data()`.

0 commit comments

Comments
 (0)