Skip to content

Commit 2812838

Browse files
authored
Merge pull request #1172 from mindsdb/staging
Release 23.7.1.0
2 parents 3262859 + 9eadd14 commit 2812838

21 files changed

+516
-178
lines changed

lightwood/__about__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
__title__ = 'lightwood'
22
__package_name__ = 'lightwood'
3-
__version__ = '23.6.4.0'
3+
__version__ = '23.7.1.0'
44
__description__ = "Lightwood is a toolkit for automatic machine learning model building"
55
__email__ = "community@mindsdb.com"
66
__author__ = 'MindsDB Inc'

lightwood/analysis/analyze.py

+54-43
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Dict, List, Tuple, Optional
22

3+
import numpy as np
34
from dataprep_ml import StatisticalAnalysis
45

56
from lightwood.helpers.log import log
@@ -8,7 +9,7 @@
89
from lightwood.analysis.base import BaseAnalysisBlock
910
from lightwood.data.encoded_ds import EncodedDs
1011
from lightwood.encoder.text.pretrained import PretrainedLangEncoder
11-
from lightwood.api.types import ModelAnalysis, TimeseriesSettings, PredictionArguments
12+
from lightwood.api.types import ModelAnalysis, ProblemDefinition, PredictionArguments
1213

1314

1415
def model_analyzer(
@@ -17,7 +18,7 @@ def model_analyzer(
1718
train_data: EncodedDs,
1819
stats_info: StatisticalAnalysis,
1920
target: str,
20-
tss: TimeseriesSettings,
21+
pdef: ProblemDefinition,
2122
dtype_dict: Dict[str, str],
2223
accuracy_functions,
2324
ts_analysis: Dict,
@@ -39,54 +40,64 @@ def model_analyzer(
3940

4041
runtime_analyzer = {}
4142
data_type = dtype_dict[target]
43+
tss = pdef.timeseries_settings
4244

4345
# retrieve encoded data representations
4446
encoded_train_data = train_data
4547
encoded_val_data = data
4648
data = encoded_val_data.data_frame
4749
input_cols = list([col for col in data.columns if col != target])
4850

49-
# predictive task
50-
is_numerical = data_type in (dtype.integer, dtype.float, dtype.num_tsarray, dtype.quantity)
51-
is_classification = data_type in (dtype.categorical, dtype.binary, dtype.cat_tsarray)
52-
is_multi_ts = tss.is_timeseries and tss.horizon > 1
53-
has_pretrained_text_enc = any([isinstance(enc, PretrainedLangEncoder)
54-
for enc in encoded_train_data.encoders.values()])
55-
56-
# raw predictions for validation dataset
57-
args = {} if not is_classification else {"predict_proba": True}
58-
filtered_df = encoded_val_data.data_frame
59-
normal_predictions = None
60-
61-
if len(analysis_blocks) > 0:
62-
normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args))
63-
normal_predictions = normal_predictions.set_index(encoded_val_data.data_frame.index)
64-
65-
# ------------------------- #
66-
# Run analysis blocks, both core and user-defined
67-
# ------------------------- #
68-
kwargs = {
69-
'predictor': predictor,
70-
'target': target,
71-
'input_cols': input_cols,
72-
'dtype_dict': dtype_dict,
73-
'normal_predictions': normal_predictions,
74-
'data': filtered_df,
75-
'train_data': train_data,
76-
'encoded_val_data': encoded_val_data,
77-
'is_classification': is_classification,
78-
'is_numerical': is_numerical,
79-
'is_multi_ts': is_multi_ts,
80-
'stats_info': stats_info,
81-
'tss': tss,
82-
'ts_analysis': ts_analysis,
83-
'accuracy_functions': accuracy_functions,
84-
'has_pretrained_text_enc': has_pretrained_text_enc
85-
}
86-
87-
for block in analysis_blocks:
88-
log.info("The block %s is now running its analyze() method", block.__class__.__name__)
89-
runtime_analyzer = block.analyze(runtime_analyzer, **kwargs)
51+
if not pdef.embedding_only:
52+
# predictive task
53+
is_numerical = data_type in (dtype.integer, dtype.float, dtype.num_tsarray, dtype.quantity)
54+
is_classification = data_type in (dtype.categorical, dtype.binary, dtype.cat_tsarray)
55+
is_multi_ts = tss.is_timeseries and tss.horizon > 1
56+
has_pretrained_text_enc = any([isinstance(enc, PretrainedLangEncoder)
57+
for enc in encoded_train_data.encoders.values()])
58+
59+
# raw predictions for validation dataset
60+
args = {} if not is_classification else {"predict_proba": True}
61+
normal_predictions = None
62+
63+
if len(analysis_blocks) > 0:
64+
if tss.is_timeseries:
65+
# we retrieve the first entry per group (closest to supervision cutoff)
66+
if tss.group_by:
67+
encoded_val_data.data_frame['__mdb_val_idx'] = np.arange(len(encoded_val_data))
68+
idxs = encoded_val_data.data_frame.groupby(by=tss.group_by).first()['__mdb_val_idx'].values
69+
encoded_val_data.data_frame = encoded_val_data.data_frame.iloc[idxs, :]
70+
if encoded_val_data.cache_built:
71+
encoded_val_data.X_cache = encoded_val_data.X_cache[idxs, :]
72+
encoded_val_data.Y_cache = encoded_val_data.Y_cache[idxs, :]
73+
normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args))
74+
normal_predictions = normal_predictions.set_index(encoded_val_data.data_frame.index)
75+
76+
# ------------------------- #
77+
# Run analysis blocks, both core and user-defined
78+
# ------------------------- #
79+
kwargs = {
80+
'predictor': predictor,
81+
'target': target,
82+
'input_cols': input_cols,
83+
'dtype_dict': dtype_dict,
84+
'normal_predictions': normal_predictions,
85+
'data': encoded_val_data.data_frame,
86+
'train_data': train_data,
87+
'encoded_val_data': encoded_val_data,
88+
'is_classification': is_classification,
89+
'is_numerical': is_numerical,
90+
'is_multi_ts': is_multi_ts,
91+
'stats_info': stats_info,
92+
'tss': tss,
93+
'ts_analysis': ts_analysis,
94+
'accuracy_functions': accuracy_functions,
95+
'has_pretrained_text_enc': has_pretrained_text_enc
96+
}
97+
98+
for block in analysis_blocks:
99+
log.info("The block %s is now running its analyze() method", block.__class__.__name__)
100+
runtime_analyzer = block.analyze(runtime_analyzer, **kwargs)
90101

91102
# ------------------------- #
92103
# Populate ModelAnalysis object

lightwood/api/json_ai.py

+41-27
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
11
# TODO: _add_implicit_values unit test ensures NO changes for a fully specified file.
2+
import inspect
23
from copy import deepcopy
4+
5+
from type_infer.dtype import dtype
36
from type_infer.base import TypeInformation
47
from dataprep_ml import StatisticalAnalysis
58

9+
from lightwood.helpers.log import log
610
from lightwood.helpers.templating import call, inline_dict, align
7-
from lightwood.helpers.templating import _consolidate_analysis_blocks
8-
from type_infer.dtype import dtype
11+
from lightwood.helpers.templating import _consolidate_analysis_blocks, _add_cls_kwarg
912
from lightwood.api.types import (
1013
JsonAI,
1114
ProblemDefinition,
1215
)
13-
import inspect
14-
from lightwood.helpers.log import log
1516
from lightwood.__about__ import __version__ as lightwood_version
16-
17+
import lightwood.ensemble
1718

1819
# For custom modules, we create a module loader with necessary imports below
1920
IMPORT_EXTERNAL_DIRS = """
@@ -535,29 +536,29 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
535536
problem_definition = json_ai.problem_definition
536537
tss = problem_definition.timeseries_settings
537538
is_ts = tss.is_timeseries
539+
# tsa_val = "self.ts_analysis" if is_ts else None # TODO: remove
540+
mixers = json_ai.model['args']['submodels']
538541

539542
# Add implicit ensemble arguments
540-
json_ai.model["args"]["target"] = json_ai.model["args"].get("target", "$target")
541-
json_ai.model["args"]["data"] = json_ai.model["args"].get("data", "encoded_test_data")
542-
json_ai.model["args"]["mixers"] = json_ai.model["args"].get("mixers", "$mixers")
543-
json_ai.model["args"]["fit"] = json_ai.model["args"].get("fit", True)
544-
json_ai.model["args"]["args"] = json_ai.model["args"].get("args", "$pred_args") # TODO correct?
545-
546-
# @TODO: change this to per-parameter basis and signature inspection
547-
if json_ai.model["module"] in ("BestOf", "ModeEnsemble", "WeightedMeanEnsemble"):
548-
json_ai.model["args"]["accuracy_functions"] = json_ai.model["args"].get("accuracy_functions",
549-
"$accuracy_functions")
550-
551-
if json_ai.model["module"] in ("BestOf", "TsStackedEnsemble", "WeightedMeanEnsemble"):
552-
tsa_val = "self.ts_analysis" if is_ts else None
553-
json_ai.model["args"]["ts_analysis"] = json_ai.model["args"].get("ts_analysis", tsa_val)
543+
param_pairs = {
544+
'target': json_ai.model["args"].get("target", "$target"),
545+
'data': json_ai.model["args"].get("data", "encoded_test_data"),
546+
'mixers': json_ai.model["args"].get("mixers", "$mixers"),
547+
'fit': json_ai.model["args"].get("fit", True),
548+
'args': json_ai.model["args"].get("args", "$pred_args"),
549+
'accuracy_functions': json_ai.model["args"].get("accuracy_functions", "$accuracy_functions"),
550+
'ts_analysis': json_ai.model["args"].get("ts_analysis", "self.ts_analysis" if is_ts else None),
551+
'dtype_dict': json_ai.model["args"].get("dtype_dict", "$dtype_dict"),
552+
}
553+
ensemble_cls = getattr(lightwood.ensemble, json_ai.model["module"])
554+
filtered_params = {}
555+
for p_name, p_value in param_pairs.items():
556+
_add_cls_kwarg(ensemble_cls, filtered_params, p_name, p_value)
554557

555-
if json_ai.model["module"] in ("MeanEnsemble", "ModeEnsemble", "StackedEnsemble", "TsStackedEnsemble",
556-
"WeightedMeanEnsemble"):
557-
json_ai.model["args"]["dtype_dict"] = json_ai.model["args"].get("dtype_dict", "$dtype_dict")
558+
json_ai.model["args"] = filtered_params
559+
json_ai.model["args"]['submodels'] = mixers # add mixers back in
558560

559561
# Add implicit mixer arguments
560-
mixers = json_ai.model['args']['submodels']
561562
for i in range(len(mixers)):
562563
if not mixers[i].get("args", False):
563564
mixers[i]["args"] = {}
@@ -685,7 +686,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
685686
"module": "model_analyzer",
686687
"args": {
687688
"stats_info": "$statistical_analysis",
688-
"tss": "$problem_definition.timeseries_settings",
689+
"pdef": "$problem_definition",
689690
"accuracy_functions": "$accuracy_functions",
690691
"predictor": "$ensemble",
691692
"data": "encoded_test_data",
@@ -1170,7 +1171,12 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
11701171
11711172
# Prepare mixers
11721173
log.info(f'[Learn phase 6/{n_phases}] - Mixer training')
1173-
self.fit(enc_train_test)
1174+
if not self.problem_definition.embedding_only:
1175+
self.fit(enc_train_test)
1176+
else:
1177+
self.mixers = []
1178+
self.ensemble = Embedder(self.target, mixers=list(), data=enc_train_test['train'])
1179+
self.supports_proba = self.ensemble.supports_proba
11741180
11751181
# Analyze the ensemble
11761182
log.info(f'[Learn phase 7/{n_phases}] - Ensemble analysis')
@@ -1221,9 +1227,17 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
12211227
encoded_data = encoded_ds.get_encoded_data(include_target=False)
12221228
12231229
log.info(f'[Predict phase 3/{{n_phases}}] - Calling ensemble')
1224-
df = self.ensemble(encoded_ds, args=self.pred_args)
1230+
if self.pred_args.return_embedding:
1231+
embedder = Embedder(self.target, mixers=list(), data=encoded_ds)
1232+
df = embedder(encoded_ds, args=self.pred_args)
1233+
else:
1234+
df = self.ensemble(encoded_ds, args=self.pred_args)
12251235
1226-
if not self.pred_args.all_mixers:
1236+
if not(any(
1237+
[self.pred_args.all_mixers,
1238+
self.pred_args.return_embedding,
1239+
self.problem_definition.embedding_only]
1240+
)):
12271241
log.info(f'[Predict phase 4/{{n_phases}}] - Analyzing output')
12281242
df, global_insights = {call(json_ai.explainer)}
12291243
self.global_insights = {{**self.global_insights, **global_insights}}

lightwood/api/types.py

+6
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ class ProblemDefinition:
185185
timeseries_settings: TimeseriesSettings
186186
anomaly_detection: bool
187187
use_default_analysis: bool
188+
embedding_only: bool
188189
dtype_dict: Optional[dict]
189190
ignore_features: List[str]
190191
fit_on_all: bool
@@ -220,6 +221,7 @@ def from_dict(obj: Dict):
220221
ignore_features = obj.get('ignore_features', [])
221222
fit_on_all = obj.get('fit_on_all', True)
222223
use_default_analysis = obj.get('use_default_analysis', True)
224+
embedding_only = obj.get('embedding_only', False)
223225
strict_mode = obj.get('strict_mode', True)
224226
seed_nr = obj.get('seed_nr', 1)
225227
problem_definition = ProblemDefinition(
@@ -237,6 +239,7 @@ def from_dict(obj: Dict):
237239
dtype_dict=dtype_dict,
238240
ignore_features=ignore_features,
239241
use_default_analysis=use_default_analysis,
242+
embedding_only=embedding_only,
240243
fit_on_all=fit_on_all,
241244
strict_mode=strict_mode,
242245
seed_nr=seed_nr
@@ -453,6 +456,7 @@ class PredictionArguments:
453456
simple_ts_bounds: bool = False
454457
time_format: str = ''
455458
force_ts_infer: bool = False
459+
return_embedding: bool = False
456460

457461
@staticmethod
458462
def from_dict(obj: Dict):
@@ -474,6 +478,7 @@ def from_dict(obj: Dict):
474478
simple_ts_bounds = obj.get('simple_ts_bounds', PredictionArguments.simple_ts_bounds)
475479
time_format = obj.get('time_format', PredictionArguments.time_format)
476480
force_ts_infer = obj.get('force_ts_infer', PredictionArguments.force_ts_infer)
481+
return_embedding = obj.get('return_embedding', PredictionArguments.return_embedding)
477482

478483
pred_args = PredictionArguments(
479484
predict_proba=predict_proba,
@@ -485,6 +490,7 @@ def from_dict(obj: Dict):
485490
simple_ts_bounds=simple_ts_bounds,
486491
time_format=time_format,
487492
force_ts_infer=force_ts_infer,
493+
return_embedding=return_embedding,
488494
)
489495

490496
return pred_args

lightwood/data/timeseries_transform.py

+20-19
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def transform_timeseries(
109109
secondary_type_dict[oby] = dtype_dict[oby]
110110

111111
original_df[f'__mdb_original_{oby}'] = original_df[oby]
112+
original_df = _ts_to_obj(original_df, [oby] + tss.historical_columns)
112113
group_lengths = []
113114
if len(gb_arr) > 0:
114115
df_arr = []
@@ -136,30 +137,30 @@ def transform_timeseries(
136137
make_preds = [True for _ in range(len(df_arr[i]))]
137138
df_arr[i]['__make_predictions'] = make_preds
138139

139-
if len(original_df) > 500:
140+
if len(df_arr) > 1 and len(original_df) > 5000:
140141
# @TODO: restore possibility to override this with args
141-
nr_procs = get_nr_procs(original_df)
142+
biggest_sub_df = df_arr[np.argmax(group_lengths)]
143+
nr_procs = min(get_nr_procs(biggest_sub_df), len(df_arr))
142144
log.info(f'Using {nr_procs} processes to reshape.')
143-
pool = mp.Pool(processes=nr_procs)
144-
# Make type `object` so that dataframe cells can be python lists
145-
df_arr = pool.map(partial(_ts_to_obj, historical_columns=[oby] + tss.historical_columns), df_arr)
146-
df_arr = pool.map(
147-
partial(
148-
_ts_add_previous_rows, order_cols=[oby] + tss.historical_columns, window=window),
149-
df_arr)
150-
df_arr = pool.map(partial(_ts_add_future_target, target=target, horizon=tss.horizon,
151-
data_dtype=tss.target_type, mode=mode),
152-
df_arr)
153-
154-
if tss.use_previous_target:
145+
with mp.Pool(processes=nr_procs) as pool:
155146
df_arr = pool.map(
156-
partial(_ts_add_previous_target, target=target, window=tss.window),
157-
df_arr)
158-
pool.close()
159-
pool.join()
147+
partial(_ts_add_previous_rows, order_cols=[oby] + tss.historical_columns, window=window),
148+
df_arr
149+
)
150+
151+
df_arr = pool.map(
152+
partial(_ts_add_future_target, target=target, horizon=tss.horizon,
153+
data_dtype=tss.target_type, mode=mode),
154+
df_arr
155+
)
156+
157+
if tss.use_previous_target:
158+
df_arr = pool.map(
159+
partial(_ts_add_previous_target, target=target, window=tss.window),
160+
df_arr
161+
)
160162
else:
161163
for i in range(n_groups):
162-
df_arr[i] = _ts_to_obj(df_arr[i], historical_columns=[oby] + tss.historical_columns)
163164
df_arr[i] = _ts_add_previous_rows(df_arr[i],
164165
order_cols=[oby] + tss.historical_columns, window=window)
165166
df_arr[i] = _ts_add_future_target(df_arr[i], target=target, horizon=tss.horizon,

lightwood/encoder/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from lightwood.encoder.text.short import ShortTextEncoder
1010
from lightwood.encoder.text.vocab import VocabularyEncoder
1111
from lightwood.encoder.text.rnn import RnnEncoder as TextRnnEncoder
12+
from lightwood.encoder.categorical.simple_label import SimpleLabelEncoder
1213
from lightwood.encoder.categorical.onehot import OneHotEncoder
1314
from lightwood.encoder.categorical.binary import BinaryEncoder
1415
from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder
@@ -23,5 +24,5 @@
2324
__all__ = ['BaseEncoder', 'DatetimeEncoder', 'Img2VecEncoder', 'NumericEncoder', 'TsNumericEncoder',
2425
'TsArrayNumericEncoder', 'ShortTextEncoder', 'VocabularyEncoder', 'TextRnnEncoder', 'OneHotEncoder',
2526
'CategoricalAutoEncoder', 'TimeSeriesEncoder', 'ArrayEncoder', 'MultiHotEncoder', 'TsCatArrayEncoder',
26-
'NumArrayEncoder', 'CatArrayEncoder',
27+
'NumArrayEncoder', 'CatArrayEncoder', 'SimpleLabelEncoder',
2728
'PretrainedLangEncoder', 'BinaryEncoder', 'DatetimeNormalizerEncoder', 'MFCCEncoder']
+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from lightwood.encoder.categorical.onehot import OneHotEncoder
2+
from lightwood.encoder.categorical.simple_label import SimpleLabelEncoder
23
from lightwood.encoder.categorical.multihot import MultiHotEncoder
34
from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder
45

5-
__all__ = ['OneHotEncoder', 'MultiHotEncoder', 'CategoricalAutoEncoder']
6+
__all__ = ['OneHotEncoder', 'SimpleLabelEncoder', 'MultiHotEncoder', 'CategoricalAutoEncoder']

0 commit comments

Comments (0)