Commit 5cf15bb

Merge pull request #954 from mindsdb/staging
Release 22.7.4.0
2 parents 5496c5d + 99c3fdc commit 5cf15bb

19 files changed, +170 -66 lines

docssrc/source/index.rst (+1 -1)

@@ -116,7 +116,7 @@ To train a ``Predictor`` end-to-end, starting with unprocessed data, users can u

     # Make the train/test splits and show predictions for a few examples
     test_df = predictor.split(predictor.preprocess(df))["test"]
-    preds = predictor.predict(test).iloc[:10]
+    preds = predictor.predict(test_df).iloc[:10]
     print(preds)

 BYOM: Bring your own models

lightwood/__about__.py (+1 -1)

@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '22.7.3.0'
+__version__ = '22.7.4.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "community@mindsdb.com"
 __author__ = 'MindsDB Inc'

lightwood/analysis/helpers/acc_stats.py (+2 -2)

@@ -21,8 +21,8 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         ns = SimpleNamespace(**kwargs)

         if ns.accuracy_functions == ['evaluate_array_accuracy'] and ns.ts_analysis.get('ts_naive_mae', {}):
-            accuracy_functions = ['bounded_evaluate_array_accuracy']
-            log.info("AccStats will bound the array accuracy for reporting purposes. Check `bounded_evaluate_array_accuracy` for a description of the bounding procedure.")  # noqa
+            accuracy_functions = ['bounded_ts_accuracy']
+            log.info("AccStats will bound the array accuracy for reporting purposes. Check `bounded_ts_accuracy` for a description of the bounding procedure.")  # noqa
         else:
             accuracy_functions = ns.accuracy_functions

lightwood/analysis/nc/calibrate.py (+6 -4)

@@ -375,10 +375,12 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]

         # anomaly detection
         if is_anomaly_task:
-            anomalies = get_anomalies(row_insights,
-                                      ns.data[ns.target_name],
-                                      cooldown=ns.pred_args.anomaly_cooldown)
-            row_insights['anomaly'] = anomalies
+            row_insights['anomaly'] = None
+            if ns.target_name in ns.data.columns:
+                anomalies = get_anomalies(row_insights,
+                                          ns.data[ns.target_name],
+                                          cooldown=ns.pred_args.anomaly_cooldown)
+                row_insights['anomaly'] = anomalies

         if ns.tss.is_timeseries and ns.tss.horizon > 1:
             if is_numerical:
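
The reordering above makes `anomaly` default to `None` and computes real flags only when the target column is actually present in the incoming data, which may not be the case at inference time. A minimal sketch of the guard, with hypothetical column names:

import pandas as pd

row_insights = pd.DataFrame({'prediction': [10.2, 10.9, 35.0]})
incoming = pd.DataFrame({'timestamp': [1, 2, 3]})  # no ground-truth 'traffic' column at inference time

row_insights['anomaly'] = None  # safe default when ground truth is absent
if 'traffic' in incoming.columns:
    # the real code would call get_anomalies(row_insights, incoming['traffic'], cooldown=...) here
    pass

print(row_insights['anomaly'].tolist())  # [None, None, None]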

lightwood/api/json_ai.py (+17 -3)

@@ -303,6 +303,20 @@ def generate_json_ai(
                     "stop_after": "$problem_definition.seconds_per_mixer",
                     "horizon": "$problem_definition.timeseries_settings.horizon",
                 },
+            },
+            {
+                "module": "ETSMixer",
+                "args": {
+                    "stop_after": "$problem_definition.seconds_per_mixer",
+                    "horizon": "$problem_definition.timeseries_settings.horizon",
+                },
+            },
+            {
+                "module": "ARIMAMixer",
+                "args": {
+                    "stop_after": "$problem_definition.seconds_per_mixer",
+                    "horizon": "$problem_definition.timeseries_settings.horizon",
+                },
             }
         ]
     )
@@ -361,7 +375,7 @@ def generate_json_ai(
     elif output_dtype in [dtype.categorical, dtype.tags, dtype.binary]:
         accuracy_functions = ["balanced_accuracy_score"]
     elif output_dtype in (dtype.num_array, dtype.num_tsarray):
-        accuracy_functions = ["evaluate_num_array_accuracy"]
+        accuracy_functions = ["bounded_ts_accuracy"]
     elif output_dtype in (dtype.cat_array, dtype.cat_tsarray):
         accuracy_functions = ["evaluate_cat_array_accuracy"]
     else:
@@ -371,7 +385,7 @@ def generate_json_ai(

     if is_ts:
         if output_dtype in [dtype.integer, dtype.float]:
-            accuracy_functions = ["evaluate_num_array_accuracy"]  # forces this acc fn for t+1 time series forecasters
+            accuracy_functions = ["bounded_ts_accuracy"]  # forces this acc fn for t+1 time series forecasters  # noqa

     if output_dtype in (dtype.integer, dtype.float, dtype.num_tsarray):
         imputers.append({"module": "NumericalImputer",
@@ -585,7 +599,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             )
             problem_definition.fit_on_all = False  # takes too long otherwise

-        elif mixers[i]["module"] in ("SkTime", "ProphetMixer"):
+        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer"):
             mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
             mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
                 "dtype_dict", "$dtype_dict"

lightwood/data/statistical_analysis.py (+5 -2)

@@ -90,7 +90,10 @@ def statistical_analysis(data: pd.DataFrame,
     missing = {}
     distinct = {}
     for col in columns:
-        missing[col] = len([x for x in df[col] if x is None]) / len(df[col]) if len(df[col]) else 0
+        missing[col] = {
+            'missing': len([x for x in df[col] if x is None]) / len(df[col]) if len(df[col]) else 0,
+            'description': 'Proportion of missing values for the column. Columns with high % of missing values may not be as useful for modelling purposes.'  # noqa
+        }
         distinct[col] = len(set([str(x) for x in df[col]])) / len(df[col]) if len(df[col]) else 0

     nr_rows = len(df)
@@ -157,7 +160,7 @@ def statistical_analysis(data: pd.DataFrame,
         S, biased_buckets = compute_entropy_biased_buckets(histograms[col])
         bias[col] = {
             'entropy': S,
-            'description': """Under the assumption of uniformly distributed data (i.e., same probability for Head or Tails on a coin flip) mindsdb tries to detect potential divergences from such case, and it calls this "potential bias". Thus by our data having any potential bias mindsdb means any divergence from all categories having the same probability of being selected.""",  # noqa
+            'description': """"Potential bias" is flagged when data does not distribute normally or uniformly, likely over-representing or under-representing some values. This may be normal, hence bias is only "potential".""",  # noqa
             'biased_buckets': biased_buckets
         }
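
Note that `missing` changes shape here: it used to map each column to a bare float, and now maps to a small dict carrying the ratio plus a human-readable description. A self-contained sketch of the updated access pattern for downstream consumers (column names hypothetical):

# hypothetical output mirroring the new structure
missing = {
    'rainfall': {'missing': 0.62, 'description': 'Proportion of missing values for the column. ...'},
    'temperature': {'missing': 0.01, 'description': 'Proportion of missing values for the column. ...'},
}

# before this commit: ratio = missing[col] (a bare float)
for col, info in missing.items():
    if info['missing'] > 0.5:  # the ratio now lives under the 'missing' key
        print(f"{col}: {info['missing']:.0%} missing")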

lightwood/data/timeseries_analyzer.py (+7 -8)

@@ -10,7 +10,7 @@

 from lightwood.api.types import TimeseriesSettings
 from lightwood.api.dtype import dtype
-from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer, max_pacf
+from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer
 from lightwood.helpers.log import log
 from lightwood.encoder.time_series.helpers.common import generate_target_group_normalizers

@@ -36,16 +36,12 @@ def timeseries_analyzer(data: Dict[str, pd.DataFrame], dtype_dict: Dict[str, str
     """  # noqa
     tss = timeseries_settings
     groups = get_ts_groups(data['train'], tss)
-    deltas, periods, freqs = get_delta(data['train'], dtype_dict, groups, tss)
+    deltas, periods, freqs = get_delta(data['train'], dtype_dict, groups, target, tss)

     normalizers = generate_target_group_normalizers(data['train'], target, dtype_dict, groups, tss)

     if dtype_dict[target] in (dtype.integer, dtype.float, dtype.num_tsarray):
-        periods = max_pacf(data['train'], groups, target, tss)  # override with PACF output
-        naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(data['dev'],
-                                                                             target,
-                                                                             tss,
-                                                                             groups)
+        naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(data['dev'], target, tss, groups)
         differencers = get_differencers(data['train'], target, groups, tss.group_by)
         stl_transforms = get_stls(data['train'], data['dev'], target, periods, groups, tss)
     else:
@@ -71,6 +67,9 @@ def get_naive_residuals(target_data: pd.DataFrame, m: int = 1) -> Tuple[List, fl
     Computes forecasting residuals for the naive method (forecasts for time `t` is the value observed at `t-1`).
     Useful for computing MASE forecasting error.

+    As per arxiv.org/abs/2203.10716, we resort to a constant forecast based on the last-seen measurement across the entire horizon.
+    By following the original measure, the naive forecaster would have the advantage of knowing the actual values whereas the predictor would not.
+
     Note: method assumes predictions are all for the same group combination. For a dataframe that contains multiple
     series, use `get_grouped_naive_resiudals`.

@@ -80,7 +79,7 @@ def get_naive_residuals(target_data: pd.DataFrame, m: int = 1) -> Tuple[List, fl
     :return: (list of naive residuals, average residual value)
     """  # noqa
     # @TODO: support categorical series as well
-    residuals = target_data.rolling(window=m + 1).apply(lambda x: abs(x.iloc[m] - x.iloc[0]))[m:].values.flatten()
+    residuals = np.abs(target_data.values[1:] - target_data.values[0]).flatten()
     scale_factor = np.average(residuals)
     return residuals.tolist(), scale_factor
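
To make the new residual definition concrete: rather than a rolling one-step-behind comparison, every observation after the first is compared against a single constant forecast (the first value of the evaluated window), in line with the constant-forecast baseline cited above. A standalone arithmetic sketch:

import numpy as np
import pandas as pd

target_data = pd.Series([10.0, 12.0, 9.0, 15.0])
residuals = np.abs(target_data.values[1:] - target_data.values[0]).flatten()
print(residuals.tolist())            # [2.0, 1.0, 5.0]
print(float(np.average(residuals)))  # 2.666..., used as the MASE scale factor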

lightwood/data/timeseries_transform.py (+8 -5)

@@ -51,7 +51,7 @@ def transform_timeseries(
     oby_col = tss.order_by
     groups = get_ts_groups(data, tss)
     if not ts_analysis:
-        _, periods, freqs = get_delta(data, dtype_dict, groups, tss)
+        _, periods, freqs = get_delta(data, dtype_dict, groups, target, tss)
     else:
         periods = ts_analysis['periods']
         freqs = ts_analysis['sample_freqs']
@@ -60,13 +60,16 @@ def transform_timeseries(
     subsets = []
     for group in groups:
         if (tss.group_by and group != '__default') or not tss.group_by:
-            if periods[group] == 0:
-                raise Exception(
-                    f"Partition is not valid, faulty group {group}. Please make sure you group by a set of columns that ensures unique measurements for each grouping through time.")  # noqa
             idxs, subset = get_group_matches(data, group, tss.group_by)
             if subset.shape[0] > 0:
+                if periods.get(group, periods['__default']) == 0 and subset.shape[0] > 1:
+                    raise Exception(
+                        f"Partition is not valid, faulty group {group}. Please make sure you group by a set of columns that ensures unique measurements for each grouping through time.")  # noqa
+
                 index = pd.to_datetime(subset[oby_col], unit='s')
-                subset.index = pd.date_range(start=index.iloc[0], freq=freqs[group], periods=len(subset))
+                subset.index = pd.date_range(start=index.iloc[0],
+                                             freq=freqs.get(group, freqs['__default']),
+                                             periods=len(subset))
                 subset['__mdb_inferred_freq'] = subset.index.freq  # sets constant column because pd.concat forgets freq (see: https://github.com/pandas-dev/pandas/issues/3232)  # noqa
                 subsets.append(subset)
     original_df = pd.concat(subsets).sort_values(by='__mdb_original_index')
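
The `.get(group, ...)` fallbacks mean a group combination first seen at inference time inherits the `__default` period and frequency instead of raising a `KeyError`. The same pattern in isolation (group keys are hypothetical):

import pandas as pd

freqs = {'__default': 'D', ('store_1',): 'H'}  # per-group inferred frequencies
group = ('store_2',)                           # group absent from training data
idx = pd.date_range(start=pd.Timestamp('2022-01-01'),
                    freq=freqs.get(group, freqs['__default']),
                    periods=3)
print(list(idx))  # falls back to daily: Jan 1, 2 and 3 of 2022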

lightwood/helpers/__init__.py (+2 -2)

@@ -2,7 +2,7 @@
 from lightwood.helpers.device import is_cuda_compatible, get_devices
 from lightwood.helpers.general import mase, is_none, evaluate_accuracy, evaluate_num_array_accuracy,\
     evaluate_array_accuracy, evaluate_multilabel_accuracy, evaluate_regression_accuracy, evaluate_cat_array_accuracy, \
-    bounded_evaluate_array_accuracy
+    bounded_ts_accuracy
 from lightwood.helpers.ts import get_group_matches, get_ts_groups, get_inferred_timestamps, add_tn_num_conf_bounds, \
     add_tn_cat_conf_bounds
 from lightwood.helpers.io import read_from_path_or_url
@@ -17,7 +17,7 @@

 __all__ = ['to_binary', 'f1_score', 'recall_score', 'precision_score', 'r2_score', 'is_cuda_compatible', 'get_devices',
            'get_group_matches', 'get_ts_groups', 'mase', 'is_none', 'evaluate_accuracy', 'evaluate_num_array_accuracy',
-           'evaluate_array_accuracy', 'evaluate_cat_array_accuracy', 'bounded_evaluate_array_accuracy',
+           'evaluate_array_accuracy', 'evaluate_cat_array_accuracy', 'bounded_ts_accuracy',
            'evaluate_multilabel_accuracy', 'evaluate_regression_accuracy', 'read_from_path_or_url', 'get_nr_procs',
            'mut_method_call', 'run_mut_method', 'tokenize_text', 'analyze_sentences', 'decontracted', 'contains_alnum',
            'get_identifier_description', 'get_identifier_description_mp', 'get_pct_auto_increment',

lightwood/helpers/general.py (+13 -9)

@@ -1,4 +1,5 @@
 import importlib
+from copy import deepcopy
 from typing import List, Dict, Optional
 import numpy as np
 import pandas as pd
@@ -28,7 +29,7 @@ def evaluate_accuracy(data: pd.DataFrame,
     score_dict = {}

     for accuracy_function_str in accuracy_functions:
-        if 'array_accuracy' in accuracy_function_str:
+        if 'array_accuracy' in accuracy_function_str or accuracy_function_str in ('bounded_ts_accuracy', ):
             if ts_analysis is None or not ts_analysis['tss'].is_timeseries:
                 # normal array, needs to be expanded
                 cols = [target]
@@ -47,7 +48,7 @@
             elif accuracy_function_str == 'evaluate_cat_array_accuracy':
                 acc_fn = evaluate_cat_array_accuracy
             else:
-                acc_fn = bounded_evaluate_array_accuracy
+                acc_fn = bounded_ts_accuracy
             score_dict[accuracy_function_str] = acc_fn(true_values,
                                                        predictions,
                                                        data=data[cols],
@@ -204,7 +205,7 @@ def evaluate_cat_array_accuracy(
         base_acc_fn=balanced_accuracy_score)


-def bounded_evaluate_array_accuracy(
+def bounded_ts_accuracy(
     true_values: pd.Series,
     predictions: pd.Series,
     **kwargs
@@ -216,15 +217,18 @@ def bounded_evaluate_array_accuracy(
     For worse-than-naive, it scales linearly (with a factor).
     For better-than-naive, we fix 10 as 0.99, and scaled-logarithms (with 10 and 1e4 cutoffs as respective bases) are used to squash all remaining preimages to values between 0.5 and 1.0.
     """  # noqa
-    result = evaluate_array_accuracy(np.array(true_values),
-                                     np.array(predictions),
-                                     **kwargs)
-    if 10 < result <= 1e4:
+    true_values = deepcopy(true_values)
+    predictions = deepcopy(predictions)
+    result = evaluate_num_array_accuracy(true_values,
+                                         predictions,
+                                         **kwargs)
+    sp = 5
+    if sp < result <= 1e4:
         step_base = 0.99
         return step_base + (np.log(result) / np.log(1e4)) * (1 - step_base)
-    elif 1 <= result <= 10:
+    elif 1 <= result <= sp:
         step_base = 0.5
-        return step_base + (np.log(result) / np.log(10)) * (0.99 - step_base)
+        return step_base + (np.log(result) / np.log(sp)) * (0.99 - step_base)
     else:
         return result / 2  # worse than naive
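
To see what the renamed `bounded_ts_accuracy` does to a raw score, here is the piecewise squashing in isolation; note the turning point is now `sp = 5`, even though the docstring above still says 10. Constants are copied from the hunk, and the raw `result` is supplied directly instead of coming from `evaluate_num_array_accuracy`:

import numpy as np

def bound(result: float, sp: float = 5, cap: float = 1e4) -> float:
    # mirrors the mapping in bounded_ts_accuracy
    if sp < result <= cap:
        return 0.99 + (np.log(result) / np.log(cap)) * (1 - 0.99)
    elif 1 <= result <= sp:
        return 0.5 + (np.log(result) / np.log(sp)) * (0.99 - 0.5)
    return result / 2  # worse than naive

for r in (0.5, 1.0, 5.0, 100.0, 1e4):
    print(r, round(bound(r), 4))
# 0.5 -> 0.25, 1.0 -> 0.5, 5.0 -> 0.99, 100.0 -> 0.995, 10000.0 -> 1.0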

lightwood/helpers/ts.py (+16 -8)

@@ -42,13 +42,15 @@
     df: pd.DataFrame,
     dtype_dict: dict,
     group_combinations: list,
+    target: str,
     tss
 ) -> Tuple[Dict, Dict, Dict]:
     """
     Infer the sampling interval of each time series, by picking the most popular time interval observed in the training data.

     :param df: Dataframe with time series data.
     :param group_combinations: all tuples with distinct values for `TimeseriesSettings.group_by` columns, defining all available time series.
+    :param target: name of target column
     :param tss: timeseries settings

     :return:
@@ -58,8 +60,8 @@
     original_col = f'__mdb_original_{tss.order_by}'
     order_col = original_col if original_col in df.columns else tss.order_by
     deltas = {"__default": df[order_col].astype(float).rolling(window=2).apply(np.diff).value_counts().index[0]}
-    freq, period = detect_freq_period(deltas["__default"], tss)
-    periods = {"__default": period}
+    freq, period = detect_freq_period(deltas["__default"], tss, len(df))
+    periods = {"__default": [period]}
     freqs = {"__default": freq}

     if tss.group_by:
@@ -68,12 +70,15 @@
             _, subset = get_group_matches(df, group, tss.group_by)
             if subset.shape[0] > 1:
                 deltas[group] = subset[order_col].rolling(window=2).apply(np.diff).value_counts().index[0]
-                freq, period = detect_freq_period(deltas[group], tss)
-                periods[group] = period
+                freq, period = detect_freq_period(deltas[group], tss, len(subset))
                 freqs[group] = freq
+                if period:
+                    periods[group] = [period]
+                else:
+                    periods[group] = [max_pacf(df, group_combinations, target, tss)[group][0]]
             else:
                 deltas[group] = 1.0
-                periods[group] = 1
+                periods[group] = [1]
                 freqs[group] = 'S'

     return deltas, periods, freqs
@@ -171,7 +176,7 @@ def _flatten_series(series: np.ndarray) -> np.ndarray:
     return series


-def detect_freq_period(deltas: pd.DataFrame, tss) -> tuple:
+def detect_freq_period(deltas: pd.DataFrame, tss, n_points) -> tuple:
     """
     Helper method that, based on the most popular interval for a time series, determines its seasonal peridiocity (sp).
     This bit of information can be crucial for good modelling with methods like ARIMA.
@@ -212,9 +217,12 @@ def detect_freq_period(deltas: pd.DataFrame, tss) -> tuple:
     }
     freq_to_period = {interval: period for (interval, period) in tss.interval_periods}
     for tag, period in (('yearly', 1), ('quarterly', 4), ('bimonthly', 6), ('monthly', 12),
-                        ('weekly', 4), ('daily', 1), ('hourly', 24), ('minute', 1), ('second', 1), ('constant', 0)):
+                        ('weekly', 52), ('daily', 7), ('hourly', 24), ('minute', 60), ('second', 60), ('constant', 0)):
         if tag not in freq_to_period.keys():
-            freq_to_period[tag] = period
+            if period <= n_points:
+                freq_to_period[tag] = period
+            else:
+                freq_to_period[tag] = None

     diffs = [(tag, abs(deltas - secs)) for tag, secs in secs_to_interval.items()]
     freq, min_diff = sorted(diffs, key=lambda x: x[1])[0]
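
The practical effect of the new `n_points` argument: a default seasonal period is kept only when the series is long enough to contain at least one full cycle; otherwise the tag maps to `None`, and `get_delta` above falls back to the PACF-based estimate. The rule in isolation:

DEFAULTS = {'yearly': 1, 'quarterly': 4, 'bimonthly': 6, 'monthly': 12,
            'weekly': 52, 'daily': 7, 'hourly': 24, 'minute': 60, 'second': 60, 'constant': 0}

def usable_periods(n_points: int) -> dict:
    # periods longer than the series itself are disabled, as in detect_freq_period
    return {tag: (p if p <= n_points else None) for tag, p in DEFAULTS.items()}

print(usable_periods(30))  # 'weekly' (52), 'minute' and 'second' (60) become None for a 30-point series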

lightwood/mixer/__init__.py (+3 -1)

@@ -5,6 +5,8 @@
 from lightwood.mixer.lightgbm import LightGBM
 from lightwood.mixer.lightgbm_array import LightGBMArray
 from lightwood.mixer.sktime import SkTime
+from lightwood.mixer.arima import ARIMAMixer
+from lightwood.mixer.ets import ETSMixer
 from lightwood.mixer.nhits import NHitsMixer
 from lightwood.mixer.prophet import ProphetMixer
 from lightwood.mixer.regression import Regression
@@ -15,4 +17,4 @@
 QClassic = None

 __all__ = ['BaseMixer', 'Neural', 'NeuralTs', 'LightGBM', 'LightGBMArray', 'Unit', 'Regression',
-           'SkTime', 'QClassic', 'ProphetMixer', 'NHitsMixer']
+           'SkTime', 'QClassic', 'ProphetMixer', 'ETSMixer', 'ARIMAMixer', 'NHitsMixer']
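
With the re-exports in place, the new mixers are importable from the package root; a quick sanity check (assuming the optional statistical-forecasting dependencies these mixers rely on are installed):

from lightwood.mixer import ARIMAMixer, ETSMixer

print(ARIMAMixer.__module__)  # lightwood.mixer.arima
print(ETSMixer.__module__)    # lightwood.mixer.ets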
