Commit 5cf15bb

Merge pull request #954 from mindsdb/staging
Release 22.7.4.0
2 parents 5496c5d + 99c3fdc commit 5cf15bb

19 files changed, +170 -66 lines

docssrc/source/index.rst (+1 -1)

@@ -116,7 +116,7 @@ To train a ``Predictor`` end-to-end, starting with unprocessed data, users can u

     # Make the train/test splits and show predictions for a few examples
     test_df = predictor.split(predictor.preprocess(df))["test"]
-    preds = predictor.predict(test).iloc[:10]
+    preds = predictor.predict(test_df).iloc[:10]
     print(preds)

 BYOM: Bring your own models

lightwood/__about__.py (+1 -1)

@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '22.7.3.0'
+__version__ = '22.7.4.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "community@mindsdb.com"
 __author__ = 'MindsDB Inc'

lightwood/analysis/helpers/acc_stats.py (+2 -2)

@@ -21,8 +21,8 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         ns = SimpleNamespace(**kwargs)

         if ns.accuracy_functions == ['evaluate_array_accuracy'] and ns.ts_analysis.get('ts_naive_mae', {}):
-            accuracy_functions = ['bounded_evaluate_array_accuracy']
-            log.info("AccStats will bound the array accuracy for reporting purposes. Check `bounded_evaluate_array_accuracy` for a description of the bounding procedure.")  # noqa
+            accuracy_functions = ['bounded_ts_accuracy']
+            log.info("AccStats will bound the array accuracy for reporting purposes. Check `bounded_ts_accuracy` for a description of the bounding procedure.")  # noqa
         else:
             accuracy_functions = ns.accuracy_functions

lightwood/analysis/nc/calibrate.py (+6 -4)

@@ -375,10 +375,12 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]

         # anomaly detection
         if is_anomaly_task:
-            anomalies = get_anomalies(row_insights,
-                                      ns.data[ns.target_name],
-                                      cooldown=ns.pred_args.anomaly_cooldown)
-            row_insights['anomaly'] = anomalies
+            row_insights['anomaly'] = None
+            if ns.target_name in ns.data.columns:
+                anomalies = get_anomalies(row_insights,
+                                          ns.data[ns.target_name],
+                                          cooldown=ns.pred_args.anomaly_cooldown)
+                row_insights['anomaly'] = anomalies

         if ns.tss.is_timeseries and ns.tss.horizon > 1:
             if is_numerical:
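
The reordering above makes `anomaly` default to `None` and computes real flags only when the target column is actually present in the incoming data, which may not be the case at inference time. A minimal sketch of the guard, with hypothetical column names:

import pandas as pd

row_insights = pd.DataFrame({'prediction': [10.2, 10.9, 35.0]})
incoming = pd.DataFrame({'timestamp': [1, 2, 3]})  # no ground-truth 'traffic' column at inference time

row_insights['anomaly'] = None  # safe default when ground truth is absent
if 'traffic' in incoming.columns:
    # the real code would call get_anomalies(row_insights, incoming['traffic'], cooldown=...) here
    pass

print(row_insights['anomaly'].tolist())  # [None, None, None]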

lightwood/api/json_ai.py (+17 -3)

@@ -303,6 +303,20 @@ def generate_json_ai(
                     "stop_after": "$problem_definition.seconds_per_mixer",
                     "horizon": "$problem_definition.timeseries_settings.horizon",
                 },
+            },
+            {
+                "module": "ETSMixer",
+                "args": {
+                    "stop_after": "$problem_definition.seconds_per_mixer",
+                    "horizon": "$problem_definition.timeseries_settings.horizon",
+                },
+            },
+            {
+                "module": "ARIMAMixer",
+                "args": {
+                    "stop_after": "$problem_definition.seconds_per_mixer",
+                    "horizon": "$problem_definition.timeseries_settings.horizon",
+                },
             }
         ]
     )
@@ -361,7 +375,7 @@ def generate_json_ai(
     elif output_dtype in [dtype.categorical, dtype.tags, dtype.binary]:
         accuracy_functions = ["balanced_accuracy_score"]
     elif output_dtype in (dtype.num_array, dtype.num_tsarray):
-        accuracy_functions = ["evaluate_num_array_accuracy"]
+        accuracy_functions = ["bounded_ts_accuracy"]
     elif output_dtype in (dtype.cat_array, dtype.cat_tsarray):
         accuracy_functions = ["evaluate_cat_array_accuracy"]
     else:
@@ -371,7 +385,7 @@ def generate_json_ai(

     if is_ts:
         if output_dtype in [dtype.integer, dtype.float]:
-            accuracy_functions = ["evaluate_num_array_accuracy"]  # forces this acc fn for t+1 time series forecasters
+            accuracy_functions = ["bounded_ts_accuracy"]  # forces this acc fn for t+1 time series forecasters  # noqa

     if output_dtype in (dtype.integer, dtype.float, dtype.num_tsarray):
         imputers.append({"module": "NumericalImputer",
@@ -585,7 +599,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             )
             problem_definition.fit_on_all = False  # takes too long otherwise

-        elif mixers[i]["module"] in ("SkTime", "ProphetMixer"):
+        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer"):
             mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
             mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
                 "dtype_dict", "$dtype_dict"

lightwood/data/statistical_analysis.py (+5 -2)

@@ -90,7 +90,10 @@ def statistical_analysis(data: pd.DataFrame,
     missing = {}
     distinct = {}
     for col in columns:
-        missing[col] = len([x for x in df[col] if x is None]) / len(df[col]) if len(df[col]) else 0
+        missing[col] = {
+            'missing': len([x for x in df[col] if x is None]) / len(df[col]) if len(df[col]) else 0,
+            'description': 'Proportion of missing values for the column. Columns with high % of missing values may not be as useful for modelling purposes.'  # noqa
+        }
         distinct[col] = len(set([str(x) for x in df[col]])) / len(df[col]) if len(df[col]) else 0

     nr_rows = len(df)
@@ -157,7 +160,7 @@ def statistical_analysis(data: pd.DataFrame,
         S, biased_buckets = compute_entropy_biased_buckets(histograms[col])
         bias[col] = {
             'entropy': S,
-            'description': """Under the assumption of uniformly distributed data (i.e., same probability for Head or Tails on a coin flip) mindsdb tries to detect potential divergences from such case, and it calls this "potential bias". Thus by our data having any potential bias mindsdb means any divergence from all categories having the same probability of being selected.""",  # noqa
+            'description': """"Potential bias" is flagged when data does not distribute normally or uniformly, likely over-representing or under-representing some values. This may be normal, hence bias is only "potential".""",  # noqa
             'biased_buckets': biased_buckets
         }
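
Note that `missing` changes shape here: it used to map each column to a bare float, and now maps to a small dict carrying the ratio plus a human-readable description. A self-contained sketch of the updated access pattern for downstream consumers (column names hypothetical):

# hypothetical output mirroring the new structure
missing = {
    'rainfall': {'missing': 0.62, 'description': 'Proportion of missing values for the column. ...'},
    'temperature': {'missing': 0.01, 'description': 'Proportion of missing values for the column. ...'},
}

# before this commit: ratio = missing[col] (a bare float)
for col, info in missing.items():
    if info['missing'] > 0.5:  # the ratio now lives under the 'missing' key
        print(f"{col}: {info['missing']:.0%} missing")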

lightwood/data/timeseries_analyzer.py (+7 -8)

@@ -10,7 +10,7 @@

 from lightwood.api.types import TimeseriesSettings
 from lightwood.api.dtype import dtype
-from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer, max_pacf
+from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer
 from lightwood.helpers.log import log
 from lightwood.encoder.time_series.helpers.common import generate_target_group_normalizers

@@ -36,16 +36,12 @@ def timeseries_analyzer(data: Dict[str, pd.DataFrame], dtype_dict: Dict[str, str
     """  # noqa
     tss = timeseries_settings
     groups = get_ts_groups(data['train'], tss)
-    deltas, periods, freqs = get_delta(data['train'], dtype_dict, groups, tss)
+    deltas, periods, freqs = get_delta(data['train'], dtype_dict, groups, target, tss)

     normalizers = generate_target_group_normalizers(data['train'], target, dtype_dict, groups, tss)

     if dtype_dict[target] in (dtype.integer, dtype.float, dtype.num_tsarray):
-        periods = max_pacf(data['train'], groups, target, tss)  # override with PACF output
-        naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(data['dev'],
-                                                                             target,
-                                                                             tss,
-                                                                             groups)
+        naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(data['dev'], target, tss, groups)
         differencers = get_differencers(data['train'], target, groups, tss.group_by)
         stl_transforms = get_stls(data['train'], data['dev'], target, periods, groups, tss)
     else:
@@ -71,6 +67,9 @@ def get_naive_residuals(target_data: pd.DataFrame, m: int = 1) -> Tuple[List, fl
     Computes forecasting residuals for the naive method (forecasts for time `t` is the value observed at `t-1`).
     Useful for computing MASE forecasting error.

+    As per arxiv.org/abs/2203.10716, we resort to a constant forecast based on the last-seen measurement across the entire horizon.
+    By following the original measure, the naive forecaster would have the advantage of knowing the actual values whereas the predictor would not.
+
     Note: method assumes predictions are all for the same group combination. For a dataframe that contains multiple
     series, use `get_grouped_naive_resiudals`.

@@ -80,7 +79,7 @@ def get_naive_residuals(target_data: pd.DataFrame, m: int = 1) -> Tuple[List, fl
     :return: (list of naive residuals, average residual value)
     """  # noqa
     # @TODO: support categorical series as well
-    residuals = target_data.rolling(window=m + 1).apply(lambda x: abs(x.iloc[m] - x.iloc[0]))[m:].values.flatten()
+    residuals = np.abs(target_data.values[1:] - target_data.values[0]).flatten()
     scale_factor = np.average(residuals)
     return residuals.tolist(), scale_factor
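
To make the new residual definition concrete: rather than a rolling one-step-behind comparison, every observation after the first is compared against a single constant forecast (the first value of the evaluated window), in line with the constant-forecast baseline cited above. A standalone arithmetic sketch:

import numpy as np
import pandas as pd

target_data = pd.Series([10.0, 12.0, 9.0, 15.0])
residuals = np.abs(target_data.values[1:] - target_data.values[0]).flatten()
print(residuals.tolist())            # [2.0, 1.0, 5.0]
print(float(np.average(residuals)))  # 2.666..., used as the MASE scale factor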

lightwood/data/timeseries_transform.py (+8 -5)

@@ -51,7 +51,7 @@ def transform_timeseries(
     oby_col = tss.order_by
     groups = get_ts_groups(data, tss)
     if not ts_analysis:
-        _, periods, freqs = get_delta(data, dtype_dict, groups, tss)
+        _, periods, freqs = get_delta(data, dtype_dict, groups, target, tss)
     else:
         periods = ts_analysis['periods']
         freqs = ts_analysis['sample_freqs']
@@ -60,13 +60,16 @@ def transform_timeseries(
     subsets = []
     for group in groups:
         if (tss.group_by and group != '__default') or not tss.group_by:
-            if periods[group] == 0:
-                raise Exception(
-                    f"Partition is not valid, faulty group {group}. Please make sure you group by a set of columns that ensures unique measurements for each grouping through time.")  # noqa
             idxs, subset = get_group_matches(data, group, tss.group_by)
             if subset.shape[0] > 0:
+                if periods.get(group, periods['__default']) == 0 and subset.shape[0] > 1:
+                    raise Exception(
+                        f"Partition is not valid, faulty group {group}. Please make sure you group by a set of columns that ensures unique measurements for each grouping through time.")  # noqa
+
                 index = pd.to_datetime(subset[oby_col], unit='s')
-                subset.index = pd.date_range(start=index.iloc[0], freq=freqs[group], periods=len(subset))
+                subset.index = pd.date_range(start=index.iloc[0],
+                                             freq=freqs.get(group, freqs['__default']),
+                                             periods=len(subset))
                 subset['__mdb_inferred_freq'] = subset.index.freq  # sets constant column because pd.concat forgets freq (see: https://github.com/pandas-dev/pandas/issues/3232)  # noqa
                 subsets.append(subset)
     original_df = pd.concat(subsets).sort_values(by='__mdb_original_index')
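
The `.get(group, ...)` fallbacks mean a group combination first seen at inference time inherits the `__default` period and frequency instead of raising a `KeyError`. The same pattern in isolation (group keys are hypothetical):

import pandas as pd

freqs = {'__default': 'D', ('store_1',): 'H'}  # per-group inferred frequencies
group = ('store_2',)                           # group absent from training data
idx = pd.date_range(start=pd.Timestamp('2022-01-01'),
                    freq=freqs.get(group, freqs['__default']),
                    periods=3)
print(list(idx))  # falls back to daily: Jan 1, 2 and 3 of 2022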

lightwood/helpers/__init__.py (+2 -2)

@@ -2,7 +2,7 @@
 from lightwood.helpers.device import is_cuda_compatible, get_devices
 from lightwood.helpers.general import mase, is_none, evaluate_accuracy, evaluate_num_array_accuracy,\
     evaluate_array_accuracy, evaluate_multilabel_accuracy, evaluate_regression_accuracy, evaluate_cat_array_accuracy, \
-    bounded_evaluate_array_accuracy
+    bounded_ts_accuracy
 from lightwood.helpers.ts import get_group_matches, get_ts_groups, get_inferred_timestamps, add_tn_num_conf_bounds, \
     add_tn_cat_conf_bounds
 from lightwood.helpers.io import read_from_path_or_url
@@ -17,7 +17,7 @@

 __all__ = ['to_binary', 'f1_score', 'recall_score', 'precision_score', 'r2_score', 'is_cuda_compatible', 'get_devices',
            'get_group_matches', 'get_ts_groups', 'mase', 'is_none', 'evaluate_accuracy', 'evaluate_num_array_accuracy',
-           'evaluate_array_accuracy', 'evaluate_cat_array_accuracy', 'bounded_evaluate_array_accuracy',
+           'evaluate_array_accuracy', 'evaluate_cat_array_accuracy', 'bounded_ts_accuracy',
            'evaluate_multilabel_accuracy', 'evaluate_regression_accuracy', 'read_from_path_or_url', 'get_nr_procs',
            'mut_method_call', 'run_mut_method', 'tokenize_text', 'analyze_sentences', 'decontracted', 'contains_alnum',
            'get_identifier_description', 'get_identifier_description_mp', 'get_pct_auto_increment',

lightwood/helpers/general.py (+13 -9)

@@ -1,4 +1,5 @@
 import importlib
+from copy import deepcopy
 from typing import List, Dict, Optional
 import numpy as np
 import pandas as pd
@@ -28,7 +29,7 @@ def evaluate_accuracy(data: pd.DataFrame,
     score_dict = {}

     for accuracy_function_str in accuracy_functions:
-        if 'array_accuracy' in accuracy_function_str:
+        if 'array_accuracy' in accuracy_function_str or accuracy_function_str in ('bounded_ts_accuracy', ):
             if ts_analysis is None or not ts_analysis['tss'].is_timeseries:
                 # normal array, needs to be expanded
                 cols = [target]
@@ -47,7 +48,7 @@
             elif accuracy_function_str == 'evaluate_cat_array_accuracy':
                 acc_fn = evaluate_cat_array_accuracy
             else:
-                acc_fn = bounded_evaluate_array_accuracy
+                acc_fn = bounded_ts_accuracy
             score_dict[accuracy_function_str] = acc_fn(true_values,
                                                        predictions,
                                                        data=data[cols],
@@ -204,7 +205,7 @@ def evaluate_cat_array_accuracy(
         base_acc_fn=balanced_accuracy_score)


-def bounded_evaluate_array_accuracy(
+def bounded_ts_accuracy(
     true_values: pd.Series,
     predictions: pd.Series,
     **kwargs
@@ -216,15 +217,18 @@ def bounded_evaluate_array_accuracy(
     For worse-than-naive, it scales linearly (with a factor).
     For better-than-naive, we fix 10 as 0.99, and scaled-logarithms (with 10 and 1e4 cutoffs as respective bases) are used to squash all remaining preimages to values between 0.5 and 1.0.
     """  # noqa
-    result = evaluate_array_accuracy(np.array(true_values),
-                                     np.array(predictions),
-                                     **kwargs)
-    if 10 < result <= 1e4:
+    true_values = deepcopy(true_values)
+    predictions = deepcopy(predictions)
+    result = evaluate_num_array_accuracy(true_values,
+                                         predictions,
+                                         **kwargs)
+    sp = 5
+    if sp < result <= 1e4:
         step_base = 0.99
         return step_base + (np.log(result) / np.log(1e4)) * (1 - step_base)
-    elif 1 <= result <= 10:
+    elif 1 <= result <= sp:
         step_base = 0.5
-        return step_base + (np.log(result) / np.log(10)) * (0.99 - step_base)
+        return step_base + (np.log(result) / np.log(sp)) * (0.99 - step_base)
     else:
         return result / 2  # worse than naive
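
To see what the renamed `bounded_ts_accuracy` does to a raw score, here is the piecewise squashing in isolation; note the turning point is now `sp = 5`, even though the docstring above still says 10. Constants are copied from the hunk, and the raw `result` is supplied directly instead of coming from `evaluate_num_array_accuracy`:

import numpy as np

def bound(result: float, sp: float = 5, cap: float = 1e4) -> float:
    # mirrors the mapping in bounded_ts_accuracy
    if sp < result <= cap:
        return 0.99 + (np.log(result) / np.log(cap)) * (1 - 0.99)
    elif 1 <= result <= sp:
        return 0.5 + (np.log(result) / np.log(sp)) * (0.99 - 0.5)
    return result / 2  # worse than naive

for r in (0.5, 1.0, 5.0, 100.0, 1e4):
    print(r, round(bound(r), 4))
# 0.5 -> 0.25, 1.0 -> 0.5, 5.0 -> 0.99, 100.0 -> 0.995, 10000.0 -> 1.0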

lightwood/helpers/ts.py (+16 -8)

@@ -42,13 +42,15 @@
     df: pd.DataFrame,
     dtype_dict: dict,
     group_combinations: list,
+    target: str,
     tss
 ) -> Tuple[Dict, Dict, Dict]:
     """
     Infer the sampling interval of each time series, by picking the most popular time interval observed in the training data.

     :param df: Dataframe with time series data.
     :param group_combinations: all tuples with distinct values for `TimeseriesSettings.group_by` columns, defining all available time series.
+    :param target: name of target column
     :param tss: timeseries settings

     :return:
@@ -58,8 +60,8 @@
     original_col = f'__mdb_original_{tss.order_by}'
     order_col = original_col if original_col in df.columns else tss.order_by
     deltas = {"__default": df[order_col].astype(float).rolling(window=2).apply(np.diff).value_counts().index[0]}
-    freq, period = detect_freq_period(deltas["__default"], tss)
-    periods = {"__default": period}
+    freq, period = detect_freq_period(deltas["__default"], tss, len(df))
+    periods = {"__default": [period]}
     freqs = {"__default": freq}

     if tss.group_by:
@@ -68,12 +70,15 @@
             _, subset = get_group_matches(df, group, tss.group_by)
             if subset.shape[0] > 1:
                 deltas[group] = subset[order_col].rolling(window=2).apply(np.diff).value_counts().index[0]
-                freq, period = detect_freq_period(deltas[group], tss)
-                periods[group] = period
+                freq, period = detect_freq_period(deltas[group], tss, len(subset))
                 freqs[group] = freq
+                if period:
+                    periods[group] = [period]
+                else:
+                    periods[group] = [max_pacf(df, group_combinations, target, tss)[group][0]]
             else:
                 deltas[group] = 1.0
-                periods[group] = 1
+                periods[group] = [1]
                 freqs[group] = 'S'

     return deltas, periods, freqs
@@ -171,7 +176,7 @@ def _flatten_series(series: np.ndarray) -> np.ndarray:
     return series


-def detect_freq_period(deltas: pd.DataFrame, tss) -> tuple:
+def detect_freq_period(deltas: pd.DataFrame, tss, n_points) -> tuple:
     """
     Helper method that, based on the most popular interval for a time series, determines its seasonal peridiocity (sp).
     This bit of information can be crucial for good modelling with methods like ARIMA.
@@ -212,9 +217,12 @@ def detect_freq_period(deltas: pd.DataFrame, tss) -> tuple:
     }
     freq_to_period = {interval: period for (interval, period) in tss.interval_periods}
     for tag, period in (('yearly', 1), ('quarterly', 4), ('bimonthly', 6), ('monthly', 12),
-                        ('weekly', 4), ('daily', 1), ('hourly', 24), ('minute', 1), ('second', 1), ('constant', 0)):
+                        ('weekly', 52), ('daily', 7), ('hourly', 24), ('minute', 60), ('second', 60), ('constant', 0)):
         if tag not in freq_to_period.keys():
-            freq_to_period[tag] = period
+            if period <= n_points:
+                freq_to_period[tag] = period
+            else:
+                freq_to_period[tag] = None

     diffs = [(tag, abs(deltas - secs)) for tag, secs in secs_to_interval.items()]
     freq, min_diff = sorted(diffs, key=lambda x: x[1])[0]
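
The practical effect of the new `n_points` argument: a default seasonal period is kept only when the series is long enough to contain at least one full cycle; otherwise the tag maps to `None`, and `get_delta` above falls back to the PACF-based estimate. The rule in isolation:

DEFAULTS = {'yearly': 1, 'quarterly': 4, 'bimonthly': 6, 'monthly': 12,
            'weekly': 52, 'daily': 7, 'hourly': 24, 'minute': 60, 'second': 60, 'constant': 0}

def usable_periods(n_points: int) -> dict:
    # periods longer than the series itself are disabled, as in detect_freq_period
    return {tag: (p if p <= n_points else None) for tag, p in DEFAULTS.items()}

print(usable_periods(30))  # 'weekly' (52), 'minute' and 'second' (60) become None for a 30-point series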

lightwood/mixer/__init__.py (+3 -1)

@@ -5,6 +5,8 @@
 from lightwood.mixer.lightgbm import LightGBM
 from lightwood.mixer.lightgbm_array import LightGBMArray
 from lightwood.mixer.sktime import SkTime
+from lightwood.mixer.arima import ARIMAMixer
+from lightwood.mixer.ets import ETSMixer
 from lightwood.mixer.nhits import NHitsMixer
 from lightwood.mixer.prophet import ProphetMixer
 from lightwood.mixer.regression import Regression
@@ -15,4 +17,4 @@
 QClassic = None

 __all__ = ['BaseMixer', 'Neural', 'NeuralTs', 'LightGBM', 'LightGBMArray', 'Unit', 'Regression',
-           'SkTime', 'QClassic', 'ProphetMixer', 'NHitsMixer']
+           'SkTime', 'QClassic', 'ProphetMixer', 'ETSMixer', 'ARIMAMixer', 'NHitsMixer']
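
With the re-exports in place, the new mixers are importable from the package root; a quick sanity check (assuming the optional statistical-forecasting dependencies these mixers rely on are installed):

from lightwood.mixer import ARIMAMixer, ETSMixer

print(ARIMAMixer.__module__)  # lightwood.mixer.arima
print(ETSMixer.__module__)    # lightwood.mixer.ets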
