mindsdb
diff --git a/lightwood/__about__.py
+1-1 b/lightwood/__about__.py
+1-1
diff --git a/lightwood/analysis/explain.py
+1 b/lightwood/analysis/explain.py
+1
diff --git a/lightwood/analysis/helpers/conf_stats.py
+4-3 b/lightwood/analysis/helpers/conf_stats.py
+4-3
diff --git a/lightwood/analysis/nc/calibrate.py
+25-8 b/lightwood/analysis/nc/calibrate.py
+25-8
diff --git a/lightwood/api/json_ai.py
+67-33 b/lightwood/api/json_ai.py
+67-33
diff --git a/lightwood/api/types.py
+2 b/lightwood/api/types.py
+2
diff --git a/lightwood/data/cleaner.py
+3-1 b/lightwood/data/cleaner.py
+3-1
diff --git a/lightwood/data/encoded_ds.py
+2-11 b/lightwood/data/encoded_ds.py
+2-11
@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '22.6.1.2'
+__version__ = '22.7.2.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "community@mindsdb.com"
 __author__ = 'MindsDB Inc'
 
@@ -37,6 +37,7 @@ def explain(data: pd.DataFrame,
     # Setup base insights
     # ------------------------- #
     data = data.reset_index(drop=True)
+    predictions = predictions.reset_index(drop=True)
 
     row_insights = pd.DataFrame()
     global_insights = {}
 
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from typing import Dict
 from types import SimpleNamespace
 
@@ -55,10 +56,10 @@ def _get_stats(self, confs, preds, data, target, task_type='categorical'):
         mce: maximum value in `bins`.
         global_score: 1.0 minus absolute difference between accuracy and confidence over the entire validation set.
         """
-
+        confs = deepcopy(confs).reset_index(drop=True)
+        sorted_preds = deepcopy(preds).reset_index(drop=True)
+        sorted_inp = deepcopy(data).reset_index(drop=True)
         sorted_val = confs.sort_values(by='confidence', kind='stable')
-        sorted_preds = preds.reindex(sorted_val.index)
-        sorted_inp = data.reindex(sorted_val.index)
         sorted_inp['__mdb_confidence'] = sorted_val['confidence']
 
         if task_type == 'categorical':
 
@@ -90,8 +90,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
             icp = icp_class(nc, cal_size=self.validation_size)
 
             output['icp']['__default'] = icp
+            icp_df = deepcopy(ns.data)
 
             # setup prediction cache to avoid additional .predict() calls
+            pred_is_list = isinstance(ns.normal_predictions['prediction'], list) and \
+                isinstance(ns.normal_predictions['prediction'][0], list)
             if ns.is_classification:
                 if ns.predictor.supports_proba:
                     icp.nc_function.model.prediction_cache = ns.normal_predictions[all_cat_cols].values
@@ -105,7 +108,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
                     predicted_classes = pd.get_dummies(preds).values  # inflate to one-hot enc
                     icp.nc_function.model.prediction_cache = predicted_classes
 
-            elif ns.is_multi_ts:
+            elif ns.is_multi_ts or pred_is_list:
                 # we fit ICPs for time series confidence bounds only at t+1 forecast
                 icp.nc_function.model.prediction_cache = np.array([p[0] for p in ns.normal_predictions['prediction']])
             else:
@@ -116,6 +119,9 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
 
             # fit additional ICPs in time series tasks with grouped columns
             if ns.tss.is_timeseries and ns.tss.group_by:
+                # generate a multiindex
+                midx = pd.MultiIndex.from_frame(icp_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
+                icp_df.index = midx
 
                 # create an ICP for each possible group
                 group_info = ns.data[ns.tss.group_by].to_dict('list')
@@ -127,7 +133,6 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
                     output['icp'][tuple(combination)] = deepcopy(icp)
 
             # calibrate ICP
-            icp_df = deepcopy(ns.data)
             icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None))
             output['icp']['__default'].index = icp_df.columns
             output['icp']['__default'].calibrate(icp_df.values, y)
@@ -137,11 +142,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
                 icp_df, icp, ns.dtype_dict[ns.target],
                 output, positive_domain=self.positive_domain, significance=self.fixed_significance)
             if not ns.is_classification:
-                result_df = pd.DataFrame(index=ns.data.index, columns=['confidence', 'lower', 'upper'], dtype=float)
+                result_df = pd.DataFrame(index=icp_df.index, columns=['confidence', 'lower', 'upper'], dtype=float)
                 result_df.loc[icp_df.index, 'lower'] = ranges[:, 0]
                 result_df.loc[icp_df.index, 'upper'] = ranges[:, 1]
             else:
-                result_df = pd.DataFrame(index=ns.data.index, columns=['confidence'], dtype=float)
+                result_df = pd.DataFrame(index=icp_df.index, columns=['confidence'], dtype=float)
 
             result_df.loc[icp_df.index, 'confidence'] = conf
 
@@ -152,10 +157,12 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
 
                 # add all predictions to DF
                 icps_df = deepcopy(ns.data)
-                if ns.is_multi_ts:
-                    icps_df[f'__predicted_{ns.target}'] = [p[0] for p in ns.normal_predictions['prediction']]
+                midx = pd.MultiIndex.from_frame(icps_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
+                icps_df.index = midx
+                if ns.is_multi_ts or pred_is_list:
+                    icps_df[f'__predicted_{ns.target}'] = np.array([p[0] for p in ns.normal_predictions['prediction']])
                 else:
-                    icps_df[f'__predicted_{ns.target}'] = ns.normal_predictions['prediction']
+                    icps_df[f'__predicted_{ns.target}'] = np.array(ns.normal_predictions['prediction'])
 
                 for group in icps['__mdb_groups']:
                     icp_df = icps_df
@@ -207,6 +214,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
             # consolidate all groups here
             output['icp']['__mdb_active'] = True
 
+        result_df.index = ns.data.index
         output['result_df'] = result_df
 
         info = {**info, **output}
@@ -216,12 +224,21 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
                 **kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]:
         ns = SimpleNamespace(**kwargs)
 
+        if 'confidence' in ns.predictions.columns:
+            # bypass calibrator if model already outputs confidence
+            row_insights['prediction'] = ns.predictions['prediction']
+            row_insights['confidence'] = ns.predictions['confidence']
+            if 'upper' in ns.predictions.columns and 'lower' in ns.predictions.columns:
+                row_insights['upper'] = ns.predictions['upper']
+                row_insights['lower'] = ns.predictions['lower']
+            return row_insights, global_insights
+
         if ns.analysis['icp']['__mdb_active']:
             icp_X = deepcopy(ns.data)
 
             # replace observed data w/predictions
             preds = ns.predictions['prediction']
-            if ns.tss.is_timeseries and ns.tss.horizon > 1:
+            if ns.tss.is_timeseries and (ns.tss.horizon > 1 or isinstance(preds[0], list)):
                 preds = [p[0] for p in preds]
 
                 for col in [f'timestep_{i}' for i in range(1, ns.tss.horizon)]:
 
@@ -220,27 +220,46 @@ def generate_json_ai(
     ):
         is_target_predicting_encoder = True
 
+    submodels = []
     if is_target_predicting_encoder:
-        submodels = [
-            {
-                "module": "Unit",
-                "args": {
-                    "target_encoder": "$encoders[self.target]",
-                    "stop_after": "$problem_definition.seconds_per_mixer",
-                },
-            }
-        ]
+        submodels.extend(
+            [
+                {
+                    "module": "Unit",
+                    "args": {
+                        "target_encoder": "$encoders[self.target]",
+                        "stop_after": "$problem_definition.seconds_per_mixer",
+                    },
+                }
+            ]
+        )
     else:
-        submodels = [
-            {
-                "module": "Neural",
-                "args": {
-                    "fit_on_dev": True,
-                    "stop_after": "$problem_definition.seconds_per_mixer",
-                    "search_hyperparameters": True,
-                },
-            }
-        ]
+        if not tss.is_timeseries:
+            submodels.extend(
+                [
+                    {
+                        "module": "Neural",
+                        "args": {
+                            "fit_on_dev": True,
+                            "stop_after": "$problem_definition.seconds_per_mixer",
+                            "search_hyperparameters": True,
+                        },
+                    }
+                ]
+            )
+        else:
+            submodels.extend(
+                [
+                    {
+                        "module": "NeuralTs",
+                        "args": {
+                            "fit_on_dev": True,
+                            "stop_after": "$problem_definition.seconds_per_mixer",
+                            "search_hyperparameters": True,
+                        },
+                    }
+                ]
+            )
 
         if (not tss.is_timeseries or tss.horizon == 1) and dtype_dict[target] not in (dtype.num_array, dtype.cat_array):
             submodels.extend(
@@ -268,7 +287,8 @@ def generate_json_ai(
                         "args": {
                             "fit_on_dev": True,
                             "stop_after": "$problem_definition.seconds_per_mixer",
-                            "horizon": "$problem_definition.timeseries_settings.horizon",
+                            "ts_analysis": "$ts_analysis",
+                            "tss": "$problem_definition.timeseries_settings",
                         },
                     }
                 ]
@@ -494,29 +514,30 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
     is_ts = tss.is_timeseries
 
     # Add implicit arguments
-    # @TODO: Consider removing once we have a proper editor in studio
     mixers = json_ai.model['args']['submodels']
     for i in range(len(mixers)):
         if mixers[i]["module"] == "Unit":
             pass
 
-        elif mixers[i]["module"] == "Neural":
+        elif mixers[i]["module"] in ("Neural", "NeuralTs"):
             mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
                 "target_encoder", "$encoders[self.target]"
             )
             mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
             mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
                 "dtype_dict", "$dtype_dict"
             )
-            mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get(
-                "timeseries_settings", "$problem_definition.timeseries_settings"
-            )
             mixers[i]["args"]["net"] = mixers[i]["args"].get(
                 "net",
                 '"DefaultNet"'
                 if not tss.is_timeseries or not tss.use_previous_target
                 else '"ArNet"',
             )
+            if mixers[i]["module"] == "NeuralTs":
+                mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get(
+                    "timeseries_settings", "$problem_definition.timeseries_settings"
+                )
+                mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")
 
         elif mixers[i]["module"] == "LightGBM":
             mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
@@ -551,8 +572,17 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
                 "target_encoder", "$encoders[self.target]"
             )
-            if "horizon" not in mixers[i]["args"]:
-                mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
+            mixers[i]["args"]["tss"] = mixers[i]["args"].get("tss", "$problem_definition.timeseries_settings")
+            mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")
+            mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
+
+        elif mixers[i]["module"] == "NHitsMixer":
+            mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
+            mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
+            mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
+                "ts_analysis", "$ts_analysis"
+            )
+            problem_definition.fit_on_all = False  # takes too long otherwise
 
         elif mixers[i]["module"] in ("SkTime", "ProphetMixer"):
             mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
@@ -666,6 +696,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
                 "dtype_dict": "$dtype_dict",
                 "target": "$target",
                 "mode": "$mode",
+                "ts_analysis": "$ts_analysis"
             },
         },
         "timeseries_analyzer": {
@@ -807,11 +838,6 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
 
 # Time-series blocks
 {ts_transform_code}
-"""
-    if ts_analyze_code is not None:
-        clean_body += f"""
-if self.mode != 'predict':
-{align(ts_analyze_code,1)}
 """
 
     clean_body += '\nreturn data'
@@ -835,12 +861,19 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
     # Prepare features Body
     # ----------------- #
 
-    prepare_body = f"""
+    prepare_body = """
 self.mode = 'train'
 
 if self.statistical_analysis is None:
     raise Exception("Please run analyze_data first")
+"""
+    if ts_analyze_code is not None:
+        prepare_body += f"""
+if self.mode != 'predict':
+    {align(ts_analyze_code, 1)}
+"""
 
+    prepare_body += f"""
 # Column to encoder mapping
 self.encoders = {inline_dict(encoder_dict)}
 
@@ -1133,6 +1166,7 @@ def __init__(self):
 
         # Initial stats analysis
         self.statistical_analysis = None
+        self.ts_analysis = None
         self.runtime_log = dict()
 
     @timed
 
@@ -73,6 +73,7 @@ class StatisticalAnalysis:
     :param bias:
     :param avg_words_per_sentence:
     :param positive_domain:
+    :param ts_stats:
     """ # noqa
 
     nr_rows: int
@@ -87,6 +88,7 @@ class StatisticalAnalysis:
     bias: object
     avg_words_per_sentence: object
     positive_domain: bool
+    ts_stats: dict
 
 
 @dataclass_json
 
@@ -147,13 +147,15 @@ def _standardize_datetime(element: object) -> Optional[float]:
     """
     Parses an expected date-time element. Intakes an element that can in theory be anything.
     """
+    if element is None or pd.isna(element):
+        return 0.0  # correct? TODO: Remove if the TS encoder can handle `None`
     try:
         date = parse_dt(str(element))
     except Exception:
         try:
             date = datetime.datetime.utcfromtimestamp(element)
         except Exception:
-            return None
+            return 0.0
 
     return date.timestamp()
 
 
@@ -140,13 +140,15 @@ class ConcatedEncodedDs(EncodedDs):
     """
     `ConcatedEncodedDs` abstracts over multiple encoded datasources (`EncodedDs`) as if they were a single entity.
     """  # noqa
+    # TODO: We should probably delete this abstraction, it's not really useful and it adds complexity/overhead
     def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None:
         # @TODO: missing super() call here?
         self.encoded_ds_arr = encoded_ds_arr
         self.encoded_ds_lenghts = [len(x) for x in self.encoded_ds_arr]
         self.encoders = self.encoded_ds_arr[0].encoders
         self.encoder_spans = self.encoded_ds_arr[0].encoder_spans
         self.target = self.encoded_ds_arr[0].target
+        self.data_frame = pd.concat([x.data_frame for x in self.encoded_ds_arr])
 
     def __len__(self):
         """
@@ -166,17 +168,6 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
                 idx -= length
         raise StopIteration()
 
-    @property
-    def data_frame(self) -> pd.DataFrame:
-        """
-        Property that concatenates all underlying `EncodedDs`'s dataframes and returns them.
-        
-        Note: be careful to not modify a `ConcatedEncodedDs`, as you can see in the source, it will not have an effect.
-        
-        :return: Dataframe with all original data.
-        """  # noqa
-        return pd.concat([x.data_frame for x in self.encoded_ds_arr])
-
     def get_column_original_data(self, column_name: str) -> pd.Series:
         """
         See `lightwood.data.encoded_ds.EncodedDs.get_column_original_data()`.