 from lightwood.api.types import TimeseriesSettings
 from lightwood.api.dtype import dtype
-from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer
+from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer, max_pacf
 from lightwood.helpers.log import log
 from lightwood.encoder.time_series.helpers.common import generate_target_group_normalizers
@@ -41,6 +41,7 @@ def timeseries_analyzer(data: Dict[str, pd.DataFrame], dtype_dict: Dict[str, str
     normalizers = generate_target_group_normalizers(data['train'], target, dtype_dict, groups, tss)

     if dtype_dict[target] in (dtype.integer, dtype.float, dtype.num_tsarray):
+        periods = max_pacf(data['train'], groups, target, tss)  # override with PACF output
         naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(data['dev'],
                                                                              target,
                                                                              tss,
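For context, `max_pacf` presumably derives candidate seasonal periods from the partial autocorrelation function of each group's target series, so the strongest lags override the default period. A minimal sketch of that idea; `candidate_periods`, `n_lags` and `top_k` are illustrative names, not lightwood's actual API:

```python
# Illustrative sketch of PACF-based period selection; the real max_pacf in
# lightwood.helpers.ts may have a different signature and behavior.
import numpy as np
from statsmodels.tsa.stattools import pacf

def candidate_periods(series: np.ndarray, n_lags: int = 24, top_k: int = 3) -> list:
    values = pacf(series, nlags=n_lags)[1:]               # partial autocorrelations for lags 1..n_lags
    strongest = np.argsort(np.abs(values))[::-1][:top_k]  # lags with the largest |PACF|
    return [int(lag) + 1 for lag in strongest]            # +1 restores the dropped lag-0 offset
```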
@@ -96,9 +97,10 @@ def get_grouped_naive_residuals(
     group_scale_factors = {}
     for group in group_combinations:
         idxs, subset = get_group_matches(info, group, tss.group_by)
-        residuals, scale_factor = get_naive_residuals(subset[target])  # @TODO: pass m once we handle seasonality
-        group_residuals[group] = residuals
-        group_scale_factors[group] = scale_factor
+        if subset.shape[0] > 1:
+            residuals, scale_factor = get_naive_residuals(subset[target])  # @TODO: pass m once we handle seasonality
+            group_residuals[group] = residuals
+            group_scale_factors[group] = scale_factor
     return group_residuals, group_scale_factors
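The new `subset.shape[0] > 1` guard matters because a one-step naive forecast needs at least two observations to yield any residual. Assuming `get_naive_residuals` scores a repeat-last-value forecast (the actual helper may differ), a sketch:

```python
# Hypothetical stand-in for get_naive_residuals: errors of a naive
# "repeat the previous value" forecast, plus a scale factor for normalization.
import numpy as np
import pandas as pd

def naive_residuals_sketch(target_data: pd.Series) -> tuple:
    values = target_data.values.astype(float)
    residuals = np.abs(values[1:] - values[:-1])  # one-step naive forecast errors
    scale_factor = float(np.mean(residuals))      # average error as the scale
    return residuals.tolist(), scale_factor
```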
@@ -119,34 +121,38 @@ def get_stls(train_df: pd.DataFrame,
              groups: list,
              tss: TimeseriesSettings
              ) -> Dict[str, object]:
-    stls = {}
+    stls = {'__default': None}
     for group in groups:
-        _, tr_subset = get_group_matches(train_df, group, tss.group_by)
-        _, dev_subset = get_group_matches(dev_df, group, tss.group_by)
-        group_freq = tr_subset['__mdb_inferred_freq'].iloc[0]
-        tr_subset = deepcopy(tr_subset)[target]
-        dev_subset = deepcopy(dev_subset)[target]
-        tr_subset.index = pd.date_range(start=tr_subset.iloc[0], freq=group_freq, periods=len(tr_subset)).to_period()
-        dev_subset.index = pd.date_range(start=dev_subset.iloc[0], freq=group_freq, periods=len(dev_subset)).to_period()
-        stl = _pick_ST(tr_subset, dev_subset, sps[group])
-        log.info(f'Best STL decomposition params for group {group} are: {stl["best_params"]}')
-        stls[group] = stl
+        if group != '__default':
+            _, tr_subset = get_group_matches(train_df, group, tss.group_by)
+            _, dev_subset = get_group_matches(dev_df, group, tss.group_by)
+            if tr_subset.shape[0] > 0 and dev_subset.shape[0] > 0 and sps.get(group, False):
+                group_freq = tr_subset['__mdb_inferred_freq'].iloc[0]
+                tr_subset = deepcopy(tr_subset)[target]
+                dev_subset = deepcopy(dev_subset)[target]
+                tr_subset.index = pd.date_range(start=tr_subset.iloc[0], freq=group_freq,
+                                                periods=len(tr_subset)).to_period()
+                dev_subset.index = pd.date_range(start=dev_subset.iloc[0], freq=group_freq,
+                                                 periods=len(dev_subset)).to_period()
+                stl = _pick_ST(tr_subset, dev_subset, sps[group])
+                log.info(f'Best STL decomposition params for group {group} are: {stl["best_params"]}')
+                stls[group] = stl
     return stls


-def _pick_ST(tr_subset: pd.Series, dev_subset: pd.Series, sp: int):
+def _pick_ST(tr_subset: pd.Series, dev_subset: pd.Series, sp: list):
     """
     Perform hyperparam search with optuna to find best combination of ST transforms for a time series.

     :param tr_subset: training series used for fitting blocks. Index should be datetime, and values are the actual time series.
     :param dev_subset: dev series used for computing loss. Index should be datetime, and values are the actual time series.
-    :param sp: seasonal period
+    :param sp: list of candidate seasonal periods
     :return: best deseasonalizer and detrender combination based on dev_loss
     """  # noqa

     def _ST_objective(trial: optuna.Trial):
         trend_degree = trial.suggest_categorical("trend_degree", [1, 2])
-        ds_sp = trial.suggest_categorical("ds_sp", [sp])  # seasonality period to use in deseasonalizer
+        ds_sp = trial.suggest_categorical("ds_sp", sp)  # seasonality period to use in deseasonalizer
         if min(min(tr_subset), min(dev_subset)) <= 0:
             decomp_type = trial.suggest_categorical("decomp_type", ['additive'])
         else:
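The `min(...) <= 0` branch exists because multiplicative decomposition divides the series by its trend and seasonal components, so it is only defined for strictly positive data; the search space therefore collapses to `['additive']` otherwise. A quick illustration with `statsmodels` (not the code under review):

```python
# Multiplicative decomposition requires strictly positive values;
# statsmodels raises a ValueError when the series contains zeros or negatives.
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

series = pd.Series(np.sin(np.linspace(0, 8 * np.pi, 48)) + 2.0)  # strictly positive
seasonal_decompose(series, model='multiplicative', period=12)    # fine
seasonal_decompose(series, model='additive', period=12)          # always fine
# seasonal_decompose(series - 2.0, model='multiplicative', period=12)  # would raise ValueError
```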
@@ -161,7 +167,8 @@ def _ST_objective(trial: optuna.Trial):
         trial.set_user_attr("transformer", transformer)
         return np.power(residuals, 2).sum()

-    study = optuna.create_study()
+    space = {"trend_degree": [1, 2], "ds_sp": sp, "decomp_type": ['additive', 'multiplicative']}
+    study = optuna.create_study(sampler=optuna.samplers.GridSampler(space))
     study.optimize(_ST_objective, n_trials=8)

     return {
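One consequence of switching to `GridSampler`: the study now enumerates the Cartesian product of the `space` values (2 trend degrees x len(sp) periods x 2 decomposition types), so `n_trials=8` exhausts the grid exactly when `sp` holds two candidates. A self-contained sketch of the pattern; the objective below is a dummy, not lightwood's:

```python
# Minimal GridSampler pattern mirroring the change above; the keys of
# 'space' must match the names used in trial.suggest_categorical.
import optuna

space = {"trend_degree": [1, 2], "ds_sp": [7, 12], "decomp_type": ["additive", "multiplicative"]}

def objective(trial: optuna.Trial) -> float:
    degree = trial.suggest_categorical("trend_degree", space["trend_degree"])
    period = trial.suggest_categorical("ds_sp", space["ds_sp"])
    decomp = trial.suggest_categorical("decomp_type", space["decomp_type"])
    return float(degree * period + (decomp == "multiplicative"))  # dummy loss

study = optuna.create_study(sampler=optuna.samplers.GridSampler(space))
study.optimize(objective, n_trials=8)  # 2 * 2 * 2 = 8 grid points, fully covered
```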