@@ -115,26 +115,28 @@ def scatterplot(dflist, names, index_x=0):
115
115
dfa = dfa .resample ("D" ).mean ()
116
116
return dfa .hvplot .scatter (x = dfa .columns [index_x ], hover_cols = "all" )
117
117
118
-
119
118
def remove_data_for_time_windows_thresholds (
120
- df : pd .DataFrame ,
121
- time_window_exclusion_list_str ,
122
- invert_selection = False ,
123
- upper_threshold = None ,
119
+ df1 ,
120
+ df2 ,
121
+ time_window_exclusion_list_str ,
122
+ invert_selection = False ,
123
+ upper_threshold = None
124
124
):
125
- """removes data from dataframe that is within time windows in the time_window_exclusion_list
125
+ """ Remove data from all godin filtered dataframes using time window exclusion and threshold values. Time windows to remove for threshold values
126
+ should be determined using observed data only, to ensure consistency.
126
127
Args:
127
- df (DataFrame): The DataFrame from which to remove data
128
+ df1 (DataFrame): Assumed to be observed data. Thresholds are applied only to observed data.
129
+ df2 (DataFrame): Assumed to be the data we want to process. This DataFrame will be returned. To process observed data,
130
+ df1 and df2 should both be the observed time series.
128
131
time_window_exclusion_list_str (str): A string consisting of one or more time windows separated by commas, each time window
129
132
using the format 'yyyy-mm-dd_yyyy-mm-dd' Data in each of the specified time windows will be excluded from the metrics calculations
130
- invert_selection (bool): If True, keep data in the time windows rather than removing it.
133
+ invert_selection (bool): If True, keep data in the time windows rather than removing it. This is for the right hand side plot, showing excluded data.
131
134
upper_threshold (float): If specified, and if invert_selection==True, then data will be retained if value is above threshold OR
132
135
datetime is inside one of the specified timewindows.
133
136
Returns:
134
137
DataFrame: DataFrame with data removed
135
138
"""
136
- # df = df.copy()
137
- cols = df .columns
139
+ cols = df1 .columns
138
140
if upper_threshold is None :
139
141
upper_threshold = 999999
140
142
else :
@@ -143,65 +145,38 @@ def remove_data_for_time_windows_thresholds(
143
145
else :
144
146
upper_threshold = 999999
145
147
148
+ # parse time window exclusion list
146
149
time_window_exclusion_list = None
147
- if (
148
- time_window_exclusion_list_str is not None
149
- and len (time_window_exclusion_list_str .strip ()) > 0
150
- ):
150
+ if (time_window_exclusion_list_str is not None and len (time_window_exclusion_list_str .strip ()) > 0 ):
151
151
time_window_exclusion_list = time_window_exclusion_list_str .split ("," )
152
- if (
153
- time_window_exclusion_list is not None
154
- and len (time_window_exclusion_list ) > 0
155
- and df is not None
156
- ):
152
+
153
+ # set above_threshold in df1
154
+ df1 ["above_threshold" ] = False
155
+ df1 .loc [(df1 [cols [0 ]] >= upper_threshold ), "above_threshold" ] = True
156
+ # instead set df2["above_threshold"] to value matching datetime from df1
157
+ df2 ["above_threshold" ] = df1 ["above_threshold" ]
158
+
159
+ # if time windows have been specified for data exclusion (upper threshold is also handled here)
160
+ if (time_window_exclusion_list is not None and len (time_window_exclusion_list ) > 0 and df2 is not None ):
157
161
tw_index = 0
158
162
last_tw = None
159
163
164
+ cols = df2 .columns
160
165
if invert_selection :
161
166
# set all values NOT in any timewindow to nan.
162
- cols = df . columns
163
- df [ "outside_all_tw " ] = True
164
- df [ "above_threshold " ] = False
165
- df [ "keep_inverted" ] = False
167
+ df2 [ "outside_all_tw" ] = True
168
+ # df2["above_threshold "] = False
169
+ df2 [ "keep_inverted " ] = False
170
+ # set outside_all_tw to False for all rows where datetime is inside one of the timewindows
166
171
for tw in time_window_exclusion_list :
167
172
start_dt_str , end_dt_str = tw .split ("_" )
168
- df .loc [
169
- ((df .index >= start_dt_str ) & (df .index < end_dt_str )),
170
- "outside_all_tw" ,
171
- ] = False
172
- df .loc [(df [cols [0 ]] >= upper_threshold ), "above_threshold" ] = True
173
- df .loc [
174
- ((df ["outside_all_tw" ] == False ) | (df ["above_threshold" ] == True )),
175
- "keep_inverted" ,
176
- ] = True
177
- df .loc [df ["keep_inverted" ] == False , cols [0 ]] = np .nan
178
- df .drop (
179
- columns = ["outside_all_tw" , "above_threshold" , "keep_inverted" ],
180
- inplace = True ,
181
- )
182
- # df[(df.index>=pd.Timestamp(last_end_dt_str)) & (df.index<pd.Timestamp(start_dt_str)) & (df[cols[0]] < threshold_value)] = np.nan
183
- # conditions = [ (df.index >= pd.Timestamp(s)) & (df.index <= pd.Timestamp(e)) for s,e in array_of_tuples] # [(3,5), (19, 38)]
184
- # functools.reduce
185
- # c=conditions[0]
186
- # for c2 in conditions[1:]:
187
- # c = c | c2
188
- # df[c] = np.nan
189
- # date_range_list = []
190
- # start_dt_list = []
191
- # end_dt_list = []
192
- # for tw in time_window_exclusion_list:
193
- # start_dt_str, end_dt_str = tw.split('_')
194
- # # date_range_list.append(pd.date_range(start=pd.Timestamp(start_dt_str), end=pd.Timestamp(end_dt_str, freq='15T')))
195
- # start_dt_list.append(start_dt_str)
196
- # end_dt_list.append(end_dt_str)
197
- # print('*****************************************************************************************')
198
- # print('lengths of start, end date lists='+str(len(start_dt_list))+','+str(len(end_dt_list)))
199
- # print('*****************************************************************************************')
200
- # # if the timestamp is outside every time window, AND is above the threshold
201
- # df[(all((df.index < start_dt) | (df.index >= end_dt)) for start_dt, end_dt in zip(start_dt_list, end_dt_list)) & df>=threshold_value] = np.nan
202
- # # df[all(df.index not in date_range for date_range in date_range_list) & (df[cols[0]] < threshold_value)] = np.nan
203
- # # df[test_function(df, start_dt_list, end_dt_list) & df>=threshold_value] = np.nan
173
+ df2 .loc [((df2 .index >= start_dt_str ) & (df2 .index < end_dt_str )),"outside_all_tw" ,] = False
204
174
175
+ # set keep_inverted to True for all rows that are either inside one of the exclusion time windows or >= threshold value
176
+ df2 .loc [((df2 ["outside_all_tw" ] == False ) | (df2 ["above_threshold" ] == True )),"keep_inverted" ,] = True
177
+ # if keep_inverted is false, set value to nan
178
+ df2 .loc [df2 ["keep_inverted" ] == False , cols [0 ]] = np .nan
179
+ # df2.drop(columns=["outside_all_tw", "above_threshold", "keep_inverted"], inplace=True,)
205
180
for tw in time_window_exclusion_list :
206
181
if len (tw ) > 0 :
207
182
start_dt_str , end_dt_str = tw .split ("_" )
@@ -210,119 +185,22 @@ def remove_data_for_time_windows_thresholds(
210
185
# This is the old way: not good for plotting, because it becomes an ITS
211
186
# df = df[(df.index < start_dt_str) | (df.index > end_dt_str)]
212
187
# df[start_dt_str:end_dt_str] = np.nan
213
- df [
214
- (
215
- (df .index > pd .Timestamp (start_dt_str ))
216
- & (df .index <= pd .Timestamp (end_dt_str ))
217
- )
218
- | (df [cols [0 ]] >= upper_threshold )
219
- ] = np .nan
220
- # else:
221
- # # keep data in the timewindows, and remove all other data, except those that are above the threshold
222
- # if tw_index == 0:
223
- # df[(df.index<=pd.Timestamp(start_dt_str)) & (df[cols[0]]<threshold_value)] = np.nan
224
- # else:
225
- # # if in any time window
226
- # last_start_dt_str, last_end_dt_str = last_tw.split('_')
227
- # # df[last_end_dt_str:start_dt_str | df < threshold_value] = np.nan
228
- # # # if the timestamp is outside every time window, AND is above the threshold
229
- # # df[(all((df.index < start_dt) | (df.index >= end_dt)) for start_dt, end_dt in zip(start_dt_list, end_dt_list)) & df>=threshold_value] = np.nan
230
-
231
- # # df[(df.index>=pd.Timestamp(last_end_dt_str)) & (df.index<pd.Timestamp(start_dt_str)) & (df[cols[0]] < threshold_value)] = np.nan
232
- # last_tw = tw
188
+ # for the do not invert (left hand side plot) option, remove values that are in the exclusion timewindows or above threshold
189
+ df2 [((df2 .index > pd .Timestamp (start_dt_str )) & (df2 .index <= pd .Timestamp (end_dt_str ))) | (df2 [cols [0 ]] >= upper_threshold )] = np .nan
233
190
tw_index += 1
234
- # now remove the data after the end of the last timewindow
235
- # if invert_selection and last_tw is not None and len(last_tw)>0:
236
- # last_start_dt_str, last_end_dt_str = last_tw.split('_')
237
- # df[(df.index>=pd.Timestamp(last_end_dt_str)) & (df[cols[0]] < threshold_value)] = np.nan
238
- elif upper_threshold is not None :
191
+ else :
239
192
if not invert_selection :
240
- df [df >= upper_threshold ] = np .nan
193
+ df2 .loc [df2 ["above_threshold" ] == True ] = np .nan
194
+ # df[df >= upper_threshold] = np.nan
241
195
else :
242
- df [df < upper_threshold ] = np .nan
243
- return df
244
-
245
-
246
- # def remove_data_for_time_windows_thresholds(df: pd.DataFrame, time_window_exclusion_list_str, invert_selection=False, upper_threshold=None, \
247
- # lower_threshold=None):
248
- # """removes data from dataframe that is within time windows in the time_window_exclusion_list
249
- # if data masking does not remove any data (which could happen if invert_selection=True and the data masking timewindow is outside the
250
- # time window of the data set), then this will return a dataframe with only nans. Code that calls this method must be prepared to
251
- # deal with this situation.
252
- # Args:
253
- # df (DataFrame): The DataFrame from which to remove data
254
- # time_window_exclusion_list_str (str): A string consisting of one or more time windows separated by commas, each time window
255
- # using the format 'yyyy-mm-dd_yyyy-mm-dd' Data in each of the specified time windows will be excluded from the metrics calculations
256
- # invert_selection (bool): If True, keep data in the time windows rather than removing it.
257
- # upper_threshold (float): If specified, and if invert_selection==True, then data will be retained if value is above threshold OR
258
- # datetime is outside all specified timewindows.
259
- # lower_threshold (float): If specified, and if invert_selection==True, then data will be retained if value is below threshold OR
260
- # datetime is outside all specified timewindows.
261
- # Returns:
262
- # DataFrame: DataFrame with data removed
263
- # """
264
- # # df = df.copy()
265
- # cols = df.columns
266
- # if upper_threshold is None:
267
- # upper_threshold = 999999
268
- # else:
269
- # if(len(str(upper_threshold))>0):
270
- # upper_threshold = float(upper_threshold)
271
- # else:
272
- # upper_threshold = 999999
273
-
274
- # if lower_threshold is None:
275
- # lower_threshold = -999999
276
- # else:
277
- # if(len(str(lower_threshold))>0):
278
- # lower_threshold = float(lower_threshold)
279
- # else:
280
- # lower_threshold = -999999
281
-
282
- # time_window_exclusion_list = None
283
- # if time_window_exclusion_list_str is not None and len(time_window_exclusion_list_str.strip())>0:
284
- # time_window_exclusion_list = time_window_exclusion_list_str.split(',')
285
- # if (time_window_exclusion_list is not None and len(time_window_exclusion_list) > 0 and df is not None):
286
- # tw_index = 0
287
- # last_tw = None
288
-
289
- # if invert_selection:
290
- # # set all values NOT in any timewindow to nan.
291
- # cols = df.columns
292
- # df['outside_all_tw'] = True
293
- # df['above_upper_threshold'] = False
294
- # df['below_lower_threshold'] = False
295
- # df['keep_inverted'] = False
296
- # for tw in time_window_exclusion_list:
297
- # start_dt_str, end_dt_str = tw.split('_')
298
- # df.loc[((df.index>=start_dt_str) & (df.index<end_dt_str)), 'outside_all_tw'] = False
299
- # df.loc[(df[cols[0]]>=upper_threshold), 'above_lower_threshold'] = True
300
- # df.loc[(df[cols[0]]<=lower_threshold), 'below_lower_threshold'] = True
301
- # df.loc[((df['outside_all_tw']==False) | (df['above_upper_threshold']==True) | (df['below_lower_threshold']==True)), 'keep_inverted'] = True
302
- # df.loc[df['keep_inverted']==False, cols[0]] = np.nan
303
- # df.drop(columns=['outside_all_tw', 'above_upper_threshold', 'below_lower_threshold', 'keep_inverted'], inplace=True)
304
-
305
- # for tw in time_window_exclusion_list:
306
- # if len(tw)>0:
307
- # start_dt_str, end_dt_str = tw.split('_')
308
- # if not invert_selection:
309
- # # remove data in the time windows
310
- # # This is the old way: not good for plotting, because it becomes an ITS
311
- # # df = df[(df.index < start_dt_str) | (df.index > end_dt_str)]
312
- # # df[start_dt_str:end_dt_str] = np.nan
313
- # df[((df.index>pd.Timestamp(start_dt_str)) & (df.index<=pd.Timestamp(end_dt_str))) | \
314
- # (df[cols[0]]>=upper_threshold) | (df[cols[0]]<=lower_threshold)] = np.nan
315
- # tw_index += 1
316
- # else:
317
- # if not invert_selection:
318
- # df[df>=upper_threshold] = np.nan
319
- # df[df<=lower_threshold] = np.nan
320
- # else:
321
- # df[df<upper_threshold] = np.nan
322
- # df[df>lower_threshold] = np.nan
323
- # return df
324
-
325
-
196
+ df2 .loc [df2 ["above_threshold" ] == False ] = np .nan
197
+ # df[df < upper_threshold] = np.nan
198
+
199
+ if "outside_all_tw" in df2 .columns : df2 .drop (columns = ["outside_all_tw" ], inplace = True )
200
+ if "above_threshold" in df2 .columns : df2 .drop (columns = ["above_threshold" ], inplace = True )
201
+ if "keep_inverted" in df2 .columns : df2 .drop (columns = ["keep_inverted" ], inplace = True )
202
+ return df2
203
+
326
204
def calculate_metrics (dflist , names , index_x = 0 , location = None ):
327
205
"""Calculate metrics between the index_x column and other columns
328
206
@@ -1194,10 +1072,12 @@ def build_godin_plot(
1194
1072
# if p is not None else None for p in gtsp_plot_data]
1195
1073
cfs_to_cms = 0.028316847
1196
1074
gtsp_plot_data = []
1075
+ obs_data_gdf = pp [0 ].gdf
1197
1076
for p in pp :
1198
1077
if p .gdf is not None :
1199
1078
if mask_data :
1200
1079
new_p = remove_data_for_time_windows_thresholds (
1080
+ obs_data_gdf ,
1201
1081
p .gdf ,
1202
1082
time_window_exclusion_list_str = time_window_exclusion_list ,
1203
1083
invert_selection = invert_timewindow_exclusion ,
@@ -1304,10 +1184,12 @@ def build_scatter_plots(
1304
1184
# and there are no data that have been masked. For example, this could happen if only one masking time window is specified,
1305
1185
# and it's outside the time window of the data.
1306
1186
any_data_left = True
1187
+ obs_data_gdf = pp [0 ].gdf
1307
1188
1308
1189
for p in pp :
1309
1190
if mask_data :
1310
1191
gpd = remove_data_for_time_windows_thresholds (
1192
+ obs_data_gdf ,
1311
1193
p .gdf ,
1312
1194
time_window_exclusion_list ,
1313
1195
invert_selection = invert_timewindow_exclusion ,
@@ -1787,9 +1669,12 @@ def build_metrics_table(
1787
1669
# gtsp_plot_data = [p.gdf for p in pp]
1788
1670
gtsp_plot_data = []
1789
1671
gpd = None
1672
+
1673
+ obs_data_gdf = pp [0 ].gdf
1790
1674
for p in pp :
1791
1675
if mask_data :
1792
1676
gpd = remove_data_for_time_windows_thresholds (
1677
+ obs_data_gdf ,
1793
1678
p .gdf ,
1794
1679
time_window_exclusion_list ,
1795
1680
invert_selection = invert_timewindow_exclusion ,
0 commit comments