Skip to content

Commit 0267912

Browse files
committed
Fixed bug in threshold data exclusion: now use only obs data to filter model output
1 parent 36354df commit 0267912

File tree

1 file changed

+54
-169
lines changed

1 file changed

+54
-169
lines changed

pydelmod/calibplot.py

Lines changed: 54 additions & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -115,26 +115,28 @@ def scatterplot(dflist, names, index_x=0):
115115
dfa = dfa.resample("D").mean()
116116
return dfa.hvplot.scatter(x=dfa.columns[index_x], hover_cols="all")
117117

118-
119118
def remove_data_for_time_windows_thresholds(
120-
df: pd.DataFrame,
121-
time_window_exclusion_list_str,
122-
invert_selection=False,
123-
upper_threshold=None,
119+
df1,
120+
df2,
121+
time_window_exclusion_list_str,
122+
invert_selection=False,
123+
upper_threshold=None
124124
):
125-
"""removes data from dataframe that is within time windows in the time_window_exclusion_list
125+
""" Remove data from all godin filtered dataframes using time window exclusion and threshold values. Time windows to remove for threshold values
126+
should be determined using observed data only, to ensure consistency.
126127
Args:
127-
df (DataFrame): The DataFrame from which to remove data
128+
df1 (DataFrame): Assumed to be observed data. Thresholds are applied only to observed data.
129+
df2 (DataFrame): Assumed to be the data we want to process. This DataFrame will be returned. To process observed data,
130+
df1 and df2 should both be the observed time series.
128131
time_window_exclusion_list_str (str): A string consisting of one or more time windows separated by commas, each time window
129132
using the format 'yyyy-mm-dd_yyyy-mm-dd' Data in each of the specified time windows will be excluded from the metrics calculations
130-
invert_selection (bool): If True, keep data in the time windows rather than removing it.
133+
invert_selection (bool): If True, keep data in the time windows rather than removing it. This is for the right hand side plot, showing excluded data.
131134
upper_threshold (float): If specified, and if invert_selection==True, then data will be retained if value is above threshold OR
132135
datetime is inside one of the specified time windows.
133136
Returns:
134137
DataFrame: DataFrame with data removed
135138
"""
136-
# df = df.copy()
137-
cols = df.columns
139+
cols = df1.columns
138140
if upper_threshold is None:
139141
upper_threshold = 999999
140142
else:
@@ -143,65 +145,38 @@ def remove_data_for_time_windows_thresholds(
143145
else:
144146
upper_threshold = 999999
145147

148+
# parse time window exclusion list
146149
time_window_exclusion_list = None
147-
if (
148-
time_window_exclusion_list_str is not None
149-
and len(time_window_exclusion_list_str.strip()) > 0
150-
):
150+
if (time_window_exclusion_list_str is not None and len(time_window_exclusion_list_str.strip()) > 0):
151151
time_window_exclusion_list = time_window_exclusion_list_str.split(",")
152-
if (
153-
time_window_exclusion_list is not None
154-
and len(time_window_exclusion_list) > 0
155-
and df is not None
156-
):
152+
153+
# set above_threshold in df1
154+
df1["above_threshold"] = False
155+
df1.loc[(df1[cols[0]] >= upper_threshold), "above_threshold"] = True
156+
# copy the above_threshold flag computed from df1 into df2, matched on the datetime index
157+
df2["above_threshold"] = df1["above_threshold"]
158+
159+
# if time windows have been specified for data exclusion (upper threshold is also handled here)
160+
if (time_window_exclusion_list is not None and len(time_window_exclusion_list) > 0 and df2 is not None):
157161
tw_index = 0
158162
last_tw = None
159163

164+
cols = df2.columns
160165
if invert_selection:
161166
# set all values NOT in any timewindow to nan.
162-
cols = df.columns
163-
df["outside_all_tw"] = True
164-
df["above_threshold"] = False
165-
df["keep_inverted"] = False
167+
df2["outside_all_tw"] = True
168+
# df2["above_threshold"] = False
169+
df2["keep_inverted"] = False
170+
# set outside_all_tw to False for all rows where datetime is inside one of the timewindows
166171
for tw in time_window_exclusion_list:
167172
start_dt_str, end_dt_str = tw.split("_")
168-
df.loc[
169-
((df.index >= start_dt_str) & (df.index < end_dt_str)),
170-
"outside_all_tw",
171-
] = False
172-
df.loc[(df[cols[0]] >= upper_threshold), "above_threshold"] = True
173-
df.loc[
174-
((df["outside_all_tw"] == False) | (df["above_threshold"] == True)),
175-
"keep_inverted",
176-
] = True
177-
df.loc[df["keep_inverted"] == False, cols[0]] = np.nan
178-
df.drop(
179-
columns=["outside_all_tw", "above_threshold", "keep_inverted"],
180-
inplace=True,
181-
)
182-
# df[(df.index>=pd.Timestamp(last_end_dt_str)) & (df.index<pd.Timestamp(start_dt_str)) & (df[cols[0]] < threshold_value)] = np.nan
183-
# conditions = [ (df.index >= pd.Timestamp(s)) & (df.index <= pd.Timestamp(e)) for s,e in array_of_tuples] # [(3,5), (19, 38)]
184-
# functools.reduce
185-
# c=conditions[0]
186-
# for c2 in conditions[1:]:
187-
# c = c | c2
188-
# df[c] = np.nan
189-
# date_range_list = []
190-
# start_dt_list = []
191-
# end_dt_list = []
192-
# for tw in time_window_exclusion_list:
193-
# start_dt_str, end_dt_str = tw.split('_')
194-
# # date_range_list.append(pd.date_range(start=pd.Timestamp(start_dt_str), end=pd.Timestamp(end_dt_str, freq='15T')))
195-
# start_dt_list.append(start_dt_str)
196-
# end_dt_list.append(end_dt_str)
197-
# print('*****************************************************************************************')
198-
# print('lengths of start, end date lists='+str(len(start_dt_list))+','+str(len(end_dt_list)))
199-
# print('*****************************************************************************************')
200-
# # if the timestamp is outside every time window, AND is above the threshold
201-
# df[(all((df.index < start_dt) | (df.index >= end_dt)) for start_dt, end_dt in zip(start_dt_list, end_dt_list)) & df>=threshold_value] = np.nan
202-
# # df[all(df.index not in date_range for date_range in date_range_list) & (df[cols[0]] < threshold_value)] = np.nan
203-
# # df[test_function(df, start_dt_list, end_dt_list) & df>=threshold_value] = np.nan
173+
df2.loc[((df2.index >= start_dt_str) & (df2.index < end_dt_str)),"outside_all_tw",] = False
204174

175+
# set keep_inverted to True for all rows that are either inside one of the exclusion time windows or >= threshold value
176+
df2.loc[((df2["outside_all_tw"] == False) | (df2["above_threshold"] == True)),"keep_inverted",] = True
177+
# if keep_inverted is false, set value to nan
178+
df2.loc[df2["keep_inverted"] == False, cols[0]] = np.nan
179+
# df2.drop(columns=["outside_all_tw", "above_threshold", "keep_inverted"], inplace=True,)
205180
for tw in time_window_exclusion_list:
206181
if len(tw) > 0:
207182
start_dt_str, end_dt_str = tw.split("_")
@@ -210,119 +185,22 @@ def remove_data_for_time_windows_thresholds(
210185
# This is the old way: not good for plotting, because it becomes an ITS
211186
# df = df[(df.index < start_dt_str) | (df.index > end_dt_str)]
212187
# df[start_dt_str:end_dt_str] = np.nan
213-
df[
214-
(
215-
(df.index > pd.Timestamp(start_dt_str))
216-
& (df.index <= pd.Timestamp(end_dt_str))
217-
)
218-
| (df[cols[0]] >= upper_threshold)
219-
] = np.nan
220-
# else:
221-
# # keep data in the timewindows, and remove all other data, except those that are above the threshold
222-
# if tw_index == 0:
223-
# df[(df.index<=pd.Timestamp(start_dt_str)) & (df[cols[0]]<threshold_value)] = np.nan
224-
# else:
225-
# # if in any time window
226-
# last_start_dt_str, last_end_dt_str = last_tw.split('_')
227-
# # df[last_end_dt_str:start_dt_str | df < threshold_value] = np.nan
228-
# # # if the timestamp is outside every time window, AND is above the threshold
229-
# # df[(all((df.index < start_dt) | (df.index >= end_dt)) for start_dt, end_dt in zip(start_dt_list, end_dt_list)) & df>=threshold_value] = np.nan
230-
231-
# # df[(df.index>=pd.Timestamp(last_end_dt_str)) & (df.index<pd.Timestamp(start_dt_str)) & (df[cols[0]] < threshold_value)] = np.nan
232-
# last_tw = tw
188+
# for the do not invert (left hand side plot) option, remove values that are in the exclusion timewindows or above threshold
189+
df2[((df2.index > pd.Timestamp(start_dt_str)) & (df2.index <= pd.Timestamp(end_dt_str))) | (df2[cols[0]] >= upper_threshold)] = np.nan
233190
tw_index += 1
234-
# now remove the data after the end of the last timewindow
235-
# if invert_selection and last_tw is not None and len(last_tw)>0:
236-
# last_start_dt_str, last_end_dt_str = last_tw.split('_')
237-
# df[(df.index>=pd.Timestamp(last_end_dt_str)) & (df[cols[0]] < threshold_value)] = np.nan
238-
elif upper_threshold is not None:
191+
else:
239192
if not invert_selection:
240-
df[df >= upper_threshold] = np.nan
193+
df2.loc[df2["above_threshold"] == True] = np.nan
194+
# df[df >= upper_threshold] = np.nan
241195
else:
242-
df[df < upper_threshold] = np.nan
243-
return df
244-
245-
246-
# def remove_data_for_time_windows_thresholds(df: pd.DataFrame, time_window_exclusion_list_str, invert_selection=False, upper_threshold=None, \
247-
# lower_threshold=None):
248-
# """removes data from dataframe that is within time windows in the time_window_exclusion_list
249-
# if data masking does not remove any data (which could happen if invert_selection=True and the data masking timewindow is outside the
250-
# time window of the data set), then this will return a dataframe with only nans. Code that calls this method must be prepared to
251-
# deal with this situation.
252-
# Args:
253-
# df (DataFrame): The DataFrame from which to remove data
254-
# time_window_exclusion_list_str (str): A string consisting of one or more time windows separated by commas, each time window
255-
# using the format 'yyyy-mm-dd_yyyy-mm-dd' Data in each of the specified time windows will be excluded from the metrics calculations
256-
# invert_selection (bool): If True, keep data in the time windows rather than removing it.
257-
# upper_threshold (float): If specified, and if invert_selection==True, then data will be retained if value is above threshold OR
258-
# datetime is outside all specified timewindows.
259-
# lower_threshold (float): If specified, and if invert_selection==True, then data will be retained if value is below threshold OR
260-
# datetime is outside all specified timewindows.
261-
# Returns:
262-
# DataFrame: DataFrame with data removed
263-
# """
264-
# # df = df.copy()
265-
# cols = df.columns
266-
# if upper_threshold is None:
267-
# upper_threshold = 999999
268-
# else:
269-
# if(len(str(upper_threshold))>0):
270-
# upper_threshold = float(upper_threshold)
271-
# else:
272-
# upper_threshold = 999999
273-
274-
# if lower_threshold is None:
275-
# lower_threshold = -999999
276-
# else:
277-
# if(len(str(lower_threshold))>0):
278-
# lower_threshold = float(lower_threshold)
279-
# else:
280-
# lower_threshold = -999999
281-
282-
# time_window_exclusion_list = None
283-
# if time_window_exclusion_list_str is not None and len(time_window_exclusion_list_str.strip())>0:
284-
# time_window_exclusion_list = time_window_exclusion_list_str.split(',')
285-
# if (time_window_exclusion_list is not None and len(time_window_exclusion_list) > 0 and df is not None):
286-
# tw_index = 0
287-
# last_tw = None
288-
289-
# if invert_selection:
290-
# # set all values NOT in any timewindow to nan.
291-
# cols = df.columns
292-
# df['outside_all_tw'] = True
293-
# df['above_upper_threshold'] = False
294-
# df['below_lower_threshold'] = False
295-
# df['keep_inverted'] = False
296-
# for tw in time_window_exclusion_list:
297-
# start_dt_str, end_dt_str = tw.split('_')
298-
# df.loc[((df.index>=start_dt_str) & (df.index<end_dt_str)), 'outside_all_tw'] = False
299-
# df.loc[(df[cols[0]]>=upper_threshold), 'above_lower_threshold'] = True
300-
# df.loc[(df[cols[0]]<=lower_threshold), 'below_lower_threshold'] = True
301-
# df.loc[((df['outside_all_tw']==False) | (df['above_upper_threshold']==True) | (df['below_lower_threshold']==True)), 'keep_inverted'] = True
302-
# df.loc[df['keep_inverted']==False, cols[0]] = np.nan
303-
# df.drop(columns=['outside_all_tw', 'above_upper_threshold', 'below_lower_threshold', 'keep_inverted'], inplace=True)
304-
305-
# for tw in time_window_exclusion_list:
306-
# if len(tw)>0:
307-
# start_dt_str, end_dt_str = tw.split('_')
308-
# if not invert_selection:
309-
# # remove data in the time windows
310-
# # This is the old way: not good for plotting, because it becomes an ITS
311-
# # df = df[(df.index < start_dt_str) | (df.index > end_dt_str)]
312-
# # df[start_dt_str:end_dt_str] = np.nan
313-
# df[((df.index>pd.Timestamp(start_dt_str)) & (df.index<=pd.Timestamp(end_dt_str))) | \
314-
# (df[cols[0]]>=upper_threshold) | (df[cols[0]]<=lower_threshold)] = np.nan
315-
# tw_index += 1
316-
# else:
317-
# if not invert_selection:
318-
# df[df>=upper_threshold] = np.nan
319-
# df[df<=lower_threshold] = np.nan
320-
# else:
321-
# df[df<upper_threshold] = np.nan
322-
# df[df>lower_threshold] = np.nan
323-
# return df
324-
325-
196+
df2.loc[df2["above_threshold"] == False] = np.nan
197+
# df[df < upper_threshold] = np.nan
198+
199+
if "outside_all_tw" in df2.columns: df2.drop(columns=["outside_all_tw"], inplace=True)
200+
if "above_threshold" in df2.columns: df2.drop(columns=["above_threshold"], inplace=True)
201+
if "keep_inverted" in df2.columns: df2.drop(columns=["keep_inverted"], inplace=True)
202+
return df2
203+
326204
def calculate_metrics(dflist, names, index_x=0, location=None):
327205
"""Calculate metrics between the index_x column and other columns
328206
@@ -1194,10 +1072,12 @@ def build_godin_plot(
11941072
# if p is not None else None for p in gtsp_plot_data]
11951073
cfs_to_cms = 0.028316847
11961074
gtsp_plot_data = []
1075+
obs_data_gdf = pp[0].gdf
11971076
for p in pp:
11981077
if p.gdf is not None:
11991078
if mask_data:
12001079
new_p = remove_data_for_time_windows_thresholds(
1080+
obs_data_gdf,
12011081
p.gdf,
12021082
time_window_exclusion_list_str=time_window_exclusion_list,
12031083
invert_selection=invert_timewindow_exclusion,
@@ -1304,10 +1184,12 @@ def build_scatter_plots(
13041184
# and there are no data that have been masked. For example, this could happen if only one masking time window is specified,
13051185
# and it's outside the time window of the data.
13061186
any_data_left = True
1187+
obs_data_gdf = pp[0].gdf
13071188

13081189
for p in pp:
13091190
if mask_data:
13101191
gpd = remove_data_for_time_windows_thresholds(
1192+
obs_data_gdf,
13111193
p.gdf,
13121194
time_window_exclusion_list,
13131195
invert_selection=invert_timewindow_exclusion,
@@ -1787,9 +1669,12 @@ def build_metrics_table(
17871669
# gtsp_plot_data = [p.gdf for p in pp]
17881670
gtsp_plot_data = []
17891671
gpd = None
1672+
1673+
obs_data_gdf = pp[0].gdf
17901674
for p in pp:
17911675
if mask_data:
17921676
gpd = remove_data_for_time_windows_thresholds(
1677+
obs_data_gdf,
17931678
p.gdf,
17941679
time_window_exclusion_list,
17951680
invert_selection=invert_timewindow_exclusion,

0 commit comments

Comments
 (0)