9
9
import shapely
10
10
from shapely .geometry import shape
11
11
from shapely .strtree import STRtree
12
+ from sklearn import model_selection
12
13
13
14
14
- def make_week_crosstab (df , divisor , values = None , aggfunc = None , day_of_week_map = None ):
15
- """Return an hour / day-of-week crosstab scaled by a divisor."""
16
- ct = pd .crosstab (
17
- index = df ["datetime" ].dt .dayofweek ,
18
- columns = df ["datetime" ].dt .hour ,
19
- values = values ,
20
- aggfunc = aggfunc ,
21
- )
22
- if day_of_week_map :
23
- ct .rename (index = day_of_week_map , inplace = True )
24
- ct /= divisor # scale crosstab by divisor
25
- return ct
26
-
27
-
28
- def get_crosstab_min_max (
29
- df , col , categories , divisor = None , values_col = None , aggfunc = None
15
def min_max_across_crosstabs(
    categories, cat_series, idx_series, col_series, value_series=None, aggfunc=None
):
    """Return the min and max values of crosstabs across all categories.

    One crosstab is built per category, using only the rows of ``cat_series``
    that equal that category. Used to ensure that different heatmaps have the
    same scale.

    Args:
        categories (iterable): Category values to build crosstabs for.
        cat_series (pd.Series): Category of each row.
        idx_series (pd.Series): Values used as the crosstab index (rows).
        col_series (pd.Series): Values used as the crosstab columns.
        value_series (pd.Series, optional): Values to aggregate in each cell.
            Must be given together with ``aggfunc``.
        aggfunc (optional): Aggregation passed through to ``pd.crosstab``.
            Must be given together with ``value_series``.

    Returns:
        tuple: ``(min_val, max_val)`` over every cell of every crosstab.
        ``(inf, -inf)`` is returned when no category matches any row.

    Raises:
        TypeError: If only one of ``value_series`` and ``aggfunc`` is given.

    Note:
        The series are filtered with a boolean mask derived from
        ``cat_series``, so they are assumed to share the same index.
    """
    if value_series is not None and aggfunc is None:
        raise TypeError("'value_series' requires 'aggfunc' to be specified.")
    if aggfunc is not None and value_series is None:
        raise TypeError("'aggfunc' requires 'value_series' to be specified.")

    min_val = float("inf")
    max_val = float("-inf")
    for cat in categories:
        is_true = cat_series.isin([cat])
        if not is_true.any():
            # No rows for this category: nothing to tabulate, and an empty
            # crosstab has no min / max to contribute.
            continue

        values = value_series[is_true] if aggfunc is not None else None
        ct = pd.crosstab(
            index=idx_series[is_true],
            columns=col_series[is_true],
            values=values,
            aggfunc=aggfunc,
        )

        # DataFrame.min() / max() return a per-column pd.Series; reducing it
        # again gives the table-wide extreme while skipping NaN cells. The
        # running value is passed first so a NaN candidate never replaces it.
        min_val = min(min_val, ct.min().min())
        max_val = max(max_val, ct.max().max())
    return min_val, max_val
53
39
54
40
@@ -65,7 +51,7 @@ def make_heatmap_labels(
65
51
return ct_labels
66
52
67
53
68
- def date_to_season (dt : datetime .datetime ):
54
+ def date_to_season (dt : datetime .datetime | pd . Timestamp ):
69
55
"""Convert individual datetime or pd.Timestamp to season of year."""
70
56
# day of year corresponding to following dates:
71
57
# 1-Jan, 21-Mar, 21-Jun, 21-Sep, 21-Dec, 31-Dec
@@ -83,8 +69,7 @@ def date_to_season(dt: datetime.datetime):
83
69
84
70
85
71
def read_geojson (shape_file_loc : str , property_name : str ):
86
- """
87
- Return list of geometry ids and list of geometries from geojson.
72
+ """Return list of geometry ids and list of geometries from geojson.
88
73
89
74
Assumes geojson conforms to 2016 geojson convention.
90
75
"""
@@ -96,8 +81,7 @@ def read_geojson(shape_file_loc: str, property_name: str):
96
81
97
82
98
83
def id_nearest_shape (geometry : shapely .Point , r_tree : shapely .STRtree , shape_ids : list ):
99
- """
100
- Return the id (from list of shape_ids) of the nearest shape to input geometry.
84
+ """Return the id (from list of shape_ids) of the nearest shape to input geometry.
101
85
102
86
Uses a Shapely STRtree (R-tree) to perform a faster lookup.
103
87
"""
@@ -125,3 +109,48 @@ def add_location_feature(
125
109
lambda x : id_nearest_shape (x .geometry , tree , geom_ids ), axis = 1
126
110
)
127
111
return gdf
112
+
113
+
114
def search_grid(x, y, model, params, score, num_cv=5, low_score_best=True):
    """Perform grid search cross validation then print and return results.

    Every parameter combination in ``params`` is used to build a fresh model,
    which is cross-validated; one line per combination is printed, followed
    by the best score and its parameters.

    Args:
        x (pd.DataFrame, pd.Series, or np.ndarray): Model features.
        y (pd.Series, or np.ndarray): Target.
        model (callable): Estimator class (or factory) called as
            ``model(**param)`` to build the estimator for each combination.
        params (dict): Key-value parameters to use in grid search. Key is model
            input name.
        score (str or callable): Strategy to evaluate the performance of the
            cross-validated model on the test set. Must be a single metric:
            the mean is read from the ``"test_score"`` entry of the
            cross-validation output.
        num_cv (int, cv generator or iterable): CV splitting strategy.
        low_score_best (bool): Whether the lowest score is best. False indicates
            that the highest score is best score.

    Returns:
        list(tup): List of grid search cross-validation results, sorted with the
            best result first, as tuples containing:
            1) mean test score
            2) run time in minutes
            3) parameters used

    """
    param_grid = model_selection.ParameterGrid(params)
    results = []
    print("Mean Score", "\t Run Time(min)", "\t Parameters")
    for param in param_grid:
        parameterized_model = model(**param)
        cv_run = model_selection.cross_validate(
            parameterized_model, x, y, scoring=score, cv=num_cv
        )

        # Average over the folds actually run; num_cv may be a splitter or an
        # iterable rather than an int, so it cannot be used as the divisor.
        fold_scores = cv_run["test_score"]
        mean_score = sum(fold_scores) / len(fold_scores)
        minutes = (sum(cv_run["fit_time"]) + sum(cv_run["score_time"])) / 60
        results.append((mean_score, minutes, param))
        result_string = f"{mean_score:.4f} \t \t {minutes:.3f} \t \t {param} "
        print(result_string)

    if not results:
        # Empty grid: there is no best result to report.
        return results

    # Best result first: ascending when low scores are best, else descending.
    results.sort(key=lambda z: z[0], reverse=not low_score_best)
    best_score = f"\n Best score: {results[0][0]} \n "
    best_params = f"Best parameters: {results[0][2]} \n "
    print(best_score + best_params)

    return results
0 commit comments