Refactor and update source code.

ray310 · ray310 · commit f7ca6fe00740 · 2024-07-31T16:56:41.000-05:00
- Refactor visualizations module
    - Add search_grid ML utility function
    - Clean up docstrings
diff --git a/src/__init__.py b/src/__init__.py
@@ -0,0 +1 @@
+"""Source code used in NYC Collisions Analysis."""
diff --git a/src/constants.py b/src/constants.py
@@ -1,4 +1,4 @@
-""" Project constants, parameters, and settings."""
+"""Project constants, parameters, and settings."""
 
 COORD_REF_SYSTEM = "EPSG:4326"  # default geojson CRS
 
diff --git a/src/strings.py b/src/strings.py
@@ -1,17 +1,17 @@
-""" String formatting functions."""
+"""String formatting functions."""
 
 from typing import Iterable
 import numpy as np
 
 
 def add_line_breaks(str_arr: Iterable):
-    """Returns list of strings with optimal line break inserted"""
+    """Return list of strings with optimal line break inserted."""
     max_line_length = max(length_after_line_break(s) for s in str_arr)
     return [insert_line_break(s, max_line_length) for s in str_arr]
 
 
 def insert_line_break(s: str, idx: int):
-    """Returns new string with line break inserted at or before index, idx"""
+    """Return new string with line break inserted at or before index (idx)."""
     spaces = [x for x in get_space_indices(s) if x <= idx]
     if not spaces:
         return s
@@ -22,8 +22,9 @@ def insert_line_break(s: str, idx: int):
 
 
 def length_after_line_break(s: str):
-    """Returns length of longest line after replacing space with line break
-    near string mid-point."""
+    """Return length of longest line after replacing space with line break
+    near string mid-point.
+    """
     spaces = get_space_indices(s)
     if not spaces:
         return len(s)
@@ -33,7 +34,7 @@ def length_after_line_break(s: str):
 
 
 def get_space_indices(s: str):
-    """Returns list of indices for spaces in a string"""
+    """Return list of indices for spaces in a string."""
     if not isinstance(s, str):
         return None
     return [idx for idx, c in enumerate(s) if c == " "]
diff --git a/src/utils.py b/src/utils.py
@@ -9,46 +9,32 @@
 import shapely
 from shapely.geometry import shape
 from shapely.strtree import STRtree
+from sklearn import model_selection
 
 
-def make_week_crosstab(df, divisor, values=None, aggfunc=None, day_of_week_map=None):
-    """Return an hour / day-of-week crosstab scaled by a divisor."""
-    ct = pd.crosstab(
-        index=df["datetime"].dt.dayofweek,
-        columns=df["datetime"].dt.hour,
-        values=values,
-        aggfunc=aggfunc,
-    )
-    if day_of_week_map:
-        ct.rename(index=day_of_week_map, inplace=True)
-    ct /= divisor  # scale crosstab by divisor
-    return ct
-
-
-def get_crosstab_min_max(
-    df, col, categories, divisor=None, values_col=None, aggfunc=None
+def min_max_across_crosstabs(
+    categories, cat_series, idx_series, col_series, value_series=None, aggfunc=None
 ):
-    """Return the min and max values of weekly crosstabs across all categories.
+    """Return the min and max values of crosstabs across all categories.
 
     Categories should be an iterable. Used to ensure that different heatmaps
     have the same scale.
     """
+    if value_series is not None and aggfunc is None:
+        raise TypeError("'value_series' requires 'aggfunc' to be specified.")
     max_val = float("-inf")
     min_val = float("inf")
     for cat in categories:
-        is_true = df[col].isin([cat])
-        idx = df.loc[is_true, "datetime"].dt.dayofweek
-        cols = df.loc[is_true, "datetime"].dt.hour
+        is_true = cat_series.isin([cat])
+        idx = idx_series[is_true]
+        cols = col_series[is_true]
         values = None
         if aggfunc:
-            values = df.loc[is_true, values_col]
+            values = value_series[is_true]
         ct = pd.crosstab(index=idx, columns=cols, values=values, aggfunc=aggfunc)
 
-        min_val = min(min_val, min(ct.min()))  # ct.min() returns pd.Series
+        min_val = min(min_val, min(ct.min()))  # ct.min() / max() return pd.Series
         max_val = max(max_val, max(ct.max()))
-    if divisor:
-        min_val /= divisor
-        max_val /= divisor
     return min_val, max_val
 
 
@@ -65,7 +51,7 @@ def make_heatmap_labels(
     return ct_labels
 
 
-def date_to_season(dt: datetime.datetime):
+def date_to_season(dt: datetime.datetime | pd.Timestamp):
     """Convert individual datetime or pd.Timestamp to season of year."""
     # day of year corresponding to following dates:
     # 1-Jan, 21-Mar, 21-Jun, 21-Sep, 21-Dec, 31-Dec
@@ -83,8 +69,7 @@ def date_to_season(dt: datetime.datetime):
 
 
 def read_geojson(shape_file_loc: str, property_name: str):
-    """
-    Return list of geometry ids and list of geometries from geojson.
+    """Return list of geometry ids and list of geometries from geojson.
 
     Assumes geojson conforms to 2016 geojson convention.
     """
@@ -96,8 +81,7 @@ def read_geojson(shape_file_loc: str, property_name: str):
 
 
 def id_nearest_shape(geometry: shapely.Point, r_tree: shapely.STRtree, shape_ids: list):
-    """
-    Return the id (from list of shape_ids) of the nearest shape to input geometry.
+    """Return the id (from list of shape_ids) of the nearest shape to input geometry.
 
     Uses a Shapely STRtree (R-tree) to perform a faster lookup.
     """
@@ -125,3 +109,48 @@ def add_location_feature(
         lambda x: id_nearest_shape(x.geometry, tree, geom_ids), axis=1
     )
     return gdf
+
+
+def search_grid(x, y, model, params, score, num_cv=5, low_score_best=True):
+    """Perform grid search cross validation then print and return results.
+
+    Args:
+        x (pd.DataFrame, pd.Series, or np.ndarray): Model features.
+        y (pd.Series, or np.ndarray): Target.
+        model (sklearn model): Model to use in grid search.
+        params (dict): Key-value parameters to use in grid search. Key is model
+            input name.
+        score (str, callable, list, tuple or dict): Strategy to evaluate the
+            performance of the cross-validated model on the test set.
+        num_cv (int, cv generator or  iterable): CV splitting strategy.
+        low_score_best (bool): Whether the lowest score is best. False indicates
+            that the highest score is best score.
+
+    Returns:
+        list(tup): List of grid search cross-validation results as tuples containing:
+            1) mean test score
+            2) run time in minutes
+            3) parameters used
+
+    """
+    param_grid = model_selection.ParameterGrid(params)
+    results = []
+    print("Mean Score", "\tRun Time(min)", "\tParameters")
+    for param in param_grid:
+        parameterized_model = model(**param)
+        cv_run = model_selection.cross_validate(
+            parameterized_model, x, y, scoring=score, cv=num_cv
+        )
+
+        mean_score = sum(cv_run["test_score"]) / num_cv
+        minutes = (sum(cv_run["fit_time"]) + sum(cv_run["score_time"])) / 60
+        results.append((mean_score, minutes, param))
+        result_string = f"{mean_score:.4f}\t\t{minutes:.3f}\t\t{param}"
+        print(result_string)
+
+    results.sort(key=lambda z: z[0], reverse=low_score_best)
+    best_score = f"\nBest score: {results[0][0]}\n"
+    best_params = f"Best parameters: {results[0][2]}\n"
+    print(best_score + best_params)
+
+    return results
diff --git a/src/visualizations.py b/src/visualizations.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""Source code used in NYC Collisions Analysis."""`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" Project constants, parameters, and settings."""`
	`1`	`+"""Project constants, parameters, and settings."""`
`2`	`2`
`3`	`3`	`COORD_REF_SYSTEM = "EPSG:4326" # default geojson CRS`
`4`	`4`