"""Utility functions for the project."""
import pandas as pd


def map_and_drop_columns(raw_data: pd.DataFrame, dictionary: dict) -> pd.DataFrame:
    """Renames columns in raw_data based on a mapping and drops unmapped columns.

    Args:
        raw_data (pd.DataFrame): The input DataFrame with raw data.
        dictionary (dict): A dictionary where keys are the new column names and
            values are the old column names. Entries with an empty value are skipped.

    Returns:
        pd.DataFrame: A DataFrame with columns renamed and unnecessary columns dropped.
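
    Example (illustrative sketch; the column names below are hypothetical, not
    the project's real schema):
        >>> raw = pd.DataFrame({"old_name": [1, 2], "unused": [3, 4]})
        >>> mapping = {"new_name": "old_name", "skipped": None}
        >>> mapped = map_and_drop_columns(raw, mapping)
        >>> list(mapped.columns)
        ['new_name']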
"""
rename_mapping = {value: key for key, value in dictionary.items() if value}
return raw_data[list(rename_mapping.keys())].rename(columns=rename_mapping)


def change_data_type(cleaned1_data: pd.DataFrame, json_schema: dict) -> pd.DataFrame:
    """Change the data types of columns in a DataFrame based on a JSON schema.

    Args:
        cleaned1_data (pd.DataFrame): The DataFrame with data to be type-cast.
        json_schema (dict): The JSON schema defining the desired data types
            for each column.

    Returns:
        pd.DataFrame: The DataFrame with columns cast to the specified data types.
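
    Example (illustrative sketch; the column names and the minimal schema below
    are hypothetical, not the project's real schema):
        >>> frame = pd.DataFrame({"count": ["1", "2"], "tags": [["a", "b"], None]})
        >>> schema = {"properties": {
        ...     "count": {"type": "integer"},
        ...     "tags": {"type": ["array", "null"]},
        ... }}
        >>> typed = change_data_type(frame, schema)
        >>> str(typed["count"].dtype)
        'Int64'
        >>> typed["tags"].tolist()
        ['a,b', '']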
"""
for column, properties in json_schema["properties"].items():
if column in cleaned1_data.columns:
column_type = properties.get("type")
if "array" in column_type:
cleaned1_data[column] = cleaned1_data[column].apply(
lambda x: ",".join(map(str, x))
if isinstance(x, list)
else (str(x) if pd.notna(x) else ""),
)
elif "string" in column_type:
cleaned1_data[column] = cleaned1_data[column].astype(str)
elif "number" in column_type:
cleaned1_data[column] = pd.to_numeric(
cleaned1_data[column],
errors="coerce",
)
elif "integer" in column_type:
cleaned1_data[column] = pd.to_numeric(
cleaned1_data[column],
errors="coerce",
).astype("Int64")
elif "null" in column_type:
cleaned1_data[column] = cleaned1_data[column].where(
cleaned1_data[column].notna(),
None,
)
return cleaned1_data


def normalize_event_type(df: pd.DataFrame, event_code_csv: str) -> pd.DataFrame:
    """Normalizes the Event_Type column using an event-code mapping CSV.

    The CSV file is expected to have two columns with headers:

    - event_code: the normalized event type key.
    - event_name: the event type description.

    For each row in `df`, if the standardized Event_Type value matches a
    description from the CSV, the corresponding normalized key is stored in a
    new column, Event_Code. If no match is found, the original Event_Type value
    is retained.

    Args:
        df (pd.DataFrame): The input DataFrame containing an 'Event_Type' column.
        event_code_csv (str): The path to the CSV file containing the event code
            mapping.

    Returns:
        pd.DataFrame: The DataFrame with an additional 'Event_Code' column.
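
    Example (illustrative sketch; the "FL"/"Flood" code and the temporary CSV
    path are hypothetical, not the project's reference data):
        >>> import os, tempfile
        >>> codes = pd.DataFrame({"event_code": ["FL"], "event_name": ["Flood"]})
        >>> csv_path = os.path.join(tempfile.gettempdir(), "event_codes_example.csv")
        >>> codes.to_csv(csv_path, index=False)
        >>> events = pd.DataFrame({"Event_Type": [" flood ", "Storm"]})
        >>> normalize_event_type(events, csv_path)["Event_Code"].tolist()
        ['FL', 'Storm']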
"""
event_mapping_df = pd.read_csv(event_code_csv)
event_mapping_df["event_name"] = (
event_mapping_df["event_name"].str.strip().str.upper()
)
event_mapping_df["event_code"] = event_mapping_df["event_code"].str.strip()
mapping = dict(
zip(
event_mapping_df["event_name"],
event_mapping_df["event_code"],
strict=False,
),
)
df["Event_Code"] = (
df["Event_Type"]
.astype(str)
.str.strip()
.str.upper()
.map(mapping)
.fillna(df["Event_Type"])
)
return df