generated from ediakatos/template-poetry
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_normalisation_dc.py
121 lines (99 loc) · 4.05 KB
/
data_normalisation_dc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Data normalisation for the Disaster Charter dataset."""
import json
import re
from pathlib import Path
import pandas as pd
from src.data_consolidation.dictionary import (
DISASTER_CHARTER_MAPPING,
)
from src.utils.azure_blob_utils import read_blob_to_dataframe
from src.utils.util import (
change_data_type,
map_and_drop_columns,
normalize_event_type,
)
# JSON schema file loaded by main(); the path is relative to the working directory.
SCHEMA_PATH_DISASTER_CHARTER = "./src/disaster_charter/disaster_charter_schema.json"
# Blob name handed to read_blob_to_dataframe() to fetch the raw dataset.
BLOB_NAME = (
    "disaster-impact/raw/disaster-charter/charter_activations_web_scrape_2000_2024.csv"
)
# CSV path passed to normalize_event_type(); presumably the event-code lookup
# table - TODO confirm against that helper.
EVENT_CODE_CSV = "./static_data/event_code_table.csv"
def extract_event_type_from_event_name(
df: pd.DataFrame,
event_name_col: str = "Event_Name",
event_type_col: str = "Event_Type",
) -> pd.DataFrame:
"""Extracts the ET from the EN if the event type is missing or empty.
Args:
df (pd.DataFrame): The DataFrame containing event data.
event_name_col (str): The column name for event names. Default is 'Event_Name'.
event_type_col (str): The column name for event types. Default is 'Event_Type'.
Returns:
pd.DataFrame: The DataFrame with the event type column updated.
"""
if event_name_col in df.columns and event_type_col in df.columns:
def extract_event_type(row: pd.Series) -> str | None:
if (
pd.isna(
row[event_type_col],
)
or not row[event_type_col]
) and isinstance(
row[event_name_col],
str,
):
match = re.search(r"^(.*?)\s+in\s+", row[event_name_col])
if match:
return match.group(1).strip()
return row[event_type_col]
df[event_type_col] = df.apply(extract_event_type, axis=1)
return df
def remove_float_suffix(value: str | list) -> str | list:
"""Convert float values to strings without decimal points if they are whole numbers.
Args:
value (float, list): A float or a list of floats to be processed.
Returns:
str, list: A string or a list of
strings with whole number floats converted to integers.
"""
if isinstance(value, list):
cleaned_list = []
for item in value:
if isinstance(item, float) and item.is_integer():
cleaned_list.append(str(int(item)))
else:
cleaned_list.append(str(item))
return cleaned_list
if isinstance(value, float) and value.is_integer():
return str(int(value))
return str(value)
def main() -> None:
    """Normalise the Disaster Charter dataset and write it out as a CSV file.

    The steps, in order:
      1. Load the JSON schema from ``SCHEMA_PATH_DISASTER_CHARTER``.
      2. Read the raw data from the blob named by ``BLOB_NAME``.
      3. Apply ``DISASTER_CHARTER_MAPPING`` via ``map_and_drop_columns``.
      4. Fill blank ``Event_Type`` values from ``Event_Name``.
      5. Apply the schema via ``change_data_type`` and parse ``Date``.
      6. Run ``normalize_event_type`` with ``EVENT_CODE_CSV``.
      7. Put the columns listed in the schema first, in schema order, followed
         by any remaining columns.
      8. Convert ``Source_Event_IDs`` values to strings without a trailing
         ``.0`` and write the result to
         ``./data_mid_1/disaster_charter/disaster_charter_mid1.csv``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # default encoding when reading the schema.
    with Path(SCHEMA_PATH_DISASTER_CHARTER).open(
        encoding="utf-8",
    ) as schema_disaster_charter:
        disaster_schema = json.load(schema_disaster_charter)

    disaster_charter_df_raw = read_blob_to_dataframe(BLOB_NAME)
    cleaned1_df = map_and_drop_columns(
        disaster_charter_df_raw,
        DISASTER_CHARTER_MAPPING,
    )
    cleaned1_df = extract_event_type_from_event_name(
        cleaned1_df,
        event_name_col="Event_Name",
        event_type_col="Event_Type",
    )

    cleaned2_df = change_data_type(cleaned1_df, disaster_schema)
    # errors="coerce" turns unparseable dates into NaT instead of raising.
    cleaned2_df["Date"] = pd.to_datetime(cleaned2_df["Date"], errors="coerce")
    cleaned2_df = normalize_event_type(cleaned2_df, EVENT_CODE_CSV)

    # Schema-defined columns come first (in schema order); columns the schema
    # does not mention are kept and appended in their existing order.
    schema_order = list(disaster_schema["properties"].keys())
    ordered_columns = [col for col in schema_order if col in cleaned2_df.columns]
    remaining_columns = [col for col in cleaned2_df.columns if col not in schema_order]
    cleaned2_df = cleaned2_df[ordered_columns + remaining_columns]

    if "Source_Event_IDs" in cleaned2_df.columns:
        cleaned2_df["Source_Event_IDs"] = cleaned2_df["Source_Event_IDs"].apply(
            remove_float_suffix,
        )

    # Build the file path from the directory so the two cannot drift apart.
    output_dir = Path("./data_mid_1/disaster_charter/")
    output_dir.mkdir(parents=True, exist_ok=True)
    cleaned2_df.to_csv(output_dir / "disaster_charter_mid1.csv", index=False)


if __name__ == "__main__":
    main()