# File: data_normalisation_cerf.py
# Repository generated from the ediakatos/template-poetry template.
"""Cerf data normalisation script."""
import json
from pathlib import Path
import pandas as pd
import pycountry
from src.data_consolidation.dictionary import (
CERF_MAPPING,
)
from src.utils.azure_blob_utils import read_blob_to_dataframe
from src.utils.util import (
change_data_type,
map_and_drop_columns,
normalize_event_type,
)
# Path to the CERF JSON schema. main() loads it, passes it to
# change_data_type(), and uses the order of its "properties" keys to order
# the output columns. Relative, so it resolves against the working directory.
SCHEMA_PATH_CERF = "./src/cerf/cerf_schema.json"
# Path to the event-code CSV that main() hands to normalize_event_type().
# Relative, so it resolves against the working directory.
EVENT_CODE_CSV = "./static_data/event_code_table.csv"
def main() -> None:
    """Normalise the raw CERF data and write the result to a local CSV file.

    Steps, in order:

    1. Read the raw CERF CSV from blob storage and load the JSON schema at
       ``SCHEMA_PATH_CERF``.
    2. Run ``map_and_drop_columns`` with ``CERF_MAPPING`` on the raw frame.
    3. Add a ``Country_Code`` column holding the ISO 3166-1 alpha-3 code
       that pycountry resolves for each ``Country`` value, or ``None`` when
       the value cannot be resolved.
    4. Run ``change_data_type`` with the schema, then parse ``Date``
       day-first; values that cannot be parsed become ``NaT``.
    5. Run ``normalize_event_type`` with ``EVENT_CODE_CSV``.
    6. Reorder columns: those named in the schema's ``properties`` come
       first (in schema order), followed by any remaining columns in their
       existing order.
    7. Write the frame to ``./data_mid_1/cerf/cerf_mid1.csv`` without the
       index, creating the directory if needed.
    """
    # Local import: only the nested helper's return annotation needs it.
    from typing import Optional

    blob_name = "disaster-impact/raw/cerf/cerf_emergency_data_dynamic_web_scrape.csv"
    cerf_df_raw = read_blob_to_dataframe(blob_name)

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with Path(SCHEMA_PATH_CERF).open(encoding="utf-8") as schema_cerf:
        cerf_schema = json.load(schema_cerf)

    cleaned1_df = map_and_drop_columns(cerf_df_raw, CERF_MAPPING)

    def get_iso3_code(country_name: object) -> Optional[str]:
        """Return the alpha-3 code pycountry finds for *country_name*, else None."""
        # Non-string cells (e.g. NaN for a missing country) cannot be looked
        # up by name, so treat them the same as an unknown country.
        if not isinstance(country_name, str):
            return None
        try:
            return pycountry.countries.lookup(country_name).alpha_3
        except LookupError:
            return None

    cleaned1_df["Country_Code"] = cleaned1_df["Country"].apply(get_iso3_code)

    cleaned2_df = change_data_type(cleaned1_df, cerf_schema)
    cleaned2_df["Date"] = pd.to_datetime(
        cleaned2_df["Date"],
        errors="coerce",
        dayfirst=True,
    )
    cleaned2_df = normalize_event_type(cleaned2_df, EVENT_CODE_CSV)

    # Schema columns first (schema order), then whatever else is present.
    schema_order = list(cerf_schema["properties"])
    schema_columns = set(schema_order)
    ordered_columns = [col for col in schema_order if col in cleaned2_df.columns]
    remaining_columns = [
        col for col in cleaned2_df.columns if col not in schema_columns
    ]
    cleaned2_df = cleaned2_df[ordered_columns + remaining_columns]

    # Derive the file path from the directory so the two cannot drift apart.
    output_dir = Path("./data_mid_1/cerf/")
    output_dir.mkdir(parents=True, exist_ok=True)
    cleaned2_df.to_csv(output_dir / "cerf_mid1.csv", index=False)
# Run the normalisation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()