Skip to content

Commit 64dd5b8

Browse files
authored
Merge pull request #53 from OCHA-DAP/main
HDXDSYS-1318 Simplify poverty rate HAPI pipeline to read from global…
2 parents 7cfee46 + 0d0c3e3 commit 64dd5b8

10 files changed

+178
-92
lines changed

CHANGELOG.md

+7
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

7+
## [0.10.36] - 2025-01-30
8+
9+
### Changed
10+
11+
- Row functions in Admins use HXL tags instead of headers
12+
- Poverty rate columns updated
13+
714
## [0.10.35] - 2025-01-27
815

916
### Changed

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ dependencies = [
3939
"hdx-python-country>= 3.8.7",
4040
"hdx-python-database[postgresql]>= 1.3.4",
4141
"hdx-python-scraper>= 2.5.5",
42-
"hdx-python-utilities>= 3.8.2",
42+
"hdx-python-utilities>= 3.8.3",
4343
"libhxl",
4444
"sqlalchemy"
4545
]

requirements.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ hdx-python-database==1.3.4
6969
# via hapi-pipelines (pyproject.toml)
7070
hdx-python-scraper==2.5.5
7171
# via hapi-pipelines (pyproject.toml)
72-
hdx-python-utilities==3.8.2
72+
hdx-python-utilities==3.8.3
7373
# via
7474
# hapi-pipelines (pyproject.toml)
7575
# hdx-python-api
@@ -295,7 +295,7 @@ xlrd3==1.1.0
295295
# via libhxl
296296
xlsx2csv==0.8.4
297297
# via hdx-python-utilities
298-
xlsxwriter==3.2.1
298+
xlsxwriter==3.2.2
299299
# via tableschema-to-template
300300
xlwt==1.3.0
301301
# via hdx-python-utilities

src/hapi/pipelines/database/admins.py

+61-13
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727

2828
class Admins(BaseUploader):
29-
admin_name_regex = re.compile(r"Admin (\d) Name")
29+
admin_name_regex = re.compile(r"#adm(\d)\+name")
3030

3131
def __init__(
3232
self,
@@ -202,29 +202,77 @@ def get_admin2_ref(
202202
return ref
203203

204204
@classmethod
205-
def get_max_admin_from_headers(cls, headers) -> int:
205+
def get_max_admin_from_hxltags(cls, hxltag_to_header: Dict) -> int:
206206
max_admin_level = 0
207-
for header in headers:
208-
match = cls.admin_name_regex.match(header)
207+
for hxltag in hxltag_to_header:
208+
match = cls.admin_name_regex.match(hxltag)
209209
if match:
210210
admin_level = int(match.group(1))
211211
if admin_level > max_admin_level:
212212
max_admin_level = admin_level
213213
return max_admin_level
214214

215215
@staticmethod
216-
def get_admin_level_from_row(row: Dict, max_admin_level: int) -> int:
216+
def get_admin_level_from_row(
217+
hxltag_to_header: Dict,
218+
row: Dict,
219+
max_admin_level: int,
220+
) -> int:
217221
for i in range(max_admin_level, 0, -1):
218-
admin_name = row.get(f"Admin {i} Name")
222+
admin_name = row.get(hxltag_to_header[f"#adm{i}+name"])
219223
if admin_name:
220224
return i
221225
return 0
222226

227+
def get_admin1_ref_from_row(
228+
self,
229+
hxltag_to_header: Dict,
230+
row: Dict,
231+
dataset_name: str,
232+
pipeline: str,
233+
admin_level: int,
234+
) -> Optional[int]:
235+
if admin_level == 1:
236+
admin_code = row[hxltag_to_header["#adm1+code"]]
237+
if admin_code:
238+
admin1_ref = self.get_admin1_ref(
239+
"adminone",
240+
admin_code,
241+
dataset_name,
242+
pipeline,
243+
self._error_handler,
244+
)
245+
if admin1_ref:
246+
return admin1_ref
247+
admin_code = get_admin1_to_location_connector_code(
248+
row[hxltag_to_header["#country+code"]]
249+
)
250+
return self.get_admin1_ref(
251+
"adminone",
252+
admin_code,
253+
dataset_name,
254+
pipeline,
255+
self._error_handler,
256+
)
257+
if admin_level == 0:
258+
return self.get_admin1_ref(
259+
"national",
260+
row[hxltag_to_header["#country+code"]],
261+
dataset_name,
262+
pipeline,
263+
self._error_handler,
264+
)
265+
223266
def get_admin2_ref_from_row(
224-
self, row: Dict, dataset_name: str, pipeline: str, admin_level: int
267+
self,
268+
hxltag_to_header: Dict,
269+
row: Dict,
270+
dataset_name: str,
271+
pipeline: str,
272+
admin_level: int,
225273
) -> Optional[int]:
226274
if admin_level == 2:
227-
admin_code = row["Admin 2 PCode"]
275+
admin_code = row[hxltag_to_header["#adm2+code"]]
228276
if admin_code:
229277
admin2_ref = self.get_admin2_ref(
230278
"admintwo",
@@ -235,7 +283,7 @@ def get_admin2_ref_from_row(
235283
)
236284
if admin2_ref:
237285
return admin2_ref
238-
admin_code = row["Admin 1 PCode"]
286+
admin_code = row[hxltag_to_header["#adm1+code"]]
239287
if admin_code:
240288
admin_code = get_admin2_to_admin1_connector_code(admin_code)
241289
admin2_ref = self.get_admin2_ref(
@@ -248,7 +296,7 @@ def get_admin2_ref_from_row(
248296
if admin2_ref:
249297
return admin2_ref
250298
admin_code = get_admin2_to_location_connector_code(
251-
row["Country ISO3"]
299+
row[hxltag_to_header["#country+code"]]
252300
)
253301
return self.get_admin2_ref(
254302
"admintwo",
@@ -258,7 +306,7 @@ def get_admin2_ref_from_row(
258306
self._error_handler,
259307
)
260308
if admin_level == 1:
261-
admin_code = row["Admin 1 PCode"]
309+
admin_code = row[hxltag_to_header["#adm1+code"]]
262310
if admin_code:
263311
admin2_ref = self.get_admin2_ref(
264312
"adminone",
@@ -270,7 +318,7 @@ def get_admin2_ref_from_row(
270318
if admin2_ref:
271319
return admin2_ref
272320
admin_code = get_admin1_to_location_connector_code(
273-
row["Country ISO3"]
321+
row[hxltag_to_header["#country+code"]]
274322
)
275323
return self.get_admin2_ref(
276324
"adminone",
@@ -282,7 +330,7 @@ def get_admin2_ref_from_row(
282330
if admin_level == 0:
283331
return self.get_admin2_ref(
284332
"national",
285-
row["Country ISO3"],
333+
row[hxltag_to_header["#country+code"]],
286334
dataset_name,
287335
pipeline,
288336
self._error_handler,

src/hapi/pipelines/database/humanitarian_needs.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from hapi_schema.db_humanitarian_needs import DBHumanitarianNeeds
77
from hdx.api.configuration import Configuration
88
from hdx.scraper.framework.utilities.reader import Read
9+
from hdx.utilities.dictandlist import invert_dictionary
910
from hdx.utilities.text import get_numeric_if_possible
1011
from sqlalchemy.orm import Session
1112

@@ -47,23 +48,27 @@ def populate(self) -> None:
4748
time_period_end = datetime(year, 12, 31, 23, 59, 59)
4849
url = resource["url"]
4950
headers, rows = reader.get_tabular_rows(url, dict_form=True)
50-
max_admin_level = self._admins.get_max_admin_from_headers(headers)
51+
hxltag_to_header = invert_dictionary(next(rows))
52+
max_admin_level = self._admins.get_max_admin_from_hxltags(
53+
hxltag_to_header
54+
)
5155
# Admin 1 PCode,Admin 2 PCode,Sector,Gender,Age Group,Disabled,Population Group,Population,In Need,Targeted,Affected,Reached
5256
for row in rows:
5357
error = row.get("Error")
5458
if error:
5559
continue
56-
countryiso3 = row["Country ISO3"]
57-
if countryiso3 == "#country+code": # ignore HXL row
58-
continue
5960
admin_level = self._admins.get_admin_level_from_row(
60-
row, max_admin_level
61+
hxltag_to_header, row, max_admin_level
6162
)
6263
# Can't handle higher admin levels
6364
if admin_level > 2:
6465
continue
6566
admin2_ref = self._admins.get_admin2_ref_from_row(
66-
row, dataset_name, "HumanitarianNeeds", admin_level
67+
hxltag_to_header,
68+
row,
69+
dataset_name,
70+
"HumanitarianNeeds",
71+
admin_level,
6772
)
6873
if not admin2_ref:
6974
continue

src/hapi/pipelines/database/operational_presence.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from hdx.api.utilities.hdx_error_handler import HDXErrorHandler
88
from hdx.scraper.framework.utilities.reader import Read
99
from hdx.utilities.dateparse import parse_date
10+
from hdx.utilities.dictandlist import invert_dictionary
1011
from sqlalchemy.orm import Session
1112

1213
from ..utilities.batch_populate import batch_populate
@@ -42,23 +43,23 @@ def populate(self) -> None:
4243
resource = dataset.get_resource()
4344
url = resource["url"]
4445
headers, rows = reader.get_tabular_rows(url, dict_form=True)
45-
max_admin_level = self._admins.get_max_admin_from_headers(headers)
46+
hxltag_to_header = invert_dictionary(next(rows))
47+
max_admin_level = self._admins.get_max_admin_from_hxltags(
48+
hxltag_to_header
49+
)
4650
resources_to_ignore = []
4751
operational_presence_rows = []
4852
# Country ISO3,Admin 1 PCode,Admin 1 Name,Admin 2 PCode,Admin 2 Name,Admin 3 PCode,Admin 3 Name,Org Name,Org Acronym,Org Type,Sector,Start Date,End Date,Resource Id
4953
for row in rows:
5054
resource_id = row["Resource Id"]
5155
if resource_id in resources_to_ignore:
5256
continue
53-
countryiso3 = row["Country ISO3"]
5457
dataset_id = row["Dataset Id"]
55-
if dataset_id[0] == "#":
56-
continue
5758
dataset_name = self._metadata.get_dataset_name(dataset_id)
5859
if not dataset_name:
5960
dataset_name = dataset_id
6061
admin_level = self._admins.get_admin_level_from_row(
61-
row, max_admin_level
62+
hxltag_to_header, row, max_admin_level
6263
)
6364
actual_admin_level = admin_level
6465
# Higher admin levels treat as admin 2
@@ -68,10 +69,16 @@ def populate(self) -> None:
6869
else:
6970
error_when_duplicate = True
7071
admin2_ref = self._admins.get_admin2_ref_from_row(
71-
row, dataset_name, "OperationalPresence", admin_level
72+
hxltag_to_header,
73+
row,
74+
dataset_name,
75+
"OperationalPresence",
76+
admin_level,
7277
)
7378
if not admin2_ref:
7479
continue
80+
81+
countryiso3 = row["Country ISO3"]
7582
provider_admin1_name = get_provider_name(row, "Admin 1 Name")
7683
provider_admin2_name = get_provider_name(row, "Admin 2 Name")
7784

src/hapi/pipelines/database/poverty_rate.py

+23-41
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,12 @@
88
from hdx.api.utilities.hdx_error_handler import HDXErrorHandler
99
from hdx.scraper.framework.utilities.reader import Read
1010
from hdx.utilities.dateparse import parse_date
11-
from hdx.utilities.dictandlist import dict_of_lists_add
11+
from hdx.utilities.dictandlist import dict_of_lists_add, invert_dictionary
1212
from hdx.utilities.text import get_numeric_if_possible
1313
from sqlalchemy.orm import Session
1414

1515
from ..utilities.provider_admin_names import get_provider_name
1616
from . import admins
17-
from .admins import get_admin1_to_location_connector_code
1817
from .base_uploader import BaseUploader
1918
from .metadata import Metadata
2019

@@ -36,29 +35,6 @@ def __init__(
3635
self._configuration = configuration
3736
self._error_handler = error_handler
3837

39-
def get_admin1_ref(self, row, dataset_name):
40-
countryiso3 = row["country_code"]
41-
if countryiso3 == "#country+code": # ignore HXL row
42-
return None
43-
admin_code = row["admin1_code"]
44-
if admin_code:
45-
admin_level = "adminone"
46-
else:
47-
admin1_name = row["admin1_name"]
48-
if admin1_name:
49-
admin_level = "adminone"
50-
admin_code = get_admin1_to_location_connector_code(countryiso3)
51-
else:
52-
admin_level = "national"
53-
admin_code = countryiso3
54-
return self._admins.get_admin1_ref(
55-
admin_level,
56-
admin_code,
57-
dataset_name,
58-
"PovertyRate",
59-
self._error_handler,
60-
)
61-
6238
def populate(self) -> None:
6339
logger.info("Populating poverty rate table")
6440
reader = Read.get_reader("hdx")
@@ -69,9 +45,9 @@ def populate(self) -> None:
6945
null_values_by_iso3 = {}
7046

7147
def get_value(row: Dict, in_col: str) -> float:
72-
countryiso3 = row["country_code"]
48+
countryiso3 = row["Country ISO3"]
7349
value = row[in_col]
74-
admin_name = row["admin1_name"]
50+
admin_name = row["Admin 1 Name"]
7551
if not admin_name:
7652
admin_name = countryiso3
7753
if value is None:
@@ -84,18 +60,24 @@ def get_value(row: Dict, in_col: str) -> float:
8460
resource_id = resource["id"]
8561
self._metadata.add_resource(dataset_id, resource)
8662
url = resource["url"]
87-
_, rows = reader.get_tabular_rows(url, dict_form=True)
88-
89-
# country_code,admin1_code,admin1_name,mpi,headcount_ratio,intensity_of_deprivation,vulnerable_to_poverty,in_severe_poverty,reference_period_start,reference_period_end
63+
header, rows = reader.get_tabular_rows(url, dict_form=True)
64+
hxltag_to_header = invert_dictionary(next(rows))
9065
for row in rows:
91-
admin1_ref = self.get_admin1_ref(row, dataset_name)
66+
admin_level = self._admins.get_admin_level_from_row(
67+
hxltag_to_header, row, 1
68+
)
69+
admin1_ref = self._admins.get_admin1_ref_from_row(
70+
hxltag_to_header,
71+
row,
72+
dataset_name,
73+
"PovertyRate",
74+
admin_level,
75+
)
9276
if not admin1_ref:
9377
continue
94-
provider_admin1_name = get_provider_name(row, "admin1_name")
95-
reference_period_start = parse_date(
96-
row["reference_period_start"]
97-
)
98-
reference_period_end = parse_date(row["reference_period_end"])
78+
provider_admin1_name = get_provider_name(row, "Admin 1 Name")
79+
reference_period_start = parse_date(row["Start Date"])
80+
reference_period_end = parse_date(row["End Date"])
9981
key = (
10082
admin1_ref,
10183
provider_admin1_name,
@@ -118,15 +100,15 @@ def get_value(row: Dict, in_col: str) -> float:
118100
provider_admin1_name=provider_admin1_name,
119101
reference_period_start=reference_period_start,
120102
reference_period_end=reference_period_end,
121-
mpi=get_value(row, "mpi"),
122-
headcount_ratio=get_value(row, "headcount_ratio"),
103+
mpi=get_value(row, "MPI"),
104+
headcount_ratio=get_value(row, "Headcount Ratio"),
123105
intensity_of_deprivation=get_value(
124-
row, "intensity_of_deprivation"
106+
row, "Intensity of Deprivation"
125107
),
126108
vulnerable_to_poverty=get_value(
127-
row, "vulnerable_to_poverty"
109+
row, "Vulnerable to Poverty"
128110
),
129-
in_severe_poverty=get_value(row, "in_severe_poverty"),
111+
in_severe_poverty=get_value(row, "In Severe Poverty"),
130112
)
131113
self._session.add(row)
132114
self._session.commit()

tests/fixtures/input/download-global-mpi-trends.csv

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
country_code,admin1_code,admin1_name,mpi,headcount_ratio,intensity_of_deprivation,vulnerable_to_poverty,in_severe_poverty,reference_period_start,reference_period_end
1+
Country ISO3,Admin 1 PCode,Admin 1 Name,MPI,Headcount Ratio,Intensity of Deprivation,Vulnerable to Poverty,In Severe Poverty,Start Date,End Date
22
#country+code,#adm1+code,#adm1+name,#indicator+mpi,#indicator+headcount_ratio,#indicator+intensity_of_deprivation,#indicator+vulnerable_to_poverty,#indicator+in_severe_poverty,#date+start,#date+end
33
AFG,,,0.2342396091002832,46.93584855784794,49.90633306897293,27.381337677259033,20.80265720520784,2015-01-01 00:00:00+00:00,2016-12-31 00:00:00+00:00
44
AFG,,,0.2683302947167732,52.177907473390185,51.42603598153431,26.33666305841249,25.971520762600047,2022-01-01 00:00:00+00:00,2023-12-31 00:00:00+00:00

0 commit comments

Comments
 (0)