Skip to content

Commit 6b28a1f

Browse files
Merge pull request #18 from OCHA-DAP/main
HDXDSYS-941 fuzzy matching and SDN population
2 parents 23fa283 + 1fb2a96 commit 6b28a1f

File tree

6 files changed

+98
-11
lines changed

6 files changed

+98
-11
lines changed

CHANGELOG.md

+12
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

7+
## [0.9.43] - 2024-08-01
8+
9+
### Changed
10+
11+
- Updated SDN population data
12+
13+
## [0.9.42] - 2024-08-01
14+
15+
### Changed
16+
17+
- Re-enable fuzzy matching
18+
719
## [0.9.41] - 2024-07-31
820

921
### Changed

src/hapi/pipelines/configs/population.yaml

+23-2
Original file line numberDiff line numberDiff line change
@@ -840,7 +840,7 @@ population_national:
840840

841841
population_sdn:
842842
dataset: "cod-ps-sdn"
843-
resource: "sdn_admpop_adm0_2022.csv"
843+
resource: "sdn_admpop_adm0_2024.csv"
844844
format: "csv"
845845
use_hxl: False
846846
admin_single: "SDN"
@@ -1819,7 +1819,7 @@ population_adminone:
18191819

18201820
population_sdn:
18211821
dataset: "cod-ps-sdn"
1822-
resource: "sdn_admpop_adm1_2022.csv"
1822+
resource: "sdn_admpop_adm1_2024.csv"
18231823
format: "csv"
18241824
use_hxl: False
18251825
admin:
@@ -2747,6 +2747,27 @@ population_admintwo:
27472747
- "#population+m+age_80_plus"
27482748
- "#population+age_80_plus+total"
27492749

2750+
population_sdn:
2751+
dataset: "cod-ps-sdn"
2752+
resource: "sdn_admpop_adm2_2024.csv"
2753+
format: "csv"
2754+
use_hxl: False
2755+
admin:
2756+
- ~
2757+
- "ADM2_PCODE"
2758+
input:
2759+
- "F_80Plus"
2760+
- "M_80Plus"
2761+
- "T_80Plus"
2762+
output:
2763+
- "F_80plus"
2764+
- "M_80plus"
2765+
- "T_80plus"
2766+
output_hxl:
2767+
- "#population+f+age_80_plus"
2768+
- "#population+m+age_80_plus"
2769+
- "#population+age_80_plus+total"
2770+
27502771
population_slv:
27512772
dataset: "cod-ps-slv"
27522773
resource: "slv_admpop_adm2_2023.csv"

src/hapi/pipelines/database/org_type.py

+2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def __init__(
2424
super().__init__(session)
2525
self._datasetinfo = datasetinfo
2626
self.data = org_type_map
27+
self.unmatched = []
2728

2829
def populate(self):
2930
logger.info("Populating org type table")
@@ -67,4 +68,5 @@ def get_org_type_code(self, org_type: str) -> str | None:
6768
return get_code_from_name(
6869
name=org_type,
6970
code_lookup=self.data,
71+
unmatched=self.unmatched,
7072
)

src/hapi/pipelines/database/sector.py

+2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def __init__(
2424
super().__init__(session)
2525
self._datasetinfo = datasetinfo
2626
self.data = sector_map
27+
self.unmatched = []
2728

2829
def populate(self):
2930
logger.info("Populating sector table")
@@ -64,4 +65,5 @@ def get_sector_code(self, sector: str) -> str | None:
6465
return get_code_from_name(
6566
name=sector,
6667
code_lookup=self.data,
68+
unmatched=self.unmatched,
6769
)

src/hapi/pipelines/utilities/mappings.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict
1+
from typing import Dict, List
22

33
from hdx.location.phonetics import Phonetics
44
from hdx.utilities.text import normalise
@@ -9,14 +9,16 @@
99
def get_code_from_name(
1010
name: str,
1111
code_lookup: Dict[str, str],
12-
fuzzy_match: bool = False,
12+
unmatched: List[str],
13+
fuzzy_match: bool = True,
1314
) -> str | None:
1415
"""
1516
Given a name (org type, sector, etc), return the corresponding code.
1617
1718
Args:
1819
name (str): Name to match
1920
code_lookup (dict): Dictionary of official names and codes
21+
unmatched (List[str]): List of unmatched names
2022
fuzzy_match (bool): Allow fuzzy matching or not
2123
2224
Returns:
@@ -25,22 +27,27 @@ def get_code_from_name(
2527
code = code_lookup.get(name)
2628
if code:
2729
return code
30+
if name in unmatched:
31+
return None
2832
name_clean = normalise(name)
2933
code = code_lookup.get(name_clean)
3034
if code:
3135
code_lookup[name] = code
3236
return code
3337
if len(name) <= MATCH_THRESHOLD:
38+
unmatched.append(name)
3439
return None
3540
if not fuzzy_match:
41+
unmatched.append(name)
3642
return None
37-
names = list(code_lookup.keys())
43+
names = [x for x in code_lookup.keys() if len(x) > MATCH_THRESHOLD]
3844
name_index = Phonetics().match(
3945
possible_names=names,
4046
name=name,
4147
alternative_name=name_clean,
4248
)
4349
if name_index is None:
50+
unmatched.append(name)
4451
return None
4552
code = code_lookup.get(names[name_index])
4653
if code:

tests/test_mappings.py

+49-6
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,51 @@ def test_get_code_from_name_org_type():
3838
get_code_from_name(
3939
"NATIONAL_NGO",
4040
actual_org_type_lookup,
41+
[],
42+
fuzzy_match=False,
4143
)
4244
== "441"
4345
)
4446
assert (
4547
get_code_from_name(
4648
"COOPÉRATION_INTERNATIONALE",
4749
actual_org_type_lookup,
50+
[],
51+
fuzzy_match=False,
52+
)
53+
is None
54+
)
55+
unmatched = []
56+
assert (
57+
get_code_from_name(
58+
"COOPÉRATION_INTERNATIONALE",
59+
actual_org_type_lookup,
60+
unmatched,
61+
fuzzy_match=True,
62+
)
63+
is None
64+
)
65+
assert (
66+
get_code_from_name(
67+
"COOPÉRATION_INTERNATIONALE",
68+
actual_org_type_lookup,
69+
unmatched,
70+
fuzzy_match=True,
71+
)
72+
is None
73+
)
74+
assert (
75+
get_code_from_name(
76+
"NGO", actual_org_type_lookup, [], fuzzy_match=False
4877
)
4978
is None
5079
)
51-
assert get_code_from_name("NGO", actual_org_type_lookup) is None
5280
assert (
5381
get_code_from_name(
5482
"International",
5583
actual_org_type_lookup,
84+
[],
85+
fuzzy_match=False,
5686
)
5787
is None
5888
)
@@ -100,16 +130,29 @@ def test_get_code_from_name_sector():
100130
actual_sector_lookup = {normalise(k): v for k, v in sector_lookup.items()}
101131
actual_sector_lookup.update(sector_map)
102132
assert (
103-
get_code_from_name("education", actual_sector_lookup, fuzzy_match=True)
133+
get_code_from_name(
134+
"education", actual_sector_lookup, [], fuzzy_match=True
135+
)
104136
== "EDU"
105137
)
106138
assert (
107139
get_code_from_name(
108-
"LOGISTIQUE", actual_sector_lookup, fuzzy_match=True
140+
"LOGISTIQUE", actual_sector_lookup, [], fuzzy_match=True
109141
)
110142
== "LOG"
111143
)
112-
assert get_code_from_name("CCCM", actual_sector_lookup) == "CCM"
113-
assert get_code_from_name("Santé", actual_sector_lookup) == "HEA"
144+
assert (
145+
get_code_from_name("CCCM", actual_sector_lookup, [], fuzzy_match=False)
146+
== "CCM"
147+
)
148+
assert (
149+
get_code_from_name(
150+
"Santé", actual_sector_lookup, [], fuzzy_match=False
151+
)
152+
== "HEA"
153+
)
114154
actual_sector_lookup["cccm"] = "CCM"
115-
assert get_code_from_name("CCS", actual_sector_lookup) is None
155+
assert (
156+
get_code_from_name("CCS", actual_sector_lookup, [], fuzzy_match=False)
157+
is None
158+
)

0 commit comments

Comments
 (0)