Skip to content

Commit ef4386d

Browse files
authored
HDXDSYS-868 Use normalise function from HDX Python Utilities (OCHA-DAP#122)
* Use normalise function from HDX Python Utilities
* 3-letter names are no longer matched because we set a minimum length of 4, so add mappings
* Fix name replacements
* Update CHANGELOG
1 parent a6c855a commit ef4386d

File tree

7 files changed

+36
-35
lines changed

7 files changed

+36
-35
lines changed

CHANGELOG.md

+7
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

7+
## [0.9.23] - 2024-07-05
8+
9+
### Changed
10+
11+
- Use normalise function from HDX Python Utilities
12+
- Update mappings for changes in HDX Python Country
13+
714
## [0.9.22] - 2024-07-05
815

916
### Fixed

pyproject.toml

+4-4
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,12 @@ classifiers = [
3434
requires-python = ">=3.8"
3535

3636
dependencies = [
37-
"hapi-schema>=0.8.9",
37+
"hapi-schema>=0.8.12",
3838
"hdx-python-api>= 6.2.9",
39-
"hdx-python-country>= 3.7.4",
39+
"hdx-python-country>= 3.7.5",
4040
"hdx-python-database[postgresql]>= 1.3.1",
41-
"hdx-python-scraper>= 2.3.7",
42-
"hdx-python-utilities>= 3.6.9",
41+
"hdx-python-scraper>= 2.3.8",
42+
"hdx-python-utilities>= 3.7.2",
4343
"libhxl",
4444
"sqlalchemy"
4545
]

requirements.txt

+5-5
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ attrs==23.2.0
1010
# referencing
1111
cachetools==5.3.3
1212
# via google-auth
13-
certifi==2024.6.2
13+
certifi==2024.7.4
1414
# via requests
1515
cffi==1.16.0
1616
# via cryptography
@@ -64,7 +64,7 @@ hdx-python-api==6.3.1
6464
# via
6565
# hapi-pipelines (pyproject.toml)
6666
# hdx-python-scraper
67-
hdx-python-country==3.7.4
67+
hdx-python-country==3.7.5
6868
# via
6969
# hapi-pipelines (pyproject.toml)
7070
# hdx-python-api
@@ -73,7 +73,7 @@ hdx-python-database==1.3.1
7373
# via hapi-pipelines (pyproject.toml)
7474
hdx-python-scraper==2.3.8
7575
# via hapi-pipelines (pyproject.toml)
76-
hdx-python-utilities==3.7.1
76+
hdx-python-utilities==3.7.2
7777
# via
7878
# hapi-pipelines (pyproject.toml)
7979
# hdx-python-api
@@ -166,7 +166,7 @@ pyasn1-modules==0.4.0
166166
# via google-auth
167167
pycparser==2.22
168168
# via cffi
169-
pydantic==2.8.1
169+
pydantic==2.8.2
170170
# via frictionless
171171
pydantic-core==2.20.1
172172
# via pydantic
@@ -295,7 +295,7 @@ urllib3==2.2.2
295295
# via
296296
# libhxl
297297
# requests
298-
validators==0.29.0
298+
validators==0.30.0
299299
# via frictionless
300300
virtualenv==20.26.3
301301
# via pre-commit

src/hapi/pipelines/configs/core.yaml

+7-5
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ admin2:
5555
"AF08|Onaba": "AF0805"
5656
"AF14|Khost": "AF1401"
5757
"AF23|Chaghcharan": "AF2301"
58+
"CM008|Ndé": "CM008007"
5859
"CO08|Distrito Especial, Industrial Y Portuario De Barranquilla": "CO08001"
5960
"ET01|C. TIGRAY": "ET0102"
6061
"ET01|NW. TIGRAY": "ET0101"
@@ -187,16 +188,17 @@ admin2:
187188
"UA23|Vilnianskyi": "UA2306"
188189
"UA23|Yakymivskyi": "UA2308"
189190
"YE14|Radman Al Awad": "YE1412"
191+
"YE21|Ain": "YE2106"
190192
"YE24|Craiter": "YE2407"
191193
"YE26|Medghal": "YE2603"
192194

193195
admin_name_replacements:
194196
"COD| city": ""
195-
"ETH|c.": "central"
196-
"ETH|e.": "east"
197-
"ETH|n.": "north"
198-
"ETH|s.": "south"
199-
"ETH|w.": "west"
197+
"ETH|c ": "central "
198+
"ETH|e ": "east "
199+
"ETH|n ": "north "
200+
"ETH|s ": "south "
201+
"ETH|w ": "west "
200202

201203
orphan_admin2s:
202204
SS0001: "SSD"

src/hapi/pipelines/database/operational_presence.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77
from hapi_schema.db_operational_presence import DBOperationalPresence
88
from hdx.location.adminlevel import AdminLevel
99
from hdx.utilities.dictandlist import write_list_to_csv
10+
from hdx.utilities.text import normalise
1011
from sqlalchemy.orm import Session
1112

1213
from ..utilities.batch_populate import batch_populate
1314
from ..utilities.logging_helpers import add_message, add_missing_value_message
14-
from ..utilities.mappings import clean_text
1515
from . import admins
1616
from .base_uploader import BaseUploader
1717
from .metadata import Metadata
@@ -160,8 +160,8 @@ def populate(self, debug=False):
160160
)
161161
org_acronym, org_name, org_type = self._org.data[
162162
(
163-
clean_text(org_acronym),
164-
clean_text(org_name),
163+
normalise(org_acronym),
164+
normalise(org_name),
165165
)
166166
]
167167
sector_code = self._sector.get_sector_code(sector_orig)

src/hapi/pipelines/database/org.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
from hapi_schema.db_org import DBOrg
77
from hdx.scraper.utilities.reader import Read
88
from hdx.utilities.dictandlist import dict_of_sets_add
9+
from hdx.utilities.text import normalise
910
from sqlalchemy.orm import Session
1011

1112
from ..utilities.batch_populate import batch_populate
12-
from ..utilities.mappings import clean_text
1313
from .base_uploader import BaseUploader
1414

1515
logger = logging.getLogger(__name__)
@@ -57,8 +57,8 @@ def add_or_match_org(
5757
org_type,
5858
):
5959
key = (
60-
clean_text(acronym),
61-
clean_text(org_name),
60+
normalise(acronym),
61+
normalise(org_name),
6262
)
6363
if key in self.data:
6464
org_type_old = self.data[key][2]
@@ -68,8 +68,8 @@ def add_or_match_org(
6868
return
6969
self.data[
7070
(
71-
clean_text(acronym),
72-
clean_text(org_name),
71+
normalise(acronym),
72+
normalise(org_name),
7373
)
7474
] = [acronym, org_name, org_type]
7575

@@ -93,9 +93,9 @@ def get_org_info(self, org_name: str, location: str) -> Dict[str, str]:
9393
org_map_info = org_name_map.get(org_name)
9494
if not org_map_info:
9595
org_name_map_clean = {
96-
clean_text(on): org_name_map[on] for on in org_name_map
96+
normalise(on): org_name_map[on] for on in org_name_map
9797
}
98-
org_name_clean = clean_text(org_name)
98+
org_name_clean = normalise(org_name)
9999
org_map_info = org_name_map_clean.get(org_name_clean)
100100
if not org_map_info:
101101
return {"#org+name": org_name}

src/hapi/pipelines/utilities/mappings.py

+3-11
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
from re import sub
21
from typing import Dict
32

4-
from hdx.location.names import clean_name
53
from hdx.location.phonetics import Phonetics
4+
from hdx.utilities.text import normalise
65

76
MATCH_THRESHOLD = 5
87

@@ -30,8 +29,8 @@ def get_code_from_name(
3029
code = code_lookup.get(name)
3130
if code:
3231
return code, name, False
33-
name_clean = clean_text(name)
34-
clean_lookup = {clean_text(c): code_lookup[c] for c in code_lookup}
32+
name_clean = normalise(name)
33+
clean_lookup = {normalise(c): code_lookup[c] for c in code_lookup}
3534
code = clean_lookup.get(name_clean)
3635
if code:
3736
return code, name_clean, False
@@ -54,10 +53,3 @@ def get_code_from_name(
5453
name = names[name_index]
5554
code = code_lookup.get(name, code_mapping.get(name))
5655
return code, name_clean, True
57-
58-
59-
def clean_text(text: str) -> str:
60-
text_clean = clean_name(text)
61-
text_clean = sub(r"[^'a-zA-Z0-9\s]", " ", text_clean)
62-
text_clean = sub(" +", " ", text_clean)
63-
return text_clean.strip()

0 commit comments

Comments
 (0)