Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework state and postal code matching #50

Merged
merged 2 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 45 additions & 30 deletions pyap/source_US/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import string
from typing import List
from typing import Optional


def str_list_to_upper_lower_regex(str_list: List[str]) -> str:
Expand Down Expand Up @@ -986,7 +987,8 @@ def street_type_list_to_regex(street_type_list: list[str]) -> str:


def states_abbrvs_regex() -> str:
state_abbrs = [
# Some abbreviations are non-standard
_STATE_ABBRS = {
"AL",
"AK",
"AZ",
Expand All @@ -995,7 +997,6 @@ def states_abbrvs_regex() -> str:
"CO",
"CT",
"DE",
"DC",
"FL",
"GA",
"HI",
Expand All @@ -1009,7 +1010,7 @@ def states_abbrvs_regex() -> str:
"ME",
"MD",
"MA",
"MI(?:CH)?",
"MI(?:CH)?\.?",
"MN",
"MS",
"MO",
Expand All @@ -1019,7 +1020,7 @@ def states_abbrvs_regex() -> str:
"NH",
"NJ",
"NM",
"NY",
"NY|N\.Y\.",
"NC",
"ND",
"OH",
Expand All @@ -1038,25 +1039,27 @@ def states_abbrvs_regex() -> str:
"WV",
"WI",
"WY",
# unincorporated & commonwealth territories
}
_NON_STATE_ABBRS = {
"AS",
"GU",
"MP",
"PR",
"VI",
]

def to_abbr_with_optional_dots(abbr: str) -> str:
return "".join((c + r"\.?") if c in string.ascii_uppercase else c for c in abbr)

return str_list_to_upper_lower_regex(
[to_abbr_with_optional_dots(abbr) for abbr in state_abbrs]
"D\.?C\.?",
}
return (
r"(?:"
+ str_list_to_upper_lower_regex(list(_STATE_ABBRS | _NON_STATE_ABBRS))
+ r")(?![A-Za-z])"
)


# region1 is actually a "state"
region1 = r"""
(?P<region1>
def make_region1(idx: Optional[str] = None):
maybe_idx = f"_{idx}" if idx else ""
return r"""
(?P<region1{maybe_idx}>
(?:
# states full
[Aa][Ll][Aa][Bb][Aa][Mm][Aa]|
Expand Down Expand Up @@ -1126,8 +1129,9 @@ def to_abbr_with_optional_dots(abbr: str) -> str:
)
)
""".format(
state_abbrvs=states_abbrvs_regex()
)
state_abbrvs=states_abbrvs_regex(), maybe_idx=maybe_idx
)


# TODO: doesn't catch cities containing French characters
# We require short city names to contain a vowel
Expand All @@ -1145,11 +1149,8 @@ def to_abbr_with_optional_dots(abbr: str) -> str:
)
"""

postal_code = r"""
(?P<postal_code>
(?:\d{5}(?:\-\d{4})?(?!\d))
)
"""
postal_code_re = r"""(?:\d{5}(?:\-\d{4})?(?!\d))"""
postal_code = rf"""(?P<postal_code>{postal_code_re})"""

country = r"""
(?:
Expand All @@ -1159,35 +1160,49 @@ def to_abbr_with_optional_dots(abbr: str) -> str:
"""


def make_region1_postal_code(
    part_div: str = part_div, postal_code: str = postal_code
) -> str:
    """Build the regex fragment for the state / postal code part of an address.

    This should match region1 (state) and postal code each at most once,
    but require at least one of the two.

    The returned pattern is an alternation of two shapes:

    * ``[state] postal_code [state]`` -- the postal code is mandatory and a
      state may appear before and/or after it;
    * ``state`` alone -- only accepted when the negative lookahead finds no
      postal code after at most ten characters drawn from ``-``, ``,``,
      ``.``, space and ASCII letters.

    Each state occurrence is generated with its own suffix (``a``, ``b``,
    ``c``) so the named groups become ``region1_a``, ``region1_b`` and
    ``region1_c``; Python's ``re`` rejects a pattern that defines the same
    group name twice.

    The fragment contains insignificant whitespace and embeds patterns with
    inline ``#`` comments, so it is meant to be compiled in verbose mode.

    Args:
        part_div: Regex for the divider between address parts.
        postal_code: Regex (with its capturing group) for the postal code.

    Returns:
        The combined regex source as a string.
    """

    def _indexed_region1(idx: Optional[str] = None) -> str:
        # A divider followed by a state whose group name carries ``idx``.
        return rf"""(?:{part_div} {make_region1(idx)})"""

    # Raw f-string: ``\-`` must reach the regex engine as written. In a
    # non-raw literal it is an invalid escape sequence, which Python reports
    # as a SyntaxWarning from 3.12 on.
    _postal_code = rf"""(?:{part_div}|\-)? {postal_code}"""

    # NOTE(review): the lookahead uses the module-level ``postal_code_re``
    # rather than something derived from the ``postal_code`` argument, so a
    # caller-supplied postal code pattern does not affect it -- confirm this
    # is intended.
    return rf"""
        (?:{_indexed_region1("a")}?{_postal_code}{_indexed_region1("b")}?
            |{_indexed_region1("c")}(?![-,.\ A-Za-z]{{0,10}}{postal_code_re}))
    """


region1_postal_code = make_region1_postal_code()


def make_full_address(
*,
full_street: str = full_street,
part_div: str = part_div,
city: str = city,
region1: str = region1,
region1_postal_code: str = region1_postal_code,
country: str = country,
postal_code: str = postal_code,
phone_number: str = phone_number,
) -> str:

return r"""
(?P<full_address>
{full_street}
(?:{part_div} {phone_number})?
{part_div}{city}
(?:
{part_div} {region1} (?![A-Za-z])
|
(?:{part_div}|\-)? {postal_code}
){{1,2}}
{region1_postal_code}
(?:{part_div} {country})?
)
""".format(
full_street=full_street,
part_div=part_div,
city=city,
region1=region1,
region1_postal_code=region1_postal_code,
country=country,
postal_code=postal_code,
phone_number=phone_number,
)

Expand Down
7 changes: 6 additions & 1 deletion tests/test_parser_us.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,7 @@ def test_full_street_positive(input, expected):
[
# positive assertions
("P.O. BOX 10323 PH (205) 595-3511\nBIRMINGHAM, AL 35202", True),
("1100 VIRGINIA DR\nFORT WASHINGTON, PA, 19034", True),
("3602 HIGHPOINT\nSAN ANTONIO TX78217", True),
("8025 BLACK HORSE\nSTE 300\nPLEASANTVILLE NJ 08232", True),
("696 BEAL PKWY NW\nFT WALTON BCH FL 32547", True),
Expand Down Expand Up @@ -633,17 +634,21 @@ def test_postal_code(input, expected):
("Nebraska", True),
("NJ", True),
("DC", True),
("D.C.", True),
("N.Y.", True),
("PuErTO RIco", True),
("oregon", True),
("Tx", True),
("nY", True),
("fl", True),
("MICH", True),
# negative assertions
("NJ.", False),
],
)
def test_region1(input, expected):
"""test exact string match for province"""
execute_matching_test(input, expected, data_us.region1)
execute_matching_test(input, expected, data_us.make_region1())


@pytest.mark.parametrize(
Expand Down
Loading