Skip to content

Commit

Permalink
Merge pull request #50 from argyle-engineering/rework-state-postal-code
Browse files Browse the repository at this point in the history
Rework state and postal code matching
  • Loading branch information
lmelninkas authored Apr 25, 2024
2 parents 1e7903f + eba1105 commit ea976bc
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 31 deletions.
75 changes: 45 additions & 30 deletions pyap/source_US/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import string
from typing import List
from typing import Optional


def str_list_to_upper_lower_regex(str_list: List[str]) -> str:
Expand Down Expand Up @@ -986,7 +987,8 @@ def street_type_list_to_regex(street_type_list: list[str]) -> str:


def states_abbrvs_regex() -> str:
state_abbrs = [
# Some abbreviations are non-standard
_STATE_ABBRS = {
"AL",
"AK",
"AZ",
Expand All @@ -995,7 +997,6 @@ def states_abbrvs_regex() -> str:
"CO",
"CT",
"DE",
"DC",
"FL",
"GA",
"HI",
Expand All @@ -1009,7 +1010,7 @@ def states_abbrvs_regex() -> str:
"ME",
"MD",
"MA",
"MI(?:CH)?",
"MI(?:CH)?\.?",
"MN",
"MS",
"MO",
Expand All @@ -1019,7 +1020,7 @@ def states_abbrvs_regex() -> str:
"NH",
"NJ",
"NM",
"NY",
"NY|N\.Y\.",
"NC",
"ND",
"OH",
Expand All @@ -1038,25 +1039,27 @@ def states_abbrvs_regex() -> str:
"WV",
"WI",
"WY",
# unincorporated & commonwealth territories
}
_NON_STATE_ABBRS = {
"AS",
"GU",
"MP",
"PR",
"VI",
]

def to_abbr_with_optional_dots(abbr: str) -> str:
return "".join((c + r"\.?") if c in string.ascii_uppercase else c for c in abbr)

return str_list_to_upper_lower_regex(
[to_abbr_with_optional_dots(abbr) for abbr in state_abbrs]
"D\.?C\.?",
}
return (
r"(?:"
+ str_list_to_upper_lower_regex(list(_STATE_ABBRS | _NON_STATE_ABBRS))
+ r")(?![A-Za-z])"
)


# region1 is actually a "state"
region1 = r"""
(?P<region1>
def make_region1(idx: Optional[str] = None):
maybe_idx = f"_{idx}" if idx else ""
return r"""
(?P<region1{maybe_idx}>
(?:
# states full
[Aa][Ll][Aa][Bb][Aa][Mm][Aa]|
Expand Down Expand Up @@ -1126,8 +1129,9 @@ def to_abbr_with_optional_dots(abbr: str) -> str:
)
)
""".format(
state_abbrvs=states_abbrvs_regex()
)
state_abbrvs=states_abbrvs_regex(), maybe_idx=maybe_idx
)


# TODO: doesn't catch cities containing French characters
# We require short city names to contain a vowel
Expand All @@ -1145,11 +1149,8 @@ def to_abbr_with_optional_dots(abbr: str) -> str:
)
"""

postal_code = r"""
(?P<postal_code>
(?:\d{5}(?:\-\d{4})?(?!\d))
)
"""
postal_code_re = r"""(?:\d{5}(?:\-\d{4})?(?!\d))"""
postal_code = rf"""(?P<postal_code>{postal_code_re})"""

country = r"""
(?:
Expand All @@ -1159,35 +1160,49 @@ def to_abbr_with_optional_dots(abbr: str) -> str:
"""


def make_region1_postal_code(
    part_div: str = part_div, postal_code: str = postal_code
) -> str:
    """Build the regex fragment for the state / postal code part of an address.

    This should match region1 (state) and postal code each at most once,
    but require at least one of the two.

    Args:
        part_div: Regex source for the divider between address parts.
        postal_code: Regex source for the postal code.

    Returns:
        Regex source with two alternatives:
        an optional state, the postal code, and an optional state; or a
        state alone that is not followed (within up to 10 separator/letter
        characters) by something that looks like a postal code.
    """

    def _indexed_region1(idx: Optional[str] = None) -> str:
        # make_region1 appends "_<idx>" to the region1 group name, so every
        # occurrence in the combined pattern gets a distinct group name.
        return rf"""(?:{part_div} {make_region1(idx)})"""

    # Raw f-string: "\-" is not a valid Python string escape, so a non-raw
    # literal triggers an invalid-escape warning. The resulting text is the same.
    _postal_code = rf"""(?:{part_div}|\-)? {postal_code}"""
    # NOTE(review): the lookahead uses the module-level postal_code_re rather
    # than the postal_code argument -- presumably to avoid repeating a named
    # group; confirm this is intended when a custom postal_code is passed.
    return rf"""
(?:{_indexed_region1("a")}?{_postal_code}{_indexed_region1("b")}?
|{_indexed_region1("c")}(?![-,.\ A-Za-z]{{0,10}}{postal_code_re}))
"""


region1_postal_code = make_region1_postal_code()


def make_full_address(
*,
full_street: str = full_street,
part_div: str = part_div,
city: str = city,
region1: str = region1,
region1_postal_code: str = region1_postal_code,
country: str = country,
postal_code: str = postal_code,
phone_number: str = phone_number,
) -> str:

return r"""
(?P<full_address>
{full_street}
(?:{part_div} {phone_number})?
{part_div}{city}
(?:
{part_div} {region1} (?![A-Za-z])
|
(?:{part_div}|\-)? {postal_code}
){{1,2}}
{region1_postal_code}
(?:{part_div} {country})?
)
""".format(
full_street=full_street,
part_div=part_div,
city=city,
region1=region1,
region1_postal_code=region1_postal_code,
country=country,
postal_code=postal_code,
phone_number=phone_number,
)

Expand Down
7 changes: 6 additions & 1 deletion tests/test_parser_us.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,7 @@ def test_full_street_positive(input, expected):
[
# positive assertions
("P.O. BOX 10323 PH (205) 595-3511\nBIRMINGHAM, AL 35202", True),
("1100 VIRGINIA DR\nFORT WASHINGTON, PA, 19034", True),
("3602 HIGHPOINT\nSAN ANTONIO TX78217", True),
("8025 BLACK HORSE\nSTE 300\nPLEASANTVILLE NJ 08232", True),
("696 BEAL PKWY NW\nFT WALTON BCH FL 32547", True),
Expand Down Expand Up @@ -633,17 +634,21 @@ def test_postal_code(input, expected):
("Nebraska", True),
("NJ", True),
("DC", True),
("D.C.", True),
("N.Y.", True),
("PuErTO RIco", True),
("oregon", True),
("Tx", True),
("nY", True),
("fl", True),
("MICH", True),
# negative assertions
("NJ.", False),
],
)
def test_region1(input, expected):
"""test exact string match for province"""
execute_matching_test(input, expected, data_us.region1)
execute_matching_test(input, expected, data_us.make_region1())


@pytest.mark.parametrize(
Expand Down

0 comments on commit ea976bc

Please sign in to comment.