diff --git a/pyap/source_US/data.py b/pyap/source_US/data.py index a239daf..ad2040f 100644 --- a/pyap/source_US/data.py +++ b/pyap/source_US/data.py @@ -16,6 +16,7 @@ import string from typing import List +from typing import Optional def str_list_to_upper_lower_regex(str_list: List[str]) -> str: @@ -986,7 +987,8 @@ def street_type_list_to_regex(street_type_list: list[str]) -> str: def states_abbrvs_regex() -> str: - state_abbrs = [ + # Some abbreviations are non-standard + _STATE_ABBRS = { "AL", "AK", "AZ", @@ -995,7 +997,6 @@ def states_abbrvs_regex() -> str: "CO", "CT", "DE", - "DC", "FL", "GA", "HI", @@ -1009,7 +1010,7 @@ def states_abbrvs_regex() -> str: "ME", "MD", "MA", - "MI(?:CH)?", + "MI(?:CH)?\.?", "MN", "MS", "MO", @@ -1019,7 +1020,7 @@ def states_abbrvs_regex() -> str: "NH", "NJ", "NM", - "NY", + "NY|N\.Y\.", "NC", "ND", "OH", @@ -1038,25 +1039,27 @@ def states_abbrvs_regex() -> str: "WV", "WI", "WY", - # unincorporated & commonwealth territories + } + _NON_STATE_ABBRS = { "AS", "GU", "MP", "PR", "VI", - ] - - def to_abbr_with_optional_dots(abbr: str) -> str: - return "".join((c + r"\.?") if c in string.ascii_uppercase else c for c in abbr) - - return str_list_to_upper_lower_regex( - [to_abbr_with_optional_dots(abbr) for abbr in state_abbrs] + "D\.?C\.?", + } + return ( + r"(?:" + + str_list_to_upper_lower_regex(list(_STATE_ABBRS | _NON_STATE_ABBRS)) + + r")(?![A-Za-z])" ) # region1 is actually a "state" -region1 = r""" - (?P +def make_region1(idx: Optional[str] = None): + maybe_idx = f"_{idx}" if idx else "" + return r""" + (?P (?: # states full [Aa][Ll][Aa][Bb][Aa][Mm][Aa]| @@ -1126,8 +1129,9 @@ def to_abbr_with_optional_dots(abbr: str) -> str: ) ) """.format( - state_abbrvs=states_abbrvs_regex() -) + state_abbrvs=states_abbrvs_regex(), maybe_idx=maybe_idx + ) + # TODO: doesn't catch cities containing French characters # We require short city names to contain a vowel @@ -1145,11 +1149,8 @@ def to_abbr_with_optional_dots(abbr: str) -> str: ) """ -postal_code = r""" - (?P - (?:\d{5}(?:\-\d{4})?(?!\d)) - ) - """ +postal_code_re = r"""(?:\d{5}(?:\-\d{4})?(?!\d))""" +postal_code = rf"""(?P{postal_code_re})""" country = r""" (?: @@ -1159,35 +1160,49 @@ def to_abbr_with_optional_dots(abbr: str) -> str: """ +def make_region1_postal_code( + part_div: str = part_div, postal_code: str = postal_code +) -> str: + """This should match region1 (state) and postal code each at most once, + but require at least one of the two.""" + + def _indexed_region1(idx: Optional[str] = None): + return rf"""(?:{part_div} {make_region1(idx)})""" + + _postal_code = f"""(?:{part_div}|\-)? {postal_code}""" + return rf""" + (?:{_indexed_region1("a")}?{_postal_code}{_indexed_region1("b")}? + |{_indexed_region1("c")}(?![-,.\ A-Za-z]{{0,10}}{postal_code_re})) + """ + + +region1_postal_code = make_region1_postal_code() + + def make_full_address( *, full_street: str = full_street, part_div: str = part_div, city: str = city, - region1: str = region1, + region1_postal_code: str = region1_postal_code, country: str = country, - postal_code: str = postal_code, phone_number: str = phone_number, ) -> str: + return r""" (?P {full_street} (?:{part_div} {phone_number})? {part_div}{city} - (?: - {part_div} {region1} (?![A-Za-z]) - | - (?:{part_div}|\-)? {postal_code} - ){{1,2}} + {region1_postal_code} (?:{part_div} {country})? ) """.format( full_street=full_street, part_div=part_div, city=city, - region1=region1, + region1_postal_code=region1_postal_code, country=country, - postal_code=postal_code, phone_number=phone_number, ) diff --git a/tests/test_parser_us.py b/tests/test_parser_us.py index 4e00040..58bb663 100644 --- a/tests/test_parser_us.py +++ b/tests/test_parser_us.py @@ -476,6 +476,7 @@ def test_full_street_positive(input, expected): [ # positive assertions ("P.O. BOX 10323 PH (205) 595-3511\nBIRMINGHAM, AL 35202", True), + ("1100 VIRGINIA DR\nFORT WASHINGTON, PA, 19034", True), ("3602 HIGHPOINT\nSAN ANTONIO TX78217", True), ("8025 BLACK HORSE\nSTE 300\nPLEASANTVILLE NJ 08232", True), ("696 BEAL PKWY NW\nFT WALTON BCH FL 32547", True), @@ -633,17 +634,21 @@ def test_postal_code(input, expected): ("Nebraska", True), ("NJ", True), ("DC", True), + ("D.C.", True), + ("N.Y.", True), ("PuErTO RIco", True), ("oregon", True), ("Tx", True), ("nY", True), ("fl", True), ("MICH", True), + # negative assertions + ("NJ.", False), ], ) def test_region1(input, expected): """test exact string match for province""" - execute_matching_test(input, expected, data_us.region1) + execute_matching_test(input, expected, data_us.make_region1()) @pytest.mark.parametrize(