Skip to content

Commit a72bda4

Browse files
Merge pull request #39 from mideind/puncterrors
Puncterrors
2 parents 54e2c0a + 460eed6 commit a72bda4

File tree

3 files changed

+131
-29
lines changed

3 files changed

+131
-29
lines changed

src/tokenizer/definitions.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
TelnoTuple = Tuple[str, str]
5555
CurrencyTuple = Tuple[str, Optional[List[str]], Optional[List[str]]]
5656

57+
5758
class BIN_Tuple(NamedTuple):
5859
stofn: str
5960
utg: int
@@ -62,13 +63,16 @@ class BIN_Tuple(NamedTuple):
6263
ordmynd: str
6364
beyging: str
6465

66+
6567
BIN_TupleList = Sequence[BIN_Tuple]
6668

69+
6770
class PersonNameTuple(NamedTuple):
6871
name: str
6972
gender: Optional[str]
7073
case: Optional[str]
7174

75+
7276
PersonNameList = Sequence[PersonNameTuple]
7377

7478
# All possible contents of the Tok.val attribute
@@ -193,6 +197,9 @@ class PersonNameTuple(NamedTuple):
193197
PUNCT_ENDING_WORD = frozenset(["'", "²", "³"])
194198
# Punctuation symbols that may occur together
195199
PUNCT_COMBINATIONS = frozenset(["?", "!", "…"])
200+
# Punctuation at the end of indirect speech that doesn't necessarily end a sentence
201+
PUNCT_INDIRECT_SPEECH = frozenset(["?", "!"])
202+
196203

197204
# Single and double quotes
198205
SQUOTES = "'‚‛‘´"
@@ -204,7 +211,13 @@ class PersonNameTuple(NamedTuple):
204211
TELNO_PREFIXES = "45678"
205212

206213
# Known telephone country codes
207-
COUNTRY_CODES = frozenset(("354", "+354", "00354",))
214+
COUNTRY_CODES = frozenset(
215+
(
216+
"354",
217+
"+354",
218+
"00354",
219+
)
220+
)
208221

209222
# Words that can precede a year number; will be assimilated into the year token
210223
YEAR_WORD = frozenset(("árið", "ársins", "árinu"))
@@ -516,9 +529,9 @@ class PersonNameTuple(NamedTuple):
516529
"N": "Norður",
517530
}
518531

519-
_unit_lambda: Callable[[str], str] = lambda unit: unit + r"(?!\w)" if unit[
520-
-1
521-
].isalpha() else unit
532+
_unit_lambda: Callable[[str], str] = (
533+
lambda unit: unit + r"(?!\w)" if unit[-1].isalpha() else unit
534+
)
522535

523536
SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys())
524537
SI_UNITS_REGEX_STRING = r"|".join(
@@ -627,7 +640,7 @@ class PersonNameTuple(NamedTuple):
627640

628641

629642
def roman_to_int(s: str) -> int:
630-
""" Quick and dirty conversion of an already validated Roman numeral to integer """
643+
"""Quick and dirty conversion of an already validated Roman numeral to integer"""
631644
# Adapted from http://code.activestate.com/recipes/81611-roman-numerals/
632645
i = result = 0
633646
for integer, numeral in ROMAN_NUMERAL_MAP:
@@ -637,6 +650,7 @@ def roman_to_int(s: str) -> int:
637650
assert i == len(s)
638651
return result
639652

653+
640654
NUMBER_ABBREV = {
641655
"þús.": 1000,
642656
"millj.": 10 ** 6,
@@ -1147,7 +1161,7 @@ def roman_to_int(s: str) -> int:
11471161

11481162

11491163
def valid_ssn(kt: str) -> bool:
1150-
""" Validate Icelandic social security number """
1164+
"""Validate Icelandic social security number"""
11511165
if not kt or len(kt) != 11 or kt[6] != "-":
11521166
return False
11531167
m = 11 - sum((ord(kt[i]) - 48) * KT_MAGIC[i] for i in range(9)) % 11

src/tokenizer/tokenizer.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1674,20 +1674,21 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
16741674
# Normalize two periods to one
16751675
dots, rt = rt.split(2)
16761676
yield TOK.Punctuation(dots, normalized=".")
1677-
elif rtxt.startswith(",,") and rtxt[2:3].isalpha():
1678-
# Probably someone trying to type opening double quotes with commas
1679-
punct, rt = rt.split(2)
1680-
yield TOK.Punctuation(punct, normalized="„")
16811677
elif rtxt.startswith(",,"):
1682-
# Coalesce multiple commas into one normalized comma
1683-
numcommas = 2
1684-
for c in rtxt[2:]:
1685-
if c == ",":
1686-
numcommas += 1
1687-
else:
1688-
break
1689-
punct, rt = rt.split(numcommas)
1690-
yield TOK.Punctuation(punct, normalized=",")
1678+
if rtxt[2:3].isalnum():
1679+
# Probably someone trying to type opening double quotes with commas
1680+
punct, rt = rt.split(2)
1681+
yield TOK.Punctuation(punct, normalized="„")
1682+
else:
1683+
# Coalesce multiple commas into one normalized comma
1684+
numcommas = 2
1685+
for c in rtxt[2:]:
1686+
if c == ",":
1687+
numcommas += 1
1688+
else:
1689+
break
1690+
punct, rt = rt.split(numcommas)
1691+
yield TOK.Punctuation(punct, normalized=",")
16911692
elif rtxt[0] in HYPHENS:
16921693
# Normalize all hyphens the same way
16931694
punct, rt = rt.split(1)
@@ -2459,6 +2460,23 @@ def parse_sentences(token_stream: Iterator[Tok]) -> Iterator[Tok]:
24592460
# This token starts a new sentence
24602461
yield tok_begin_sentence
24612462
in_sentence = True
2463+
if (
2464+
token.punctuation in PUNCT_INDIRECT_SPEECH
2465+
and next_token.punctuation in DQUOTES
2466+
):
2467+
yield token
2468+
token = next_token
2469+
next_token = next(token_stream)
2470+
if next_token.txt.islower():
2471+
# Probably indirect speech
2472+
# „Er einhver þarna?“ sagði konan.
2473+
yield token
2474+
token = next_token
2475+
next_token = next(token_stream)
2476+
else:
2477+
yield token
2478+
token = tok_end_sentence
2479+
in_sentence = False
24622480
if token.punctuation in END_OF_SENTENCE and not (
24632481
token.punctuation
24642482
== "…" # Excluding sentences with ellipsis in the middle
@@ -2535,7 +2553,6 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:
25352553
# Maintain a one-token lookahead
25362554
token = next(token_stream)
25372555
while True:
2538-
25392556
next_token = next(token_stream)
25402557
# Coalesce abbreviations and trailing period
25412558
if token.kind == TOK.WORD and next_token.txt == ".":

test/test_tokenizer.py

Lines changed: 80 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ def strip_originals(tokens: List[Tok]) -> List[Tok]:
5858
return tokens
5959

6060

61+
def get_text_and_norm(orig: str) -> Tuple[str, str]:
62+
toklist = list(t.tokenize(orig))
63+
return t.text_from_tokens(toklist), t.normalized_text_from_tokens(toklist)
64+
65+
6166
def test_single_tokens() -> None:
6267

6368
TEST_CASES = [
@@ -2319,16 +2324,12 @@ def gen(s: Iterable[str]) -> Iterator[str]:
23192324
g = t.split_into_sentences("Athugum [hvort [setningin sé rétt skilin]].")
23202325
sents = list(g)
23212326
assert len(sents) == 1
2322-
assert sents == [
2323-
"Athugum [ hvort [ setningin sé rétt skilin ] ] ."
2324-
]
2327+
assert sents == ["Athugum [ hvort [ setningin sé rétt skilin ] ] ."]
23252328

23262329
g = t.split_into_sentences("Þessi [ætti [líka að]] vera rétt skilin.")
23272330
sents = list(g)
23282331
assert len(sents) == 1
2329-
assert sents == [
2330-
"Þessi [ ætti [ líka að ] ] vera rétt skilin ."
2331-
]
2332+
assert sents == ["Þessi [ ætti [ líka að ] ] vera rétt skilin ."]
23322333

23332334
# g = t.split_into_sentences("Þessi á [[líka að]] vera rétt skilin.")
23342335
# sents = list(g)
@@ -2340,11 +2341,81 @@ def gen(s: Iterable[str]) -> Iterator[str]:
23402341
# ]
23412342
# Test onesentperline
23422343

2344+
# Test whether indirect speech is split up
2345+
g = t.split_into_sentences("„Er einhver þarna?“ sagði konan.")
2346+
sents = list(g)
2347+
assert len(sents) == 1
2348+
assert sents == ["„ Er einhver þarna ? “ sagði konan ."]
2349+
2350+
g = t.split_into_sentences("„Er einhver þarna?“ Maðurinn þorði varla fram.")
2351+
sents = list(g)
2352+
assert len(sents) == 2
2353+
assert sents == ["„ Er einhver þarna ? “", "Maðurinn þorði varla fram ."]
2354+
2355+
g = t.split_into_sentences("„Hún hló,“ sagði barnið.")
2356+
sents = list(g)
2357+
assert len(sents) == 1
2358+
assert sents == ["„ Hún hló , “ sagði barnið ."]
2359+
2360+
# g = t.split_into_sentences("„Hvað meinarðu??“ sagði barnið.")
2361+
# sents = list(g)
2362+
# assert len(sents) == 1
2363+
# assert sents == ["„ Hvað meinarðu ?? “ sagði barnið ."]
2364+
23432365

23442366
def test_normalization() -> None:
2345-
toklist = list(t.tokenize('Hann sagði: "Þú ert ágæt!".'))
2346-
assert t.text_from_tokens(toklist) == 'Hann sagði : " Þú ert ágæt ! " .'
2347-
assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt ! “ ."
2367+
text, norm = get_text_and_norm('Hann sagði: "Þú ert ágæt!".')
2368+
2369+
assert text == 'Hann sagði : " Þú ert ágæt ! " .'
2370+
assert norm == "Hann sagði : „ Þú ert ágæt ! “ ."
2371+
2372+
text, norm = get_text_and_norm("Hún vinnur í fjármála-og efnahagsráðuneytinu.")
2373+
assert text == "Hún vinnur í fjármála- og efnahagsráðuneytinu ."
2374+
assert norm == "Hún vinnur í fjármála- og efnahagsráðuneytinu ."
2375+
2376+
text, norm = get_text_and_norm("Þetta er tyrfið...")
2377+
assert text == "Þetta er tyrfið ..."
2378+
assert norm == "Þetta er tyrfið …"
2379+
2380+
text, norm = get_text_and_norm("Þetta er gaman..")
2381+
assert text == "Þetta er gaman .."
2382+
assert norm == "Þetta er gaman ."
2383+
2384+
text, norm = get_text_and_norm("Þetta er hvellur.....")
2385+
assert text == "Þetta er hvellur ....."
2386+
assert norm == "Þetta er hvellur …"
2387+
2388+
text, norm = get_text_and_norm("Þetta er mergjað………")
2389+
assert text == "Þetta er mergjað ………"
2390+
assert norm == "Þetta er mergjað …"
2391+
2392+
text, norm = get_text_and_norm("Haldið var áfram [...] eftir langt hlé.")
2393+
assert text == "Haldið var áfram [...] eftir langt hlé ."
2394+
assert norm == "Haldið var áfram […] eftir langt hlé ."
2395+
2396+
text, norm = get_text_and_norm("Þetta er tyrfið,, en við höldum áfram.")
2397+
assert text == "Þetta er tyrfið ,, en við höldum áfram ."
2398+
assert norm == "Þetta er tyrfið , en við höldum áfram ."
2399+
2400+
text, norm = get_text_and_norm('Hinn svokallaði ,,Galileóhestur" hvarf.')
2401+
assert text == 'Hinn svokallaði ,, Galileóhestur " hvarf .'
2402+
assert norm == "Hinn svokallaði „ Galileóhestur “ hvarf ."
2403+
2404+
text, norm = get_text_and_norm("Mars - hin rauða pláneta - skín bjart í nótt.")
2405+
assert text == "Mars - hin rauða pláneta - skín bjart í nótt ."
2406+
assert norm == "Mars - hin rauða pláneta - skín bjart í nótt ."
2407+
2408+
text, norm = get_text_and_norm("Mars – hin rauða pláneta – skín bjart í nótt.")
2409+
assert text == "Mars – hin rauða pláneta – skín bjart í nótt ."
2410+
assert norm == "Mars - hin rauða pláneta - skín bjart í nótt ."
2411+
2412+
text, norm = get_text_and_norm("Mars — hin rauða pláneta — skín bjart í nótt.")
2413+
assert text == "Mars — hin rauða pláneta — skín bjart í nótt ."
2414+
assert norm == "Mars - hin rauða pláneta - skín bjart í nótt ."
2415+
2416+
text, norm = get_text_and_norm("Hvernig gastu gert þetta???!!!!!")
2417+
assert text == "Hvernig gastu gert þetta ???!!!!!"
2418+
assert norm == "Hvernig gastu gert þetta ?"
23482419

23492420
toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt!!??!".'))
23502421
assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt !!??! " .'

0 commit comments

Comments
 (0)