Skip to content

Commit 62ccc51

Browse files
committed
Different approach tested, not finished
1 parent 105c56c commit 62ccc51

File tree

1 file changed

+62
-32
lines changed

1 file changed

+62
-32
lines changed

src/tokenizer/tokenizer.py

Lines changed: 62 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1222,19 +1222,56 @@ def lookup(abbrev):
12221222
):
12231223
# Abbreviation ending with period: make a special token for it
12241224
# and advance the input stream
1225-
abbrev = token.txt + "."
12261225
follow_token = next(token_stream)
1226+
abbrev = token.txt + "."
12271227

1228-
if abbrev in Abbreviations.NOT_FINISHERS and could_be_end_of_sentence(follow_token, TOK.TEXT):
1229-
# This is a potential abbreviation that we don't interpret
1230-
# as such if it's at the end of a sentence
1231-
# ('dags.', 'próf.', 'mín.')
1232-
yield token
1233-
token = next_token
1234-
next_token = follow_token
1228+
# Check whether we might be at the end of a sentence, i.e.
1229+
# the following token is an end-of-sentence or end-of-paragraph,
1230+
# or uppercase (and not a month name misspelled in upper case).
1231+
1232+
if abbrev in Abbreviations.NAME_FINISHERS:
1233+
# For name finishers (such as 'próf.') we don't consider a
1234+
# following person name as an indicator of an end-of-sentence
1235+
# !!! TODO: This does not work as intended because person names
1236+
# !!! have not been recognized at this phase in the token pipeline.
1237+
# TODO JAÐAR Skoða þetta betur í jaðartilvikum.
1238+
test_set = TOK.TEXT_EXCL_PERSON
12351239
else:
1240+
test_set = TOK.TEXT
1241+
1242+
# TODO STILLING í MONTHS eru einhverjar villur eins og "septembers",
1243+
# þær þarf að vera hægt að sameina í þessa flóknari tóka en viljum
1244+
# geta merkt það sem villu. Ætti líklega að setja í sérlista,
1245+
# WRONG_MONTHS, og sérif-lykkju og setja inn villu í tókann.
1246+
finish = could_be_end_of_sentence(
1247+
follow_token, test_set, abbrev in MULTIPLIERS
1248+
)
1249+
if finish:
1250+
# Potentially at the end of a sentence
1251+
if abbrev in Abbreviations.FINISHERS:
1252+
# We see this as an abbreviation even if the next sentence
1253+
# seems to be starting just after it.
1254+
# Yield the abbreviation without a trailing dot,
1255+
# and then an 'extra' period token to end the current sentence.
1256+
token = TOK.Word(token.txt, lookup(abbrev))
1257+
yield token
1258+
# Set token to the period
1259+
token = next_token
1260+
elif abbrev in Abbreviations.NOT_FINISHERS:
1261+
# This is a potential abbreviation that we don't interpret
1262+
# as such if it's at the end of a sentence
1263+
# ('dags.', 'próf.', 'mín.')
1264+
yield token
1265+
token = next_token
1266+
else:
1267+
# Substitute the abbreviation and eat the period
1268+
token = TOK.Word(abbrev, lookup(abbrev))
1269+
else:
1270+
# 'Regular' abbreviation in the middle of a sentence:
1271+
# Eat the period and yield the abbreviation as a single token
12361272
token = TOK.Word(abbrev, lookup(abbrev))
1237-
next_token = follow_token
1273+
1274+
next_token = follow_token
12381275

12391276
# Coalesce 'klukkan'/[kl.] + time or number into a time
12401277
if next_token.kind == TOK.TIME or next_token.kind == TOK.NUMBER:
@@ -1522,7 +1559,6 @@ def parse_sentences(token_stream):
15221559
exclamation marks, etc.) """
15231560

15241561
in_sentence = False
1525-
found_end = False
15261562
token = None
15271563
tok_begin_sentence = TOK.Begin_Sentence()
15281564
tok_end_sentence = TOK.End_Sentence()
@@ -1532,11 +1568,9 @@ def parse_sentences(token_stream):
15321568
# Maintain a one-token lookahead
15331569
token = next(token_stream)
15341570
while True:
1535-
#print(token)
15361571
next_token = next(token_stream)
15371572
if token.kind == TOK.P_BEGIN or token.kind == TOK.P_END:
15381573
# Block start or end: finish the current sentence, if any
1539-
#print("\t1")
15401574
if in_sentence:
15411575
yield tok_end_sentence
15421576
in_sentence = False
@@ -1548,30 +1582,20 @@ def parse_sentences(token_stream):
15481582
token = next(token_stream)
15491583
continue
15501584
elif token.kind == TOK.X_END:
1551-
#print("\t2")
15521585
assert not in_sentence
15531586
elif token.kind == TOK.S_SPLIT:
15541587
# Empty line in input: make sure to finish the current
15551588
# sentence, if any, even if no ending punctuation has
15561589
# been encountered
1557-
#print("\t3")
15581590
if in_sentence:
15591591
yield tok_end_sentence
15601592
in_sentence = False
15611593
# Swallow the S_SPLIT token
15621594
token = next_token
15631595
continue
1564-
elif token.kind == TOK.WORD and token.txt[-1] == ".":
1565-
#print(token)
1566-
#print("\t4")
1567-
if could_be_end_of_sentence(next_token, TOK.TEXT) and in_sentence and token.txt in Abbreviations.FINISHERS:
1568-
#print("\tJá!")
1569-
found_end = True
1570-
15711596
else:
15721597
if not in_sentence:
15731598
# This token starts a new sentence
1574-
#print("\t5")
15751599
yield tok_begin_sentence
15761600
in_sentence = True
15771601
if (
@@ -1583,7 +1607,6 @@ def parse_sentences(token_stream):
15831607
)
15841608
):
15851609
# Combining punctuation ('??!!!')
1586-
#print("\t6")
15871610
while (
15881611
token.val[1] in PUNCT_COMBINATIONS
15891612
and next_token.txt in PUNCT_COMBINATIONS
@@ -1608,12 +1631,8 @@ def parse_sentences(token_stream):
16081631
yield token
16091632
token = tok_end_sentence
16101633
in_sentence = False
1611-
#print("\t7")
1634+
16121635
yield token
1613-
if found_end:
1614-
yield tok_end_sentence
1615-
in_sentence = False
1616-
found_end = False
16171636
token = next_token
16181637

16191638
except StopIteration:
@@ -1663,18 +1682,28 @@ def parse_phrases_1(token_stream):
16631682
while True:
16641683

16651684
next_token = next(token_stream)
1685+
# Coalesce abbreviations and trailing period
1686+
if token.kind == TOK.WORD and next_token.txt == ".":
1687+
abbrev = token.txt + next_token.txt
1688+
if abbrev in Abbreviations.FINISHERS:
1689+
token = TOK.Word(abbrev)
1690+
next_token = next(token_stream)
16661691

16671692
# Coalesce [year|number] + ['e.Kr.'|'f.Kr.'] into year
16681693
if token.kind == TOK.YEAR or token.kind == TOK.NUMBER:
16691694
val = token.val if token.kind == TOK.YEAR else token.val[0]
1695+
nval = ""
16701696
if next_token.txt in BCE: # f.Kr.
16711697
# Yes, we set year X BCE as year -X ;-)
1672-
token = TOK.Year(token.txt + " " + next_token.txt, -val)
1673-
next_token = next(token_stream)
1698+
nval = -val
16741699
elif next_token.txt in CE: # e.Kr.
1675-
token = TOK.Year(token.txt + " " + next_token.txt, val)
1700+
nval = val
1701+
if nval:
1702+
token = TOK.Year(token.txt + " " + next_token.txt, nval)
16761703
next_token = next(token_stream)
1677-
1704+
if next_token.txt == ".":
1705+
token = TOK.Year(token.txt + next_token.txt, nval)
1706+
next_token = next(token_stream)
16781707
# TODO: "5 mars" greinist sem dagsetning, vantar punktinn.
16791708
# Check for [number | ordinal] [month name]
16801709
if (
@@ -1752,6 +1781,7 @@ def parse_date_and_time(token_stream):
17521781
token = next(token_stream)
17531782

17541783
while True:
1784+
17551785
next_token = next(token_stream)
17561786

17571787
# TODO: "5 mars" endar sem dagsetning. Þarf að geta merkt.

0 commit comments

Comments
 (0)