@@ -1222,19 +1222,56 @@ def lookup(abbrev):
1222
1222
):
1223
1223
# Abbreviation ending with period: make a special token for it
1224
1224
# and advance the input stream
1225
- abbrev = token .txt + "."
1226
1225
follow_token = next (token_stream )
1226
+ abbrev = token .txt + "."
1227
1227
1228
- if abbrev in Abbreviations .NOT_FINISHERS and could_be_end_of_sentence (follow_token , TOK .TEXT ):
1229
- # This is a potential abbreviation that we dont interpret
1230
- # as such if it's at the end of a sentence
1231
- # ('dags.', 'próf.', 'mín.')
1232
- yield token
1233
- token = next_token
1234
- next_token = follow_token
1228
+ # Check whether we might be at the end of a sentence, i.e.
1229
+ # the following token is an end-of-sentence or end-of-paragraph,
1230
+ # or uppercase (and not a month name misspelled in upper case).
1231
+
1232
+ if abbrev in Abbreviations .NAME_FINISHERS :
1233
+ # For name finishers (such as 'próf.') we don't consider a
1234
+ # following person name as an indicator of an end-of-sentence
1235
+ # !!! TODO: This does not work as intended because person names
1236
+ # !!! have not been recognized at this phase in the token pipeline.
1237
+ # TODO JAÐAR: Examine more closely how this behaves in edge cases.
1238
+ test_set = TOK .TEXT_EXCL_PERSON
1235
1239
else :
1240
+ test_set = TOK .TEXT
1241
+
1242
+ # TODO STILLING: MONTHS contains some misspellings such as "septembers";
1243
+ # it must be possible to merge those into these more complex tokens, but
1244
+ # we want to be able to flag them as errors. They should probably go in a
1245
+ # separate list, WRONG_MONTHS, with a separate if-branch that adds an error to the token.
1246
+ finish = could_be_end_of_sentence (
1247
+ follow_token , test_set , abbrev in MULTIPLIERS
1248
+ )
1249
+ if finish :
1250
+ # Potentially at the end of a sentence
1251
+ if abbrev in Abbreviations .FINISHERS :
1252
+ # We see this as an abbreviation even if the next sentence
1253
+ # seems to be starting just after it.
1254
+ # Yield the abbreviation without a trailing dot,
1255
+ # and then an 'extra' period token to end the current sentence.
1256
+ token = TOK .Word (token .txt , lookup (abbrev ))
1257
+ yield token
1258
+ # Set token to the period
1259
+ token = next_token
1260
+ elif abbrev in Abbreviations .NOT_FINISHERS :
1261
+ # This is a potential abbreviation that we don't interpret
1262
+ # as such if it's at the end of a sentence
1263
+ # ('dags.', 'próf.', 'mín.')
1264
+ yield token
1265
+ token = next_token
1266
+ else :
1267
+ # Substitute the abbreviation and eat the period
1268
+ token = TOK .Word (abbrev , lookup (abbrev ))
1269
+ else :
1270
+ # 'Regular' abbreviation in the middle of a sentence:
1271
+ # Eat the period and yield the abbreviation as a single token
1236
1272
token = TOK .Word (abbrev , lookup (abbrev ))
1237
- next_token = follow_token
1273
+
1274
+ next_token = follow_token
1238
1275
1239
1276
# Coalesce 'klukkan'/[kl.] + time or number into a time
1240
1277
if next_token .kind == TOK .TIME or next_token .kind == TOK .NUMBER :
@@ -1522,7 +1559,6 @@ def parse_sentences(token_stream):
1522
1559
exclamation marks, etc.) """
1523
1560
1524
1561
in_sentence = False
1525
- found_end = False
1526
1562
token = None
1527
1563
tok_begin_sentence = TOK .Begin_Sentence ()
1528
1564
tok_end_sentence = TOK .End_Sentence ()
@@ -1532,11 +1568,9 @@ def parse_sentences(token_stream):
1532
1568
# Maintain a one-token lookahead
1533
1569
token = next (token_stream )
1534
1570
while True :
1535
- #print(token)
1536
1571
next_token = next (token_stream )
1537
1572
if token .kind == TOK .P_BEGIN or token .kind == TOK .P_END :
1538
1573
# Block start or end: finish the current sentence, if any
1539
- #print("\t1")
1540
1574
if in_sentence :
1541
1575
yield tok_end_sentence
1542
1576
in_sentence = False
@@ -1548,30 +1582,20 @@ def parse_sentences(token_stream):
1548
1582
token = next (token_stream )
1549
1583
continue
1550
1584
elif token .kind == TOK .X_END :
1551
- #print("\t2")
1552
1585
assert not in_sentence
1553
1586
elif token .kind == TOK .S_SPLIT :
1554
1587
# Empty line in input: make sure to finish the current
1555
1588
# sentence, if any, even if no ending punctuation has
1556
1589
# been encountered
1557
- #print("\t3")
1558
1590
if in_sentence :
1559
1591
yield tok_end_sentence
1560
1592
in_sentence = False
1561
1593
# Swallow the S_SPLIT token
1562
1594
token = next_token
1563
1595
continue
1564
- elif token .kind == TOK .WORD and token .txt [- 1 ] == "." :
1565
- #print(token)
1566
- #print("\t4")
1567
- if could_be_end_of_sentence (next_token , TOK .TEXT ) and in_sentence and token .txt in Abbreviations .FINISHERS :
1568
- #print("\tJá!")
1569
- found_end = True
1570
-
1571
1596
else :
1572
1597
if not in_sentence :
1573
1598
# This token starts a new sentence
1574
- #print("\t5")
1575
1599
yield tok_begin_sentence
1576
1600
in_sentence = True
1577
1601
if (
@@ -1583,7 +1607,6 @@ def parse_sentences(token_stream):
1583
1607
)
1584
1608
):
1585
1609
# Combining punctuation ('??!!!')
1586
- #print("\t6")
1587
1610
while (
1588
1611
token .val [1 ] in PUNCT_COMBINATIONS
1589
1612
and next_token .txt in PUNCT_COMBINATIONS
@@ -1608,12 +1631,8 @@ def parse_sentences(token_stream):
1608
1631
yield token
1609
1632
token = tok_end_sentence
1610
1633
in_sentence = False
1611
- #print("\t7")
1634
+
1612
1635
yield token
1613
- if found_end :
1614
- yield tok_end_sentence
1615
- in_sentence = False
1616
- found_end = False
1617
1636
token = next_token
1618
1637
1619
1638
except StopIteration :
@@ -1663,18 +1682,28 @@ def parse_phrases_1(token_stream):
1663
1682
while True :
1664
1683
1665
1684
next_token = next (token_stream )
1685
+ # Coalesce abbreviations and trailing period
1686
+ if token .kind == TOK .WORD and next_token .txt == "." :
1687
+ abbrev = token .txt + next_token .txt
1688
+ if abbrev in Abbreviations .FINISHERS :
1689
+ token = TOK .Word (abbrev )
1690
+ next_token = next (token_stream )
1666
1691
1667
1692
# Coalesce [year|number] + ['e.Kr.'|'f.Kr.'] into year
1668
1693
if token .kind == TOK .YEAR or token .kind == TOK .NUMBER :
1669
1694
val = token .val if token .kind == TOK .YEAR else token .val [0 ]
1695
+ nval = ""
1670
1696
if next_token .txt in BCE : # f.Kr.
1671
1697
# Yes, we set year X BCE as year -X ;-)
1672
- token = TOK .Year (token .txt + " " + next_token .txt , - val )
1673
- next_token = next (token_stream )
1698
+ nval = - val
1674
1699
elif next_token .txt in CE : # e.Kr.
1675
- token = TOK .Year (token .txt + " " + next_token .txt , val )
1700
+ nval = val
1701
+ if nval :
1702
+ token = TOK .Year (token .txt + " " + next_token .txt , nval )
1676
1703
next_token = next (token_stream )
1677
-
1704
+ if next_token .txt == "." :
1705
+ token = TOK .Year (token .txt + next_token .txt , nval )
1706
+ next_token = next (token_stream )
1678
1707
# TODO: "5 mars" is parsed as a date even though the period is missing.
1679
1708
# Check for [number | ordinal] [month name]
1680
1709
if (
@@ -1752,6 +1781,7 @@ def parse_date_and_time(token_stream):
1752
1781
token = next (token_stream )
1753
1782
1754
1783
while True :
1784
+
1755
1785
next_token = next (token_stream )
1756
1786
1757
1787
# TODO: "5 mars" ends up as a date. We need to be able to flag this.
0 commit comments