257
257
])
258
258
259
259
# 'Current Era', 'Before Current Era'
260
- CE = frozenset (("e.Kr" , "e.Kr." )) # !!! Add AD and CE here?
261
- BCE = frozenset (("f.Kr" , "f.Kr." )) # !!! Add BCE here?
260
+ CE = frozenset (("e.Kr" , "e.Kr." )) # !!! Add AD and CE here?
261
+ BCE = frozenset (("f.Kr" , "f.Kr." )) # !!! Add BCE here?
262
+ CE_BCE = CE | BCE
263
+
264
+ # Supported ISO currency codes
265
+ CURRENCY_ABBREV = frozenset ((
266
+ "DKK" ,
267
+ "ISK" ,
268
+ "NOK" ,
269
+ "SEK" ,
270
+ "GBP" ,
271
+ "USD" ,
272
+ "CAD" ,
273
+ "AUD" ,
274
+ "CHF" ,
275
+ "JPY" ,
276
+ "PLN" ,
277
+ "RUB" ,
278
+ "INR" , # Indian rupee
279
+ "IDR" , # Indonesian rupiah
280
+ "CNY" ,
281
+ "RMB"
282
+ ))
262
283
263
284
# Derived unit : (base SI unit, conversion factor/function)
264
285
SI_UNITS = {
@@ -637,7 +658,8 @@ def parse_digits(w):
637
658
p = w .split ('/' )
638
659
m = int (p [1 ])
639
660
d = int (p [0 ])
640
- if p [0 ][0 ] != '0' and p [1 ][0 ] != '0' and ((d <= 5 and m <= 6 ) or (d == 1 and m <= 10 )):
661
+ if (p [0 ][0 ] != '0' and p [1 ][0 ] != '0' and
662
+ ((d <= 5 and m <= 6 ) or (d == 1 and m <= 10 ))):
641
663
# This is probably a fraction, not a date
642
664
# (1/2, 1/3, 1/4, 1/5, 1/6, 2/3, 2/5, 5/6 etc.)
643
665
# Return a number
@@ -816,7 +838,8 @@ def parse_tokens(txt):
816
838
# so they won't be caught by the isalpha() check below)
817
839
yield TOK .Word (w , None )
818
840
w = ""
819
- if w and (w .startswith ("http://" ) or w .startswith ("https://" ) or w .startswith ("www." )):
841
+ if w and (w .startswith ("http://" ) or
842
+ w .startswith ("https://" ) or w .startswith ("www." )):
820
843
# Handle URL: cut RIGHT_PUNCTUATION characters off its end,
821
844
# even though many of them are actually allowed according to
822
845
# the IETF RFC
@@ -832,7 +855,8 @@ def parse_tokens(txt):
832
855
ate = True
833
856
i = 1
834
857
lw = len (w )
835
- while i < lw and (w [i ].isalpha () or (w [i ] in PUNCT_INSIDE_WORD and (i + 1 == lw or w [i + 1 ].isalpha ()))):
858
+ while i < lw and (w [i ].isalpha () or
859
+ (w [i ] in PUNCT_INSIDE_WORD and (i + 1 == lw or w [i + 1 ].isalpha ()))):
836
860
# We allow dots to occur inside words in the case of
837
861
# abbreviations; also apostrophes are allowed within words and at the end
838
862
# (O'Malley, Mary's, it's, childrens', O‘Donnell)
@@ -929,21 +953,24 @@ def lookup(abbrev):
929
953
if token .kind == TOK .PUNCTUATION and token .txt == '$' and \
930
954
next_token .kind == TOK .NUMBER :
931
955
932
- token = TOK .Amount (token .txt + next_token .txt , "USD" , next_token .val [0 ])
956
+ token = TOK .Amount (token .txt + next_token .txt ,
957
+ "USD" , next_token .val [0 ])
933
958
next_token = next (token_stream )
934
959
935
960
# Check for €[number]
936
961
if token .kind == TOK .PUNCTUATION and token .txt == '€' and \
937
962
next_token .kind == TOK .NUMBER :
938
963
939
- token = TOK .Amount (token .txt + next_token .txt , "EUR" , next_token .val [0 ])
964
+ token = TOK .Amount (token .txt + next_token .txt ,
965
+ "EUR" , next_token .val [0 ])
940
966
next_token = next (token_stream )
941
967
942
968
# Coalesce abbreviations ending with a period into a single
943
969
# abbreviation token
944
970
if next_token .kind == TOK .PUNCTUATION and next_token .txt == '.' :
945
971
946
- if token .kind == TOK .WORD and token .txt [- 1 ] != '.' and is_abbr_with_period (token .txt ):
972
+ if (token .kind == TOK .WORD and token .txt [- 1 ] != '.' and
973
+ is_abbr_with_period (token .txt )):
947
974
# Abbreviation ending with period: make a special token for it
948
975
# and advance the input stream
949
976
@@ -969,7 +996,8 @@ def lookup(abbrev):
969
996
(follow_token .kind in test_set and
970
997
follow_token .txt [0 ].isupper () and
971
998
follow_token .txt .lower () not in MONTHS and
972
- not RE_ROMAN_NUMERAL .match (follow_token .txt )
999
+ not RE_ROMAN_NUMERAL .match (follow_token .txt ) and
1000
+ not (abbrev in MULTIPLIERS and follow_token .txt in CURRENCY_ABBREV )
973
1001
)
974
1002
)
975
1003
@@ -1000,11 +1028,13 @@ def lookup(abbrev):
1000
1028
1001
1029
# Coalesce 'klukkan'/[kl.] + time or number into a time
1002
1030
if next_token .kind == TOK .TIME or next_token .kind == TOK .NUMBER :
1003
- if clock or (token .kind == TOK .WORD and token .txt .lower () == CLOCK_WORD ):
1031
+ if clock or (token .kind == TOK .WORD and
1032
+ token .txt .lower () == CLOCK_WORD ):
1004
1033
# Match: coalesce and step to next token
1005
1034
txt = CLOCK_ABBREV + "." if clock else token .txt
1006
1035
if next_token .kind == TOK .NUMBER :
1007
- token = TOK .Time (txt + " " + next_token .txt , next_token .val [0 ], 0 , 0 )
1036
+ token = TOK .Time (txt + " " + next_token .txt ,
1037
+ next_token .val [0 ], 0 , 0 )
1008
1038
else :
1009
1039
# next_token.kind is TOK.TIME
1010
1040
token = TOK .Time (txt + " " + next_token .txt ,
@@ -1013,19 +1043,21 @@ def lookup(abbrev):
1013
1043
1014
1044
# Coalesce 'klukkan/kl. átta/hálfátta' into a time
1015
1045
elif next_token .txt in CLOCK_NUMBERS :
1016
- if clock or (token .kind == TOK .WORD and token .txt .lower () == CLOCK_WORD ):
1046
+ if clock or (token .kind == TOK .WORD and
1047
+ token .txt .lower () == CLOCK_WORD ):
1017
1048
txt = CLOCK_ABBREV + "." if clock else token .txt
1018
1049
# Match: coalesce and step to next token
1019
- token = TOK .Time (txt + " " + next_token .txt , * CLOCK_NUMBERS [next_token .txt ])
1050
+ token = TOK .Time (txt + " " + next_token .txt ,
1051
+ * CLOCK_NUMBERS [next_token .txt ])
1020
1052
next_token = next (token_stream )
1021
1053
1022
1054
# Words like 'hálftólf' only used in temporal expressions so can stand alone
1023
1055
if token .txt in CLOCK_HALF :
1024
1056
token = TOK .Time (token .txt , * CLOCK_NUMBERS [token .txt ])
1025
1057
1026
1058
# Coalesce 'árið' + [year|number] into year
1027
- if (token .kind == TOK .WORD and token .txt .lower () in YEAR_WORD ) and \
1028
- (next_token .kind == TOK .YEAR or next_token .kind == TOK .NUMBER ):
1059
+ if (( token .kind == TOK .WORD and token .txt .lower () in YEAR_WORD ) and
1060
+ (next_token .kind == TOK .YEAR or next_token .kind == TOK .NUMBER )) :
1029
1061
token = TOK .Year (token .txt + " " + next_token .txt ,
1030
1062
next_token .val if next_token .kind == TOK .YEAR else next_token .val [0 ])
1031
1063
next_token = next (token_stream )
@@ -1040,14 +1072,14 @@ def lookup(abbrev):
1040
1072
1041
1073
# Coalesce ordinals (1. = first, 2. = second...) into a single token
1042
1074
if next_token .kind == TOK .PUNCTUATION and next_token .txt == '.' :
1043
- if (token .kind == TOK .NUMBER and not ('.' in token .txt or ',' in token .txt )) or \
1044
- (token .kind == TOK .WORD and RE_ROMAN_NUMERAL .match (token .txt )):
1075
+ if (( token .kind == TOK .NUMBER and not ('.' in token .txt or ',' in token .txt )) or
1076
+ (token .kind == TOK .WORD and RE_ROMAN_NUMERAL .match (token .txt ))) :
1045
1077
# Ordinal, i.e. whole number or Roman numeral followed by period: convert to an ordinal token
1046
1078
follow_token = next (token_stream )
1047
- if follow_token .kind in TOK .END or \
1048
- (follow_token .kind == TOK .PUNCTUATION and follow_token .txt in {'„' , '"' }) or \
1079
+ if ( follow_token .kind in TOK .END or
1080
+ (follow_token .kind == TOK .PUNCTUATION and follow_token .txt in {'„' , '"' }) or
1049
1081
(follow_token .kind == TOK .WORD and follow_token .txt [0 ].isupper () and
1050
- follow_token .txt .lower () not in MONTHS ):
1082
+ follow_token .txt .lower () not in MONTHS )) :
1051
1083
# Next token is a sentence or paragraph end,
1052
1084
# or opening quotes,
1053
1085
# or an uppercase word (and not a month name misspelled in upper case):
@@ -1123,7 +1155,8 @@ def parse_sentences(token_stream):
1123
1155
if token .kind == TOK .PUNCTUATION and token .txt in END_OF_SENTENCE :
1124
1156
# We may be finishing a sentence with not only a period but also
1125
1157
# right parenthesis and quotation marks
1126
- while next_token .kind == TOK .PUNCTUATION and next_token .txt in SENTENCE_FINISHERS :
1158
+ while (next_token .kind == TOK .PUNCTUATION and
1159
+ next_token .txt in SENTENCE_FINISHERS ):
1127
1160
yield token
1128
1161
token = next_token
1129
1162
next_token = next (token_stream )
@@ -1198,9 +1231,12 @@ def parse_sentences(token_stream):
1198
1231
"þús." : 1000 ,
1199
1232
"milljón" : 1e6 ,
1200
1233
"milla" : 1e6 ,
1234
+ "millj." : 1e6 ,
1235
+ "mljó." : 1e6 ,
1201
1236
"milljarður" : 1e9 ,
1202
1237
"miljarður" : 1e9 ,
1203
- "ma." : 1e9
1238
+ "ma." : 1e9 ,
1239
+ "mrð." : 1e9
1204
1240
}
1205
1241
1206
1242
# Recognize words for percentages
@@ -1214,15 +1250,26 @@ def parse_sentences(token_stream):
1214
1250
# Amount abbreviations including 'kr' for the ISK
1215
1251
# Corresponding abbreviations are found in Abbrev.conf
1216
1252
AMOUNT_ABBREV = {
1253
+ "kr" : 1 ,
1254
+ "kr." : 1 ,
1217
1255
"þ.kr." : 1e3 ,
1256
+ "þ.kr" : 1e3 ,
1218
1257
"þús.kr." : 1e3 ,
1258
+ "þús.kr" : 1e3 ,
1219
1259
"m.kr." : 1e6 ,
1260
+ "m.kr" : 1e6 ,
1220
1261
"mkr." : 1e6 ,
1262
+ "mkr" : 1e6 ,
1221
1263
"millj.kr." : 1e6 ,
1264
+ "millj.kr" : 1e6 ,
1222
1265
"mljó.kr." : 1e6 ,
1266
+ "mljó.kr" : 1e6 ,
1223
1267
"ma.kr." : 1e9 ,
1268
+ "ma.kr" : 1e9 ,
1224
1269
"mö.kr." : 1e9 ,
1225
- "mlja.kr." : 1e9
1270
+ "mö.kr" : 1e9 ,
1271
+ "mlja.kr." : 1e9 ,
1272
+ "mlja.kr" : 1e9
1226
1273
}
1227
1274
1228
1275
@@ -1248,16 +1295,17 @@ def parse_phrases_1(token_stream):
1248
1295
# Coalesce [year|number] + ['e.Kr.'|'f.Kr.'] into year
1249
1296
if token .kind == TOK .YEAR or token .kind == TOK .NUMBER :
1250
1297
val = token .val if token .kind == TOK .YEAR else token .val [0 ]
1251
- if next_token .txt in BCE : # f.Kr.
1298
+ if next_token .txt in BCE : # f.Kr.
1252
1299
# Yes, we set year X BCE as year -X ;-)
1253
1300
token = TOK .Year (token .txt + " " + next_token .txt , - val )
1254
1301
next_token = next (token_stream )
1255
- elif next_token .txt in CE : # e.Kr.
1302
+ elif next_token .txt in CE : # e.Kr.
1256
1303
token = TOK .Year (token .txt + " " + next_token .txt , val )
1257
1304
next_token = next (token_stream )
1258
1305
1259
1306
# Check for [number | ordinal] [month name]
1260
- if (token .kind == TOK .ORDINAL or token .kind == TOK .NUMBER ) and next_token .kind == TOK .WORD :
1307
+ if ((token .kind == TOK .ORDINAL or token .kind == TOK .NUMBER ) and
1308
+ next_token .kind == TOK .WORD ):
1261
1309
1262
1310
month = match_stem_list (next_token , MONTHS )
1263
1311
if month is not None :
@@ -1313,8 +1361,10 @@ def parse_date_and_time(token_stream):
1313
1361
1314
1362
# DATEABS and DATEREL made
1315
1363
# Check for [number | ordinal] [month name]
1316
- if (token .kind == TOK .ORDINAL or token .kind == TOK .NUMBER or
1317
- (token .txt and token .txt .lower () in DAYS_OF_MONTH )) and next_token .kind == TOK .WORD :
1364
+ if ((token .kind == TOK .ORDINAL or token .kind == TOK .NUMBER or
1365
+ (token .txt and token .txt .lower () in DAYS_OF_MONTH )) and
1366
+ next_token .kind == TOK .WORD ):
1367
+
1318
1368
month = match_stem_list (next_token , MONTHS )
1319
1369
if month is not None :
1320
1370
token = TOK .Date (token .txt + " " + next_token .txt ,
@@ -1348,7 +1398,8 @@ def parse_date_and_time(token_stream):
1348
1398
else next_token .val [0 ] if 1776 <= next_token .val [0 ] <= 2100
1349
1399
else 0 )
1350
1400
if year != 0 :
1351
- token = TOK .Date (token .txt + " " + next_token .txt , y = year , m = month , d = 0 )
1401
+ token = TOK .Date (token .txt + " " + next_token .txt ,
1402
+ y = year , m = month , d = 0 )
1352
1403
# Eat the year token
1353
1404
next_token = next (token_stream )
1354
1405
@@ -1365,9 +1416,11 @@ def parse_date_and_time(token_stream):
1365
1416
# Split DATE into DATEABS and DATEREL
1366
1417
if token .kind == TOK .DATE :
1367
1418
if token .val [0 ] and token .val [1 ] and token .val [2 ]:
1368
- token = TOK .Dateabs (token .txt , y = token .val [0 ], m = token .val [1 ], d = token .val [2 ])
1419
+ token = TOK .Dateabs (token .txt ,
1420
+ y = token .val [0 ], m = token .val [1 ], d = token .val [2 ])
1369
1421
else :
1370
- token = TOK .Daterel (token .txt , y = token .val [0 ], m = token .val [1 ], d = token .val [2 ])
1422
+ token = TOK .Daterel (token .txt ,
1423
+ y = token .val [0 ], m = token .val [1 ], d = token .val [2 ])
1371
1424
1372
1425
# Split TIMESTAMP into TIMESTAMPABS and TIMESTAMPREL
1373
1426
if token .kind == TOK .TIMESTAMP :
@@ -1379,12 +1432,13 @@ def parse_date_and_time(token_stream):
1379
1432
1380
1433
# Swallow "e.Kr." and "f.Kr." postfixes
1381
1434
if token .kind == TOK .DATEABS :
1382
- if next_token .kind == TOK .WORD and next_token .txt in { "e.Kr." , "e.Kr" , "f.Kr." , "f.Kr" } :
1435
+ if next_token .kind == TOK .WORD and next_token .txt in CE_BCE :
1383
1436
y = token .val [0 ]
1384
- if next_token .txt in { "f.Kr." , "f.Kr" } :
1437
+ if next_token .txt in BCE :
1385
1438
# Change year to negative number
1386
1439
y = - y
1387
- token = TOK .Dateabs (token .txt + " " + next_token .txt , y = y , m = token .val [1 ], d = token .val [2 ])
1440
+ token = TOK .Dateabs (token .txt + " " + next_token .txt ,
1441
+ y = y , m = token .val [1 ], d = token .val [2 ])
1388
1442
# Swallow the postfix
1389
1443
next_token = next (token_stream )
1390
1444
@@ -1473,6 +1527,11 @@ def convert_to_num(token):
1473
1527
token = TOK .Amount (token .txt + " " + next_token .txt , "ISK" ,
1474
1528
token .val [0 ] * AMOUNT_ABBREV [next_token .txt ])
1475
1529
next_token = next (token_stream )
1530
+ elif next_token .txt in CURRENCY_ABBREV :
1531
+ # A number followed by an ISO currency abbreviation
1532
+ token = TOK .Amount (token .txt + " " + next_token .txt , next_token .txt ,
1533
+ token .val [0 ])
1534
+ next_token = next (token_stream )
1476
1535
else :
1477
1536
# Check for [number] 'percent'
1478
1537
percentage = match_stem_list (next_token , PERCENTAGES )
@@ -1490,11 +1549,13 @@ def convert_to_num(token):
1490
1549
# 'stjórnskipunar- og eftirlitsnefnd'
1491
1550
# 'viðskipta- og iðnaðarráðherra'
1492
1551
# 'marg-ítrekaðri'
1493
- if token .kind == TOK .WORD and \
1494
- next_token .kind == TOK .PUNCTUATION and next_token .txt == COMPOSITE_HYPHEN :
1552
+ if (token .kind == TOK .WORD and
1553
+ next_token .kind == TOK .PUNCTUATION and
1554
+ next_token .txt == COMPOSITE_HYPHEN ):
1495
1555
1496
1556
og_token = next (token_stream )
1497
- if og_token .kind != TOK .WORD or (og_token .txt != "og" and og_token .txt != "eða" ):
1557
+ if (og_token .kind != TOK .WORD or
1558
+ (og_token .txt != "og" and og_token .txt != "eða" )):
1498
1559
# Incorrect prediction: make amends and continue
1499
1560
handled = False
1500
1561
if og_token .kind == TOK .WORD :
@@ -1524,8 +1585,8 @@ def convert_to_num(token):
1524
1585
# the last word, but an amalgamated token text.
1525
1586
# Note: there is no meaning check for the first
1526
1587
# part of the composition, so it can be an unknown word.
1527
- txt = token .txt + "- " + og_token .txt + \
1528
- " " + final_token .txt
1588
+ txt = ( token .txt + "- " + og_token .txt +
1589
+ " " + final_token .txt )
1529
1590
token = TOK .Word (txt )
1530
1591
next_token = next (token_stream )
1531
1592
@@ -1636,6 +1697,7 @@ def valid_sent(sent):
1636
1697
)
1637
1698
RE_SPLIT = re .compile (RE_SPLIT_STR )
1638
1699
1700
+
1639
1701
def correct_spaces (s ):
1640
1702
""" Utility function to split and re-compose a string with correct spacing between tokens"""
1641
1703
r = []
0 commit comments