Skip to content

Commit b430e86

Browse files
Fixed handling of spelled-out amounts, especially with ISO currency codes
1 parent 44dee4f commit b430e86

File tree

3 files changed

+135
-59
lines changed

3 files changed

+135
-59
lines changed

src/tokenizer/Abbrev.conf

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,6 @@ nr. = "númer" hk
213213
ca. = "circa" hk
214214
ath. = "athuga" so
215215
kl. = "klukkan" ao
216-
kr.* = "krónur" kvk
217-
þús.* = "þúsund" hk
218-
millj.* = "milljónir" kvk # Í einhverjum tilvikum milljarðar?
219-
ma.* = "milljarðar" kk
220216
klst.* = "klukkustund" kvk
221217
mín.! = "mínúta" kvk # !!! Rekst á orðið 'mín' sem er oft í enda málsgreinar
222218
sek.! = "sekúnda" kvk # !!! Rekst á orðið 'sek'
@@ -402,15 +398,22 @@ CFC = "Controlled Foreign Corporation" hk erl
402398

403399
# Sjá lista í tokenizer.py (AMOUNT_ABBREV)
404400

401+
kr.* = "krónur" kvk
402+
þús.* = "þúsund" hk
403+
millj.* = "milljónir" kvk # Í einhverjum tilvikum milljarðar?
404+
mljó.* = "milljónir" kk
405+
ma.* = "milljarðar" kk
406+
mrð.* = "milljarðar" kk
407+
mlja.* = "milljarðar" kk
408+
405409
þ.kr.* = "þúsundir króna" kvk
406410
þús.kr.* = "þúsundir króna" kvk
407411
m.kr.* = "milljónir króna" kvk
408412
mkr.* = "milljónir króna" kvk
409413
millj.kr.* = "milljónir króna" kvk
410-
mljó.* = "milljónir" kk
411414
ma.kr.* = "milljarðar króna" kk
412415
mö.kr.* = "milljörðum króna" kk
413-
mlja.* = "milljarðar" kk
416+
mrð.kr.* = "milljarðar króna" kk
414417

415418
A. = "A"
416419
Á. = "Á"

src/tokenizer/tokenizer.py

Lines changed: 101 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,29 @@
257257
])
258258

259259
# 'Current Era', 'Before Current Era'
260-
CE = frozenset(("e.Kr", "e.Kr.")) # !!! Add AD and CE here?
261-
BCE = frozenset(("f.Kr", "f.Kr.")) # !!! Add BCE here?
260+
CE = frozenset(("e.Kr", "e.Kr.")) # !!! Add AD and CE here?
261+
BCE = frozenset(("f.Kr", "f.Kr.")) # !!! Add BCE here?
262+
CE_BCE = CE | BCE
263+
264+
# Supported ISO currency codes
265+
CURRENCY_ABBREV = frozenset((
266+
"DKK",
267+
"ISK",
268+
"NOK",
269+
"SEK",
270+
"GBP",
271+
"USD",
272+
"CAD",
273+
"AUD",
274+
"CHF",
275+
"JPY",
276+
"PLN",
277+
"RUB",
278+
"INR", # Indian rupee
279+
"IDR", # Indonesian rupiah
280+
"CNY",
281+
"RMB"
282+
))
262283

263284
# Derived unit : (base SI unit, conversion factor/function)
264285
SI_UNITS = {
@@ -637,7 +658,8 @@ def parse_digits(w):
637658
p = w.split('/')
638659
m = int(p[1])
639660
d = int(p[0])
640-
if p[0][0] != '0' and p[1][0] != '0' and ((d <= 5 and m <= 6) or (d == 1 and m <= 10)):
661+
if (p[0][0] != '0' and p[1][0] != '0' and
662+
((d <= 5 and m <= 6) or (d == 1 and m <= 10))):
641663
# This is probably a fraction, not a date
642664
# (1/2, 1/3, 1/4, 1/5, 1/6, 2/3, 2/5, 5/6 etc.)
643665
# Return a number
@@ -816,7 +838,8 @@ def parse_tokens(txt):
816838
# so they won't be caught by the isalpha() check below)
817839
yield TOK.Word(w, None)
818840
w = ""
819-
if w and (w.startswith("http://") or w.startswith("https://") or w.startswith("www.")):
841+
if w and (w.startswith("http://") or
842+
w.startswith("https://") or w.startswith("www.")):
820843
# Handle URL: cut RIGHT_PUNCTUATION characters off its end,
821844
# even though many of them are actually allowed according to
822845
# the IETF RFC
@@ -832,7 +855,8 @@ def parse_tokens(txt):
832855
ate = True
833856
i = 1
834857
lw = len(w)
835-
while i < lw and (w[i].isalpha() or (w[i] in PUNCT_INSIDE_WORD and (i+1 == lw or w[i+1].isalpha()))):
858+
while i < lw and (w[i].isalpha() or
859+
(w[i] in PUNCT_INSIDE_WORD and (i+1 == lw or w[i+1].isalpha()))):
836860
# We allow dots to occur inside words in the case of
837861
# abbreviations; also apostrophes are allowed within words and at the end
838862
# (O'Malley, Mary's, it's, childrens', O‘Donnell)
@@ -929,21 +953,24 @@ def lookup(abbrev):
929953
if token.kind == TOK.PUNCTUATION and token.txt == '$' and \
930954
next_token.kind == TOK.NUMBER:
931955

932-
token = TOK.Amount(token.txt + next_token.txt, "USD", next_token.val[0])
956+
token = TOK.Amount(token.txt + next_token.txt,
957+
"USD", next_token.val[0])
933958
next_token = next(token_stream)
934959

935960
# Check for €[number]
936961
if token.kind == TOK.PUNCTUATION and token.txt == '€' and \
937962
next_token.kind == TOK.NUMBER:
938963

939-
token = TOK.Amount(token.txt + next_token.txt, "EUR", next_token.val[0])
964+
token = TOK.Amount(token.txt + next_token.txt,
965+
"EUR", next_token.val[0])
940966
next_token = next(token_stream)
941967

942968
# Coalesce abbreviations ending with a period into a single
943969
# abbreviation token
944970
if next_token.kind == TOK.PUNCTUATION and next_token.txt == '.':
945971

946-
if token.kind == TOK.WORD and token.txt[-1] != '.' and is_abbr_with_period(token.txt):
972+
if (token.kind == TOK.WORD and token.txt[-1] != '.' and
973+
is_abbr_with_period(token.txt)):
947974
# Abbreviation ending with period: make a special token for it
948975
# and advance the input stream
949976

@@ -969,7 +996,8 @@ def lookup(abbrev):
969996
(follow_token.kind in test_set and
970997
follow_token.txt[0].isupper() and
971998
follow_token.txt.lower() not in MONTHS and
972-
not RE_ROMAN_NUMERAL.match(follow_token.txt)
999+
not RE_ROMAN_NUMERAL.match(follow_token.txt) and
1000+
not (abbrev in MULTIPLIERS and follow_token.txt in CURRENCY_ABBREV)
9731001
)
9741002
)
9751003

@@ -1000,11 +1028,13 @@ def lookup(abbrev):
10001028

10011029
# Coalesce 'klukkan'/[kl.] + time or number into a time
10021030
if next_token.kind == TOK.TIME or next_token.kind == TOK.NUMBER:
1003-
if clock or (token.kind == TOK.WORD and token.txt.lower() == CLOCK_WORD):
1031+
if clock or (token.kind == TOK.WORD and
1032+
token.txt.lower() == CLOCK_WORD):
10041033
# Match: coalesce and step to next token
10051034
txt = CLOCK_ABBREV + "." if clock else token.txt
10061035
if next_token.kind == TOK.NUMBER:
1007-
token = TOK.Time(txt + " " + next_token.txt, next_token.val[0], 0, 0)
1036+
token = TOK.Time(txt + " " + next_token.txt,
1037+
next_token.val[0], 0, 0)
10081038
else:
10091039
# next_token.kind is TOK.TIME
10101040
token = TOK.Time(txt + " " + next_token.txt,
@@ -1013,19 +1043,21 @@ def lookup(abbrev):
10131043

10141044
# Coalesce 'klukkan/kl. átta/hálfátta' into a time
10151045
elif next_token.txt in CLOCK_NUMBERS:
1016-
if clock or (token.kind == TOK.WORD and token.txt.lower() == CLOCK_WORD):
1046+
if clock or (token.kind == TOK.WORD and
1047+
token.txt.lower() == CLOCK_WORD):
10171048
txt = CLOCK_ABBREV + "." if clock else token.txt
10181049
# Match: coalesce and step to next token
1019-
token = TOK.Time(txt + " " + next_token.txt, *CLOCK_NUMBERS[next_token.txt])
1050+
token = TOK.Time(txt + " " + next_token.txt,
1051+
*CLOCK_NUMBERS[next_token.txt])
10201052
next_token = next(token_stream)
10211053

10221054
# Words like 'hálftólf' are only used in temporal expressions, so they can stand alone
10231055
if token.txt in CLOCK_HALF:
10241056
token = TOK.Time(token.txt, *CLOCK_NUMBERS[token.txt])
10251057

10261058
# Coalesce 'árið' + [year|number] into year
1027-
if (token.kind == TOK.WORD and token.txt.lower() in YEAR_WORD) and \
1028-
(next_token.kind == TOK.YEAR or next_token.kind == TOK.NUMBER):
1059+
if ((token.kind == TOK.WORD and token.txt.lower() in YEAR_WORD) and
1060+
(next_token.kind == TOK.YEAR or next_token.kind == TOK.NUMBER)):
10291061
token = TOK.Year(token.txt + " " + next_token.txt,
10301062
next_token.val if next_token.kind == TOK.YEAR else next_token.val[0])
10311063
next_token = next(token_stream)
@@ -1040,14 +1072,14 @@ def lookup(abbrev):
10401072

10411073
# Coalesce ordinals (1. = first, 2. = second...) into a single token
10421074
if next_token.kind == TOK.PUNCTUATION and next_token.txt == '.':
1043-
if (token.kind == TOK.NUMBER and not ('.' in token.txt or ',' in token.txt)) or \
1044-
(token.kind == TOK.WORD and RE_ROMAN_NUMERAL.match(token.txt)):
1075+
if ((token.kind == TOK.NUMBER and not ('.' in token.txt or ',' in token.txt)) or
1076+
(token.kind == TOK.WORD and RE_ROMAN_NUMERAL.match(token.txt))):
10451077
# Ordinal, i.e. whole number or Roman numeral followed by period: convert to an ordinal token
10461078
follow_token = next(token_stream)
1047-
if follow_token.kind in TOK.END or \
1048-
(follow_token.kind == TOK.PUNCTUATION and follow_token.txt in {'„', '"'}) or \
1079+
if (follow_token.kind in TOK.END or
1080+
(follow_token.kind == TOK.PUNCTUATION and follow_token.txt in {'„', '"'}) or
10491081
(follow_token.kind == TOK.WORD and follow_token.txt[0].isupper() and
1050-
follow_token.txt.lower() not in MONTHS):
1082+
follow_token.txt.lower() not in MONTHS)):
10511083
# Next token is a sentence or paragraph end,
10521084
# or opening quotes,
10531085
# or an uppercase word (and not a month name misspelled in upper case):
@@ -1123,7 +1155,8 @@ def parse_sentences(token_stream):
11231155
if token.kind == TOK.PUNCTUATION and token.txt in END_OF_SENTENCE:
11241156
# We may be finishing a sentence with not only a period but also
11251157
# right parenthesis and quotation marks
1126-
while next_token.kind == TOK.PUNCTUATION and next_token.txt in SENTENCE_FINISHERS:
1158+
while (next_token.kind == TOK.PUNCTUATION and
1159+
next_token.txt in SENTENCE_FINISHERS):
11271160
yield token
11281161
token = next_token
11291162
next_token = next(token_stream)
@@ -1198,9 +1231,12 @@ def parse_sentences(token_stream):
11981231
"þús.": 1000,
11991232
"milljón": 1e6,
12001233
"milla": 1e6,
1234+
"millj.": 1e6,
1235+
"mljó.": 1e6,
12011236
"milljarður": 1e9,
12021237
"miljarður": 1e9,
1203-
"ma.": 1e9
1238+
"ma.": 1e9,
1239+
"mrð.": 1e9
12041240
}
12051241

12061242
# Recognize words for percentages
@@ -1214,15 +1250,26 @@ def parse_sentences(token_stream):
12141250
# Amount abbreviations including 'kr' for the ISK
12151251
# Corresponding abbreviations are found in Abbrev.conf
12161252
AMOUNT_ABBREV = {
1253+
"kr": 1,
1254+
"kr.": 1,
12171255
"þ.kr.": 1e3,
1256+
"þ.kr": 1e3,
12181257
"þús.kr.": 1e3,
1258+
"þús.kr": 1e3,
12191259
"m.kr.": 1e6,
1260+
"m.kr": 1e6,
12201261
"mkr.": 1e6,
1262+
"mkr": 1e6,
12211263
"millj.kr.": 1e6,
1264+
"millj.kr": 1e6,
12221265
"mljó.kr.": 1e6,
1266+
"mljó.kr": 1e6,
12231267
"ma.kr.": 1e9,
1268+
"ma.kr": 1e9,
12241269
"mö.kr.": 1e9,
1225-
"mlja.kr.": 1e9
1270+
"mö.kr": 1e9,
1271+
"mlja.kr.": 1e9,
1272+
"mlja.kr": 1e9
12261273
}
12271274

12281275

@@ -1248,16 +1295,17 @@ def parse_phrases_1(token_stream):
12481295
# Coalesce [year|number] + ['e.Kr.'|'f.Kr.'] into year
12491296
if token.kind == TOK.YEAR or token.kind == TOK.NUMBER:
12501297
val = token.val if token.kind == TOK.YEAR else token.val[0]
1251-
if next_token.txt in BCE: # f.Kr.
1298+
if next_token.txt in BCE: # f.Kr.
12521299
# Yes, we set year X BCE as year -X ;-)
12531300
token = TOK.Year(token.txt + " " + next_token.txt, -val)
12541301
next_token = next(token_stream)
1255-
elif next_token.txt in CE: # e.Kr.
1302+
elif next_token.txt in CE: # e.Kr.
12561303
token = TOK.Year(token.txt + " " + next_token.txt, val)
12571304
next_token = next(token_stream)
12581305

12591306
# Check for [number | ordinal] [month name]
1260-
if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and next_token.kind == TOK.WORD:
1307+
if ((token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER) and
1308+
next_token.kind == TOK.WORD):
12611309

12621310
month = match_stem_list(next_token, MONTHS)
12631311
if month is not None:
@@ -1313,8 +1361,10 @@ def parse_date_and_time(token_stream):
13131361

13141362
# DATEABS and DATEREL made
13151363
# Check for [number | ordinal] [month name]
1316-
if (token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER or
1317-
(token.txt and token.txt.lower() in DAYS_OF_MONTH)) and next_token.kind == TOK.WORD:
1364+
if ((token.kind == TOK.ORDINAL or token.kind == TOK.NUMBER or
1365+
(token.txt and token.txt.lower() in DAYS_OF_MONTH)) and
1366+
next_token.kind == TOK.WORD):
1367+
13181368
month = match_stem_list(next_token, MONTHS)
13191369
if month is not None:
13201370
token = TOK.Date(token.txt + " " + next_token.txt,
@@ -1348,7 +1398,8 @@ def parse_date_and_time(token_stream):
13481398
else next_token.val[0] if 1776 <= next_token.val[0] <= 2100
13491399
else 0)
13501400
if year != 0:
1351-
token = TOK.Date(token.txt + " " + next_token.txt, y = year, m = month, d = 0)
1401+
token = TOK.Date(token.txt + " " + next_token.txt,
1402+
y = year, m = month, d = 0)
13521403
# Eat the year token
13531404
next_token = next(token_stream)
13541405

@@ -1365,9 +1416,11 @@ def parse_date_and_time(token_stream):
13651416
# Split DATE into DATEABS and DATEREL
13661417
if token.kind == TOK.DATE:
13671418
if token.val[0] and token.val[1] and token.val[2]:
1368-
token = TOK.Dateabs(token.txt, y = token.val[0], m = token.val[1], d = token.val[2])
1419+
token = TOK.Dateabs(token.txt,
1420+
y = token.val[0], m = token.val[1], d = token.val[2])
13691421
else:
1370-
token = TOK.Daterel(token.txt, y = token.val[0], m = token.val[1], d = token.val[2])
1422+
token = TOK.Daterel(token.txt,
1423+
y = token.val[0], m = token.val[1], d = token.val[2])
13711424

13721425
# Split TIMESTAMP into TIMESTAMPABS and TIMESTAMPREL
13731426
if token.kind == TOK.TIMESTAMP:
@@ -1379,12 +1432,13 @@ def parse_date_and_time(token_stream):
13791432

13801433
# Swallow "e.Kr." and "f.Kr." postfixes
13811434
if token.kind == TOK.DATEABS:
1382-
if next_token.kind == TOK.WORD and next_token.txt in { "e.Kr.", "e.Kr", "f.Kr.", "f.Kr" }:
1435+
if next_token.kind == TOK.WORD and next_token.txt in CE_BCE:
13831436
y = token.val[0]
1384-
if next_token.txt in { "f.Kr.", "f.Kr" }:
1437+
if next_token.txt in BCE:
13851438
# Change year to negative number
13861439
y = -y
1387-
token = TOK.Dateabs(token.txt + " " + next_token.txt, y = y, m = token.val[1], d = token.val[2])
1440+
token = TOK.Dateabs(token.txt + " " + next_token.txt,
1441+
y = y, m = token.val[1], d = token.val[2])
13881442
# Swallow the postfix
13891443
next_token = next(token_stream)
13901444

@@ -1473,6 +1527,11 @@ def convert_to_num(token):
14731527
token = TOK.Amount(token.txt + " " + next_token.txt, "ISK",
14741528
token.val[0] * AMOUNT_ABBREV[next_token.txt])
14751529
next_token = next(token_stream)
1530+
elif next_token.txt in CURRENCY_ABBREV:
1531+
# A number followed by an ISO currency abbreviation
1532+
token = TOK.Amount(token.txt + " " + next_token.txt, next_token.txt,
1533+
token.val[0])
1534+
next_token = next(token_stream)
14761535
else:
14771536
# Check for [number] 'percent'
14781537
percentage = match_stem_list(next_token, PERCENTAGES)
@@ -1490,11 +1549,13 @@ def convert_to_num(token):
14901549
# 'stjórnskipunar- og eftirlitsnefnd'
14911550
# 'viðskipta- og iðnaðarráðherra'
14921551
# 'marg-ítrekaðri'
1493-
if token.kind == TOK.WORD and \
1494-
next_token.kind == TOK.PUNCTUATION and next_token.txt == COMPOSITE_HYPHEN:
1552+
if (token.kind == TOK.WORD and
1553+
next_token.kind == TOK.PUNCTUATION and
1554+
next_token.txt == COMPOSITE_HYPHEN):
14951555

14961556
og_token = next(token_stream)
1497-
if og_token.kind != TOK.WORD or (og_token.txt != "og" and og_token.txt != "eða"):
1557+
if (og_token.kind != TOK.WORD or
1558+
(og_token.txt != "og" and og_token.txt != "eða")):
14981559
# Incorrect prediction: make amends and continue
14991560
handled = False
15001561
if og_token.kind == TOK.WORD:
@@ -1524,8 +1585,8 @@ def convert_to_num(token):
15241585
# the last word, but an amalgamated token text.
15251586
# Note: there is no meaning check for the first
15261587
# part of the composition, so it can be an unknown word.
1527-
txt = token.txt + "- " + og_token.txt + \
1528-
" " + final_token.txt
1588+
txt = (token.txt + "- " + og_token.txt +
1589+
" " + final_token.txt)
15291590
token = TOK.Word(txt)
15301591
next_token = next(token_stream)
15311592

@@ -1636,6 +1697,7 @@ def valid_sent(sent):
16361697
)
16371698
RE_SPLIT = re.compile(RE_SPLIT_STR)
16381699

1700+
16391701
def correct_spaces(s):
16401702
""" Utility function to split and re-compose a string with correct spacing between tokens"""
16411703
r = []

0 commit comments

Comments
 (0)