Skip to content

Commit 69a4443

Browse files
Merge pull request #17 from mideind/abbrevfix
Abbrevfix
2 parents 95014d3 + 7d6ea17 commit 69a4443

File tree

5 files changed

+44
-57
lines changed

5 files changed

+44
-57
lines changed

README.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,9 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
754754
Changelog
755755
---------
756756

757+
* Version 2.1.0: Changed handling of a sentence-final period when it is
758+
a part of an abbreviation. Now, the period is kept attached to the abbreviation,
759+
not split off into a separate period token, as before.
757760
* Version 2.0.7: Added ``TOK.COMPANY`` token type; fixed a few abbreviations;
758761
renamed parameter ``text`` to ``text_or_gen`` in functions that accept a string
759762
or a string iterator

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def read(*names, **kwargs):
5757

5858
setup(
5959
name="tokenizer",
60-
version="2.0.7", # Also update src/tokenizer/__init__.py
60+
version="2.1.0", # Also update src/tokenizer/__init__.py
6161
license="MIT",
6262
description="A tokenizer for Icelandic text",
6363
long_description=u"{0}\n{1}".format(

src/tokenizer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,4 @@
4242
from .abbrev import Abbreviations, ConfigError
4343

4444
__author__ = u"Miðeind ehf"
45-
__version__ = u"2.0.7" # Also update setup.py
45+
__version__ = u"2.1.0" # Also update setup.py

src/tokenizer/tokenizer.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1682,18 +1682,28 @@ def parse_phrases_1(token_stream):
16821682
while True:
16831683

16841684
next_token = next(token_stream)
1685+
# Coalesce abbreviations and trailing period
1686+
if token.kind == TOK.WORD and next_token.txt == ".":
1687+
abbrev = token.txt + next_token.txt
1688+
if abbrev in Abbreviations.FINISHERS:
1689+
token = TOK.Word(abbrev, token.val)
1690+
next_token = next(token_stream)
16851691

16861692
# Coalesce [year|number] + ['e.Kr.'|'f.Kr.'] into year
16871693
if token.kind == TOK.YEAR or token.kind == TOK.NUMBER:
16881694
val = token.val if token.kind == TOK.YEAR else token.val[0]
1695+
nval = None
16891696
if next_token.txt in BCE: # f.Kr.
16901697
# Yes, we set year X BCE as year -X ;-)
1691-
token = TOK.Year(token.txt + " " + next_token.txt, -val)
1692-
next_token = next(token_stream)
1698+
nval = -val
16931699
elif next_token.txt in CE: # e.Kr.
1694-
token = TOK.Year(token.txt + " " + next_token.txt, val)
1700+
nval = val
1701+
if nval is not None:
1702+
token = TOK.Year(token.txt + " " + next_token.txt, nval)
16951703
next_token = next(token_stream)
1696-
1704+
if next_token.txt == ".":
1705+
token = TOK.Year(token.txt + next_token.txt, nval)
1706+
next_token = next(token_stream)
16971707
# TODO: "5 mars" greinist sem dagsetning, vantar punktinn.
16981708
# Check for [number | ordinal] [month name]
16991709
if (

test/test_tokenizer.py

Lines changed: 25 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,7 @@ def test_single_tokens():
145145
"Fimmtánda mars árið 44 f.Kr.",
146146
[
147147
Tok(TOK.WORD, "Fimmtánda", None),
148-
Tok(TOK.DATEREL, "mars árið 44 f.Kr", (-44, 3, 0)),
149-
Tok(TOK.PUNCTUATION, ".", None),
148+
Tok(TOK.DATEREL, "mars árið 44 f.Kr.", (-44, 3, 0)),
150149
],
151150
),
152151
("17/6/2013", [Tok(TOK.DATEABS, "17/6/2013", (2013, 6, 17))]),
@@ -172,21 +171,12 @@ def test_single_tokens():
172171
),
173172
("2013", [Tok(TOK.YEAR, "2013", 2013)]),
174173
("20130", [Tok(TOK.NUMBER, "20130", (20130, None, None))]),
175-
(
176-
"874 e.Kr.",
177-
[Tok(TOK.YEAR, "874 e.Kr", 874), Tok(TOK.PUNCTUATION, ".", None)],
178-
),
179-
(
180-
"2013 f.Kr.",
181-
[Tok(TOK.YEAR, "2013 f.Kr", -2013), Tok(TOK.PUNCTUATION, ".", None)],
182-
),
174+
("874 e.Kr.", [Tok(TOK.YEAR, "874 e.Kr.", 874)]),
175+
("2013 f.Kr.", [Tok(TOK.YEAR, "2013 f.Kr.", -2013)]),
183176
("árið 2013", [Tok(TOK.YEAR, "árið 2013", 2013)]),
184177
("árinu 874", [Tok(TOK.YEAR, "árinu 874", 874)]),
185178
("ársins 2013", [Tok(TOK.YEAR, "ársins 2013", 2013)]),
186-
(
187-
"ársins 320 f.Kr.",
188-
[Tok(TOK.YEAR, "ársins 320 f.Kr", -320), Tok(TOK.PUNCTUATION, ".", None)],
189-
),
179+
("ársins 320 f.Kr.", [Tok(TOK.YEAR, "ársins 320 f.Kr.", -320)]),
190180
("213", [Tok(TOK.NUMBER, "213", (213, None, None))]),
191181
("2.013", [Tok(TOK.NUMBER, "2.013", (2013, None, None))]),
192182
("2,013", [Tok(TOK.NUMBER, "2,013", (2.013, None, None))]),
@@ -232,20 +222,8 @@ def test_single_tokens():
232222
("marg-ítrekað", TOK.WORD),
233223
("full-ítarlegur", TOK.WORD),
234224
("hálf-óviðbúinn", TOK.WORD),
235-
(
236-
"750 þús.kr.",
237-
[
238-
Tok(TOK.AMOUNT, "750 þús.kr", (750e3, "ISK", None, None)),
239-
Tok(TOK.PUNCTUATION, ".", None),
240-
],
241-
),
242-
(
243-
"750 þús. kr.",
244-
[
245-
Tok(TOK.AMOUNT, "750 þús. kr", (750e3, "ISK", None, None)),
246-
Tok(TOK.PUNCTUATION, ".", None),
247-
],
248-
),
225+
("750 þús.kr.", [Tok(TOK.AMOUNT, "750 þús.kr.", (750e3, "ISK", None, None))]),
226+
("750 þús. kr.", [Tok(TOK.AMOUNT, "750 þús. kr.", (750e3, "ISK", None, None))]),
249227
(
250228
"750 þús. ISK.",
251229
[
@@ -279,28 +257,24 @@ def test_single_tokens():
279257
[
280258
Tok(
281259
TOK.WORD,
282-
"m.kr",
260+
"m.kr.",
283261
[("milljónir króna", 0, "kvk", "skst", "m.kr.", "-")],
284262
),
285-
Tok(TOK.PUNCTUATION, ".", None),
286263
],
287264
),
288265
(
289266
"ma.kr.",
290267
[
291268
Tok(
292269
TOK.WORD,
293-
"ma.kr",
294-
[("milljarðar króna", 0, "kk", "skst", "ma.kr.", "-")],
270+
"ma.kr.", [("milljarðar króna", 0, "kk", "skst", "ma.kr.", "-")],
295271
),
296-
Tok(TOK.PUNCTUATION, ".", None),
297272
],
298273
),
299274
(
300275
"30,7 mö.kr.",
301276
[
302-
Tok(TOK.AMOUNT, "30,7 mö.kr", (30.7e9, "ISK", None, None)),
303-
Tok(TOK.PUNCTUATION, ".", None),
277+
Tok(TOK.AMOUNT, "30,7 mö.kr.", (30.7e9, "ISK", None, None)),
304278
],
305279
),
306280
(
@@ -326,26 +300,23 @@ def test_single_tokens():
326300
(
327301
"nk.",
328302
[
329-
Tok(TOK.WORD, "nk", [("næstkomandi", 0, "lo", "skst", "nk.", "-")]),
330-
Tok(TOK.PUNCTUATION, ".", None),
303+
Tok(TOK.WORD, "nk.", [("næstkomandi", 0, "lo", "skst", "nk.", "-")]),
331304
],
332305
),
333306
(
334307
"sl.",
335308
[
336-
Tok(TOK.WORD, "sl", [("síðastliðinn", 0, "lo", "skst", "sl.", "-")]),
337-
Tok(TOK.PUNCTUATION, ".", None),
309+
Tok(TOK.WORD, "sl.", [("síðastliðinn", 0, "lo", "skst", "sl.", "-")]),
338310
],
339311
),
340312
(
341313
"o.s.frv.",
342314
[
343315
Tok(
344316
TOK.WORD,
345-
"o.s.frv",
317+
"o.s.frv.",
346318
[("og svo framvegis", 0, "ao", "frasi", "o.s.frv.", "-")],
347319
),
348-
Tok(TOK.PUNCTUATION, ".", None),
349320
],
350321
),
351322
("BSRB", TOK.WORD),
@@ -680,7 +651,7 @@ def test_sentence(text, expected, **options):
680651
" Góðan daginn! Ég á 10.000 kr. í vasanum, €100 og $40.Gengi USD er 103,45. "
681652
"Í dag er 10. júlí. Klukkan er 15:40 núna.Ég fer kl. 13 niður á Hlemm o.s.frv. ",
682653
"B W W P E B W W A W W P A W A P E B W W W N P E "
683-
"B W W W DR P E B W W T W P E B W W T W W W W P E",
654+
"B W W W DR P E B W W T W P E B W W T W W W W E",
684655
)
685656

686657
test_sentence(
@@ -714,15 +685,15 @@ def test_sentence(text, expected, **options):
714685
"Málið um BSRB gekk marg-ítrekað til stjórnskipunar- og eftirlitsnefndar í 10. sinn "
715686
"skv. XVII. kafla þann 24. september 2015 nk. Ál-verið notar 60 MWst á ári.",
716687
"B W W W W W W W W O W "
717-
"W O W W DA W P E B W W ME W W P E",
688+
"W O W W DA W E B W W ME W W P E",
718689
)
719690

720691
test_sentence(
721692
"Ég er t.d. með tölvupóstfangið fake@news.com, vefföngin "
722693
"http://greynir.is og https://greynir.is, og síma 6638999. Hann gaf mér 1000 kr. Ég keypti mér 1/2 kaffi. "
723694
"Það er hægt að ná í mig í s 623 7892, eða vinnusíma, 7227979 eða eitthvað.",
724695
"B W W W W W M P W "
725-
"U W U P W W TEL P E B W W W A P E B W W W N W P E "
696+
"U W U P W W TEL P E B W W W A E B W W W N W P E "
726697
"B W W W W W W W W W TEL P W W P TEL W W P E"
727698
)
728699

@@ -764,7 +735,7 @@ def test_sentence(text, expected, **options):
764735

765736
test_sentence(
766737
"1.030 hPa lægð gengur yfir landið árið 2019 e.Kr. Jógúrtin inniheldur 80 kcal.",
767-
"B ME W W W W Y P E B W W ME P E",
738+
"B ME W W W W Y E B W W ME P E",
768739
)
769740

770741
test_sentence(
@@ -829,7 +800,7 @@ def test_sentence(text, expected, **options):
829800

830801
test_sentence(
831802
"Fyrri setningin var í þgf. en sú seinni í nf. Ég stóð í ef. en hann í þf. Hvað ef.",
832-
"B W W W W W W W W W W P E B W W W W W W W W P E B W W P E",
803+
"B W W W W W W W W W W E B W W W W W W W W E B W W P E",
833804
)
834805

835806
test_sentence(
@@ -864,7 +835,7 @@ def test_sentence(text, expected, **options):
864835

865836
test_sentence(
866837
"Jón, kt. 301265-5309, vann 301265-53090 kr. H2O var drukkið.",
867-
"B W P W K P W N P A P E B MO W W P E",
838+
"B W P W K P W N P A E B MO W W P E",
868839
)
869840

870841
test_sentence(
@@ -877,6 +848,11 @@ def test_sentence(text, expected, **options):
877848
"B W W W W W W W P E",
878849
)
879850

851+
test_sentence(
852+
"Tösku- og hanskabúðin, sálug, var á Lauga- eða Skothúsvegi.",
853+
"B W P W P W W W P E",
854+
)
855+
880856
test_sentence(
881857
"Tösku-og hanskabúðin, sálug, var á Lauga-eða Skothúsvegi.",
882858
"B W P W P W W W P E",
@@ -1109,8 +1085,7 @@ def test_abbrev():
11091085
Tok(kind=TOK.S_BEGIN, txt=None, val=(0, None)),
11101086
Tok(kind=TOK.WORD, txt="Jón", val=None),
11111087
Tok(kind=TOK.WORD, txt="var", val=None),
1112-
Tok(kind=TOK.WORD, txt="sérfr", val=[('sérfræðingur', 0, 'kk', 'skst', 'sérfr.', '-')]),
1113-
Tok(kind=TOK.PUNCTUATION, txt=".", val=(3, ".")),
1088+
Tok(kind=TOK.WORD, txt="sérfr.", val=[('sérfræðingur', 0, 'kk', 'skst', 'sérfr.', '-')]),
11141089
Tok(kind=TOK.S_END, txt=None, val=None),
11151090
Tok(kind=TOK.S_BEGIN, txt=None, val=(0, None)),
11161091
Tok(kind=TOK.WORD, txt="Guðmundur", val=None),
@@ -1124,8 +1099,7 @@ def test_abbrev():
11241099
Tok(kind=TOK.S_BEGIN, txt=None, val=(0, None)),
11251100
Tok(kind=TOK.WORD, txt="Jón", val=None),
11261101
Tok(kind=TOK.WORD, txt="var", val=None),
1127-
Tok(kind=TOK.WORD, txt="t.h", val=[('til hægri', 0, 'ao', 'frasi', 't.h.', '-')]),
1128-
Tok(kind=TOK.PUNCTUATION, txt=".", val=(3, ".")),
1102+
Tok(kind=TOK.WORD, txt="t.h.", val=[('til hægri', 0, 'ao', 'frasi', 't.h.', '-')]),
11291103
Tok(kind=TOK.S_END, txt=None, val=None),
11301104
Tok(kind=TOK.S_BEGIN, txt=None, val=(0, None)),
11311105
Tok(kind=TOK.WORD, txt="Guðmundur", val=None),

0 commit comments

Comments
 (0)