@@ -145,8 +145,7 @@ def test_single_tokens():
145
145
"Fimmtánda mars árið 44 f.Kr." ,
146
146
[
147
147
Tok (TOK .WORD , "Fimmtánda" , None ),
148
- Tok (TOK .DATEREL , "mars árið 44 f.Kr" , (- 44 , 3 , 0 )),
149
- Tok (TOK .PUNCTUATION , "." , None ),
148
+ Tok (TOK .DATEREL , "mars árið 44 f.Kr." , (- 44 , 3 , 0 )),
150
149
],
151
150
),
152
151
("17/6/2013" , [Tok (TOK .DATEABS , "17/6/2013" , (2013 , 6 , 17 ))]),
@@ -172,21 +171,12 @@ def test_single_tokens():
172
171
),
173
172
("2013" , [Tok (TOK .YEAR , "2013" , 2013 )]),
174
173
("20130" , [Tok (TOK .NUMBER , "20130" , (20130 , None , None ))]),
175
- (
176
- "874 e.Kr." ,
177
- [Tok (TOK .YEAR , "874 e.Kr" , 874 ), Tok (TOK .PUNCTUATION , "." , None )],
178
- ),
179
- (
180
- "2013 f.Kr." ,
181
- [Tok (TOK .YEAR , "2013 f.Kr" , - 2013 ), Tok (TOK .PUNCTUATION , "." , None )],
182
- ),
174
+ ("874 e.Kr." , [Tok (TOK .YEAR , "874 e.Kr." , 874 )]),
175
+ ("2013 f.Kr." , [Tok (TOK .YEAR , "2013 f.Kr." , - 2013 )]),
183
176
("árið 2013" , [Tok (TOK .YEAR , "árið 2013" , 2013 )]),
184
177
("árinu 874" , [Tok (TOK .YEAR , "árinu 874" , 874 )]),
185
178
("ársins 2013" , [Tok (TOK .YEAR , "ársins 2013" , 2013 )]),
186
- (
187
- "ársins 320 f.Kr." ,
188
- [Tok (TOK .YEAR , "ársins 320 f.Kr" , - 320 ), Tok (TOK .PUNCTUATION , "." , None )],
189
- ),
179
+ ("ársins 320 f.Kr." , [Tok (TOK .YEAR , "ársins 320 f.Kr." , - 320 )]),
190
180
("213" , [Tok (TOK .NUMBER , "213" , (213 , None , None ))]),
191
181
("2.013" , [Tok (TOK .NUMBER , "2.013" , (2013 , None , None ))]),
192
182
("2,013" , [Tok (TOK .NUMBER , "2,013" , (2.013 , None , None ))]),
@@ -232,20 +222,8 @@ def test_single_tokens():
232
222
("marg-ítrekað" , TOK .WORD ),
233
223
("full-ítarlegur" , TOK .WORD ),
234
224
("hálf-óviðbúinn" , TOK .WORD ),
235
- (
236
- "750 þús.kr." ,
237
- [
238
- Tok (TOK .AMOUNT , "750 þús.kr" , (750e3 , "ISK" , None , None )),
239
- Tok (TOK .PUNCTUATION , "." , None ),
240
- ],
241
- ),
242
- (
243
- "750 þús. kr." ,
244
- [
245
- Tok (TOK .AMOUNT , "750 þús. kr" , (750e3 , "ISK" , None , None )),
246
- Tok (TOK .PUNCTUATION , "." , None ),
247
- ],
248
- ),
225
+ ("750 þús.kr." , [Tok (TOK .AMOUNT , "750 þús.kr." , (750e3 , "ISK" , None , None ))]),
226
+ ("750 þús. kr." , [Tok (TOK .AMOUNT , "750 þús. kr." , (750e3 , "ISK" , None , None ))]),
249
227
(
250
228
"750 þús. ISK." ,
251
229
[
@@ -279,28 +257,24 @@ def test_single_tokens():
279
257
[
280
258
Tok (
281
259
TOK .WORD ,
282
- "m.kr" ,
260
+ "m.kr." ,
283
261
[("milljónir króna" , 0 , "kvk" , "skst" , "m.kr." , "-" )],
284
262
),
285
- Tok (TOK .PUNCTUATION , "." , None ),
286
263
],
287
264
),
288
265
(
289
266
"ma.kr." ,
290
267
[
291
268
Tok (
292
269
TOK .WORD ,
293
- "ma.kr" ,
294
- [("milljarðar króna" , 0 , "kk" , "skst" , "ma.kr." , "-" )],
270
+ "ma.kr." , [("milljarðar króna" , 0 , "kk" , "skst" , "ma.kr." , "-" )],
295
271
),
296
- Tok (TOK .PUNCTUATION , "." , None ),
297
272
],
298
273
),
299
274
(
300
275
"30,7 mö.kr." ,
301
276
[
302
- Tok (TOK .AMOUNT , "30,7 mö.kr" , (30.7e9 , "ISK" , None , None )),
303
- Tok (TOK .PUNCTUATION , "." , None ),
277
+ Tok (TOK .AMOUNT , "30,7 mö.kr." , (30.7e9 , "ISK" , None , None )),
304
278
],
305
279
),
306
280
(
@@ -326,26 +300,23 @@ def test_single_tokens():
326
300
(
327
301
"nk." ,
328
302
[
329
- Tok (TOK .WORD , "nk" , [("næstkomandi" , 0 , "lo" , "skst" , "nk." , "-" )]),
330
- Tok (TOK .PUNCTUATION , "." , None ),
303
+ Tok (TOK .WORD , "nk." , [("næstkomandi" , 0 , "lo" , "skst" , "nk." , "-" )]),
331
304
],
332
305
),
333
306
(
334
307
"sl." ,
335
308
[
336
- Tok (TOK .WORD , "sl" , [("síðastliðinn" , 0 , "lo" , "skst" , "sl." , "-" )]),
337
- Tok (TOK .PUNCTUATION , "." , None ),
309
+ Tok (TOK .WORD , "sl." , [("síðastliðinn" , 0 , "lo" , "skst" , "sl." , "-" )]),
338
310
],
339
311
),
340
312
(
341
313
"o.s.frv." ,
342
314
[
343
315
Tok (
344
316
TOK .WORD ,
345
- "o.s.frv" ,
317
+ "o.s.frv." ,
346
318
[("og svo framvegis" , 0 , "ao" , "frasi" , "o.s.frv." , "-" )],
347
319
),
348
- Tok (TOK .PUNCTUATION , "." , None ),
349
320
],
350
321
),
351
322
("BSRB" , TOK .WORD ),
@@ -680,7 +651,7 @@ def test_sentence(text, expected, **options):
680
651
" Góðan daginn! Ég á 10.000 kr. í vasanum, €100 og $40.Gengi USD er 103,45. "
681
652
"Í dag er 10. júlí. Klukkan er 15:40 núna.Ég fer kl. 13 niður á Hlemm o.s.frv. " ,
682
653
"B W W P E B W W A W W P A W A P E B W W W N P E "
683
- "B W W W DR P E B W W T W P E B W W T W W W W P E" ,
654
+ "B W W W DR P E B W W T W P E B W W T W W W W E" ,
684
655
)
685
656
686
657
test_sentence (
@@ -714,15 +685,15 @@ def test_sentence(text, expected, **options):
714
685
"Málið um BSRB gekk marg-ítrekað til stjórnskipunar- og eftirlitsnefndar í 10. sinn "
715
686
"skv. XVII. kafla þann 24. september 2015 nk. Ál-verið notar 60 MWst á ári." ,
716
687
"B W W W W W W W W O W "
717
- "W O W W DA W P E B W W ME W W P E" ,
688
+ "W O W W DA W E B W W ME W W P E" ,
718
689
)
719
690
720
691
test_sentence (
721
692
"Ég er t.d. með tölvupóstfangið fake@news.com, vefföngin "
722
693
"http://greynir.is og https://greynir.is, og síma 6638999. Hann gaf mér 1000 kr. Ég keypti mér 1/2 kaffi. "
723
694
"Það er hægt að ná í mig í s 623 7892, eða vinnusíma, 7227979 eða eitthvað." ,
724
695
"B W W W W W M P W "
725
- "U W U P W W TEL P E B W W W A P E B W W W N W P E "
696
+ "U W U P W W TEL P E B W W W A E B W W W N W P E "
726
697
"B W W W W W W W W W TEL P W W P TEL W W P E"
727
698
)
728
699
@@ -764,7 +735,7 @@ def test_sentence(text, expected, **options):
764
735
765
736
test_sentence (
766
737
"1.030 hPa lægð gengur yfir landið árið 2019 e.Kr. Jógúrtin inniheldur 80 kcal." ,
767
- "B ME W W W W Y P E B W W ME P E" ,
738
+ "B ME W W W W Y E B W W ME P E" ,
768
739
)
769
740
770
741
test_sentence (
@@ -829,7 +800,7 @@ def test_sentence(text, expected, **options):
829
800
830
801
test_sentence (
831
802
"Fyrri setningin var í þgf. en sú seinni í nf. Ég stóð í ef. en hann í þf. Hvað ef." ,
832
- "B W W W W W W W W W W P E B W W W W W W W W P E B W W P E" ,
803
+ "B W W W W W W W W W W E B W W W W W W W W E B W W P E" ,
833
804
)
834
805
835
806
test_sentence (
@@ -864,7 +835,7 @@ def test_sentence(text, expected, **options):
864
835
865
836
test_sentence (
866
837
"Jón, kt. 301265-5309, vann 301265-53090 kr. H2O var drukkið." ,
867
- "B W P W K P W N P A P E B MO W W P E" ,
838
+ "B W P W K P W N P A E B MO W W P E" ,
868
839
)
869
840
870
841
test_sentence (
@@ -877,6 +848,11 @@ def test_sentence(text, expected, **options):
877
848
"B W W W W W W W P E" ,
878
849
)
879
850
851
+ test_sentence (
852
+ "Tösku- og hanskabúðin, sálug, var á Lauga- eða Skothúsvegi." ,
853
+ "B W P W P W W W P E" ,
854
+ )
855
+
880
856
test_sentence (
881
857
"Tösku-og hanskabúðin, sálug, var á Lauga-eða Skothúsvegi." ,
882
858
"B W P W P W W W P E" ,
@@ -1109,8 +1085,7 @@ def test_abbrev():
1109
1085
Tok (kind = TOK .S_BEGIN , txt = None , val = (0 , None )),
1110
1086
Tok (kind = TOK .WORD , txt = "Jón" , val = None ),
1111
1087
Tok (kind = TOK .WORD , txt = "var" , val = None ),
1112
- Tok (kind = TOK .WORD , txt = "sérfr" , val = [('sérfræðingur' , 0 , 'kk' , 'skst' , 'sérfr.' , '-' )]),
1113
- Tok (kind = TOK .PUNCTUATION , txt = "." , val = (3 , "." )),
1088
+ Tok (kind = TOK .WORD , txt = "sérfr." , val = [('sérfræðingur' , 0 , 'kk' , 'skst' , 'sérfr.' , '-' )]),
1114
1089
Tok (kind = TOK .S_END , txt = None , val = None ),
1115
1090
Tok (kind = TOK .S_BEGIN , txt = None , val = (0 , None )),
1116
1091
Tok (kind = TOK .WORD , txt = "Guðmundur" , val = None ),
@@ -1124,8 +1099,7 @@ def test_abbrev():
1124
1099
Tok (kind = TOK .S_BEGIN , txt = None , val = (0 , None )),
1125
1100
Tok (kind = TOK .WORD , txt = "Jón" , val = None ),
1126
1101
Tok (kind = TOK .WORD , txt = "var" , val = None ),
1127
- Tok (kind = TOK .WORD , txt = "t.h" , val = [('til hægri' , 0 , 'ao' , 'frasi' , 't.h.' , '-' )]),
1128
- Tok (kind = TOK .PUNCTUATION , txt = "." , val = (3 , "." )),
1102
+ Tok (kind = TOK .WORD , txt = "t.h." , val = [('til hægri' , 0 , 'ao' , 'frasi' , 't.h.' , '-' )]),
1129
1103
Tok (kind = TOK .S_END , txt = None , val = None ),
1130
1104
Tok (kind = TOK .S_BEGIN , txt = None , val = (0 , None )),
1131
1105
Tok (kind = TOK .WORD , txt = "Guðmundur" , val = None ),
0 commit comments