
Commit 12a2db2

Updated header + formatting
1 parent 7c2bbb8 commit 12a2db2

9 files changed: +193 -38 lines changed


src/tokenizer/__init__.py

Lines changed: 2 additions & 3 deletions
@@ -1,6 +1,6 @@
 """

-Copyright(C) 2022 Miðeind ehf.
+Copyright(C) 2016-2024 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 This software is licensed under the MIT License:
@@ -64,8 +64,7 @@

 __author__ = "Miðeind ehf."
 __copyright__ = "(C) 2016-2024 Miðeind ehf."
-__version__ = importlib.metadata.version("tokenizer")
-
+__version__ = importlib.metadata.version(__name__)

 __all__ = (
     "__author__",

src/tokenizer/abbrev.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@

 Abbreviations module for tokenization of Icelandic text

-Copyright (C) 2022 Miðeind ehf.
+Copyright (C) 2016-2024 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 This software is licensed under the MIT License:

src/tokenizer/definitions.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@

 Definitions used for tokenization of Icelandic text

-Copyright (C) 2022 Miðeind ehf.
+Copyright (C) 2016-2024 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 This software is licensed under the MIT License:

src/tokenizer/main.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@

 Tokenizer for Icelandic text

-Copyright (C) 2022 Miðeind ehf.
+Copyright (C) 2016-2024 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 This software is licensed under the MIT License:

src/tokenizer/tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@

 Tokenizer for Icelandic text

-Copyright (C) 2022 Miðeind ehf.
+Copyright (C) 2016-2024 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 This software is licensed under the MIT License:

test/test_detokenize.py

Lines changed: 9 additions & 10 deletions
@@ -6,7 +6,7 @@

 Tests for Tokenizer module

-Copyright (C) 2022 by Miðeind ehf.
+Copyright (C) 2016-2024 by Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 This software is licensed under the MIT License:
@@ -37,7 +37,7 @@

 def test_detokenize() -> None:

-    options = { "normalize": True }
+    options = {"normalize": True}

     def should_be_equal(s: str) -> None:
         toklist = t.tokenize(s, **options)
@@ -58,19 +58,18 @@ def should_be(s1: str, s2: str) -> None:
     should_be_equal("Páll veiddi 74 cm. lax í Norðurá þann 1.3.")

     should_be(
-        "Páll var með \"netfangið\" palli@einn.i.heiminum.is.",
-        "Páll var með „netfangið“ palli@einn.i.heiminum.is."
+        'Páll var með "netfangið" palli@einn.i.heiminum.is.',
+        "Páll var með „netfangið“ palli@einn.i.heiminum.is.",
     )

     # !!! BUG
-    #should_be(
+    # should_be(
     #    "Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").",
     #    "Páll var með „netfangið“, þ.e.a.s. („þetta“).",
-    #)
+    # )

-    options = { "normalize": False }
+    options = {"normalize": False}

     should_be_equal("Páll var með „netfangið“, þ.e.a.s. („þetta“).")
-    should_be_equal("Páll var með \"netfangið\" palli@einn.i.heiminum.is.")
-    should_be_equal("Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").")
-
+    should_be_equal('Páll var með "netfangið" palli@einn.i.heiminum.is.')
+    should_be_equal('Páll var með "netfangið", þ.e.a.s. ("þetta").')
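
For context, the round trip these lines exercise looks roughly like the sketch below: with normalize=True the tokenizer rewrites plain ASCII quotes as Icelandic „…“ quotes, so only the normalized string survives the trip unchanged. The detokenize() call and its normalize parameter are assumptions based on this test module, not something shown in the hunk itself:

```python
import tokenizer

s = 'Páll var með "netfangið" palli@einn.i.heiminum.is.'

# Tokenize with normalization, then join the tokens back into text.
# (detokenize() and its normalize argument are assumed here.)
toks = list(tokenizer.tokenize(s, normalize=True))
print(tokenizer.detokenize(toks, normalize=True))
# Expected, per the test above: Páll var með „netfangið“ palli@einn.i.heiminum.is.
```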

test/test_index_calculation.py

Lines changed: 176 additions & 19 deletions
@@ -6,7 +6,7 @@

 Tests for Tokenizer module

-Copyright (C) 2022 by Miðeind ehf.
+Copyright (C) 2016-2024 by Miðeind ehf.

 This software is licensed under the MIT License:

@@ -169,7 +169,6 @@ def test_small_difficult_cases() -> None:
         assert char_indexes == [0, 2, 4]
         assert byte_indexes == [0, 2, 4]

-
     # Two byte characters
     for x in ["þ", "æ", "á"]:
         s = x
@@ -230,12 +229,11 @@ def test_small_difficult_cases() -> None:
         assert char_indexes == [0, 2, 4]
         assert byte_indexes == [0, 3, 6]

-
     # Two character characters
     # These strings contain two unicode code points that are rendered as one letter.
     # They are counted as two characters in python.
     # In addition the accent and umlaut characters are two bytes.
-    for x in ["a"+ACCENT, "o"+UMLAUT]:
+    for x in ["a" + ACCENT, "o" + UMLAUT]:
         s = x
         toks = tokenizer.parse_tokens([s])
         char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
@@ -288,11 +286,11 @@ def test_small_difficult_cases() -> None:
         # example chars:
         # " a´ a´"
         # 012345
-        #  ^ ^
+        # ^  ^
         # example bytes:
         # " a´_ a´_"
         # 01234567
-        #  ^ ^
+        # ^   ^
         toks = tokenizer.parse_tokens([s])
         char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
         assert char_indexes == [0, 3]
@@ -302,7 +300,6 @@ def test_small_difficult_cases() -> None:
         assert char_indexes == [0, 3, 6]
         assert byte_indexes == [0, 4, 8]

-
     # The em-dash is 3 bytes
     for x in [EM_DASH]:
         s = x
@@ -361,7 +358,7 @@ def test_small_difficult_cases() -> None:
         # example bytes:
         # " a__ a__"
         # 01234567
-        #  ^ ^
+        # ^   ^
         toks = tokenizer.parse_tokens([s])
         char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
         assert char_indexes == [0, 2]
@@ -379,25 +376,181 @@ def test_larger_case() -> None:
     # x x x xx x
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+    ]
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+        73,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+        79,
+    ]


 def test_iterator_cases() -> None:
-    s = ["Þessi ", "setning ", "er ", "í ", "lengra ", "lagi ", "og ", "er ", "með ", "bæði ", "eins ", "og ", "tveggja ", "bæta ", "stafi."]
+    s = [
+        "Þessi ",
+        "setning ",
+        "er ",
+        "í ",
+        "lengra ",
+        "lagi ",
+        "og ",
+        "er ",
+        "með ",
+        "bæði ",
+        "eins ",
+        "og ",
+        "tveggja ",
+        "bæta ",
+        "stafi.",
+    ]
     # (char and byte indexes in a similar test above)
     toks = tokenizer.parse_tokens(s)
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+    ]
     toks = tokenizer.parse_tokens(s)
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+        73,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+        79,
+    ]

     s = ["Stutt setning.", "", "Önnur setning."]
     # 01234567890123 45678901234567
@@ -493,11 +646,15 @@ def test_lengthening_substitutions() -> None:
     # ^ ^ ^ ^ ^
     # x x
     # ! lengthening happens here (3ji->þriðji)
-    toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
+    toks = tokenizer.parse_tokens(
+        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
+    )
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
     assert char_indexes == [0, 5, 8, 12, 21]
     assert byte_indexes == [0, 6, 9, 13, 23]
-    toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
+    toks = tokenizer.parse_tokens(
+        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
+    )
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
     assert char_indexes == [0, 5, 8, 12, 21, 22]
     assert byte_indexes == [0, 6, 9, 13, 23, 24]
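
The reflowed assertions above all follow the same pattern: parse_tokens() consumes one or more text fragments and calculate_indexes() reports where each token starts, both in characters and in UTF-8 bytes, with last_is_end=True appending the final end position. A minimal sketch using the same public helpers the tests call (the sample sentence is arbitrary):

```python
import tokenizer

# "Þ" and "á" are single characters but two UTF-8 bytes each, so the two
# index lists diverge as soon as a non-ASCII letter appears.
toks = tokenizer.parse_tokens(["Þú átt 3 hesta."])
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
print(char_indexes)  # token start offsets counted in characters, plus the end offset
print(byte_indexes)  # the same offsets counted in UTF-8 bytes
```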

test/test_tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@

 Tests for Tokenizer module

-Copyright (C) 2022 by Miðeind ehf.
+Copyright (C) 2016-2024 by Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 This software is licensed under the MIT License:

test/test_tokenizer_tok.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@

 Tests for Tokenizer module

-Copyright (C) 2022 by Miðeind ehf.
+Copyright (C) 2016-2024 by Miðeind ehf.

 This software is licensed under the MIT License:

