Skip to content

Commit 54e2c0a

Browse files
Merge pull request #38 from mideind/ospl
Ospl
2 parents (0de43c4 + d3a56d5), commit 54e2c0a

File tree

12 files changed

+97 -52 lines changed

12 files changed

+97 -52 lines changed

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2021 Miðeind ehf.
3+
Copyright (C) 2022 Miðeind ehf.
44
Original author: Vilhjálmur Þorsteinsson
55

66
Permission is hereby granted, free of charge, to any person obtaining a copy

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
44
Tokenizer for Icelandic text
55
6-
Copyright (C) 2021 Miðeind ehf.
6+
Copyright (C) 2022 Miðeind ehf.
77
Original author: Vilhjálmur Þorsteinsson
88
99
This software is licensed under the MIT License:

src/tokenizer/Abbrev.conf

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
#
22
# List of abbreviations for tokenization of Icelandic text
33
#
4-
# Copyright (C) 2021 Miðeind ehf.
4+
# Copyright (C) 2022 Miðeind ehf.
55
# Original author: Vilhjálmur Þorsteinsson
66
#
77
# This software is licensed under the MIT License:

src/tokenizer/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright(C) 2021 Miðeind ehf.
3+
Copyright(C) 2022 Miðeind ehf.
44
Original author: Vilhjálmur Þorsteinsson
55
66
This software is licensed under the MIT License:
@@ -62,7 +62,7 @@
6262
from .version import __version__
6363

6464
__author__ = "Miðeind ehf"
65-
__copyright__ = "(C) 2021 Miðeind ehf."
65+
__copyright__ = "(C) 2022 Miðeind ehf."
6666

6767
__all__ = (
6868
"__author__",

src/tokenizer/abbrev.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
33
Abbreviations module for tokenization of Icelandic text
44
5-
Copyright (C) 2021 Miðeind ehf.
5+
Copyright (C) 2022 Miðeind ehf.
66
Original author: Vilhjálmur Þorsteinsson
77
88
This software is licensed under the MIT License:

src/tokenizer/definitions.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
33
Definitions used for tokenization of Icelandic text
44
5-
Copyright (C) 2021 Miðeind ehf.
5+
Copyright (C) 2022 Miðeind ehf.
66
Original author: Vilhjálmur Þorsteinsson
77
88
This software is licensed under the MIT License:

src/tokenizer/main.py

Lines changed: 19 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
44
Tokenizer for Icelandic text
55
6-
Copyright (C) 2021 Miðeind ehf.
6+
Copyright (C) 2022 Miðeind ehf.
77
Original author: Vilhjálmur Þorsteinsson
88
99
This software is licensed under the MIT License:
@@ -110,7 +110,10 @@
110110
)
111111

112112
parser.add_argument(
113-
"-o", "--original", action="store_true", help="Outputs original text of tokens",
113+
"-o",
114+
"--original",
115+
action="store_true",
116+
help="Outputs original text of tokens",
114117
)
115118

116119
parser.add_argument(
@@ -152,27 +155,27 @@
152155

153156

154157
def main() -> None:
155-
""" Main function, called when the tokenize command is invoked """
158+
"""Main function, called when the tokenize command is invoked"""
156159

157160
args = parser.parse_args()
158161
options: Dict[str, bool] = dict()
159162

160163
def quote(s: str) -> str:
161-
""" Return the string s within double quotes, and with any contained
162-
backslashes and double quotes escaped with a backslash """
164+
"""Return the string s within double quotes, and with any contained
165+
backslashes and double quotes escaped with a backslash"""
163166
return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
164167

165168
def spanquote(l: List[int]) -> str:
166-
""" Return the list l as a string within double quotes """
169+
"""Return the list l as a string within double quotes"""
167170
return '"' + "-".join(str(x) for x in l) + '"'
168171

169172
def gen(f: TextIO) -> Iterator[str]:
170-
""" Generate the lines of text in the input file """
173+
"""Generate the lines of text in the input file"""
171174
for line in f:
172175
yield line
173176

174177
def val(t: Tok, quote_word: bool = False) -> Any:
175-
""" Return the value part of the token t """
178+
"""Return the value part of the token t"""
176179
if t.val is None:
177180
return None
178181
if t.kind == TOK.WORD:
@@ -252,8 +255,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
252255
# Configure our JSON dump function
253256
json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))
254257
curr_sent: List[str] = []
255-
sep = "" if args.original else " "
256-
258+
tsep = "" if args.original else " " # token separator
257259
for t in tokenize(gen(args.infile), **options):
258260
if args.csv:
259261
# Output the tokens in CSV format, one line per token
@@ -285,13 +287,18 @@ def val(t: Tok, quote_word: bool = False) -> Any:
285287
d["s"] = t.origin_spans
286288
print(json_dumps(d), file=args.outfile)
287289
else:
288-
# Normal shallow parse, one line per sentence,
290+
# Normal shallow parse, sentences separated by newline by default,
289291
# tokens separated by spaces
292+
if t.kind in TOK.END:
293+
# End of sentence/paragraph
294+
if curr_sent:
295+
print(tsep.join(curr_sent), file=args.outfile)
296+
curr_sent = []
290297
txt = to_text(t)
291298
if txt:
292299
curr_sent.append(txt)
293300
if curr_sent:
294-
print(sep.join(curr_sent), file=args.outfile)
301+
print(tsep.join(curr_sent), file=args.outfile)
295302

296303

297304
if __name__ == "__main__":

src/tokenizer/tokenizer.py

Lines changed: 41 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
33
Tokenizer for Icelandic text
44
5-
Copyright (C) 2021 Miðeind ehf.
5+
Copyright (C) 2022 Miðeind ehf.
66
Original author: Vilhjálmur Þorsteinsson
77
88
This software is licensed under the MIT License:
@@ -68,6 +68,11 @@
6868
_T = TypeVar("_T", bound="Tok")
6969

7070

71+
# Set of punctuation characters that are grouped into one
72+
# normalized exclamation
73+
EXCLAMATIONS = frozenset(("!", "?"))
74+
75+
7176
class Tok:
7277

7378
"""Information about a single token"""
@@ -1655,37 +1660,34 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
16551660
elif rtxt.startswith("[…]"):
16561661
punct, rt = rt.split(3)
16571662
yield TOK.Punctuation(punct)
1658-
elif rtxt.startswith("..."):
1659-
# Treat ellipsis as one piece of punctuation
1663+
elif rtxt.startswith("...") or rtxt.startswith("…"):
1664+
# Treat >= 3 periods as ellipsis, one piece of punctuation
16601665
numdots = 0
16611666
for c in rtxt:
1662-
if c == ".":
1667+
if c == "." or c == "…":
16631668
numdots += 1
16641669
else:
16651670
break
16661671
dots, rt = rt.split(numdots)
16671672
yield TOK.Punctuation(dots, normalized="…")
1668-
elif rtxt.startswith("…"):
1669-
# Treat ellipsis as one piece of punctuation
1670-
numdots = 0
1671-
for c in rtxt:
1672-
if c == "…":
1673-
numdots += 1
1673+
elif rtxt.startswith(".."):
1674+
# Normalize two periods to one
1675+
dots, rt = rt.split(2)
1676+
yield TOK.Punctuation(dots, normalized=".")
1677+
elif rtxt.startswith(",,") and rtxt[2:3].isalpha():
1678+
# Probably someone trying to type opening double quotes with commas
1679+
punct, rt = rt.split(2)
1680+
yield TOK.Punctuation(punct, normalized="„")
1681+
elif rtxt.startswith(",,"):
1682+
# Coalesce multiple commas into one normalized comma
1683+
numcommas = 2
1684+
for c in rtxt[2:]:
1685+
if c == ",":
1686+
numcommas += 1
16741687
else:
16751688
break
1676-
dots, rt = rt.split(numdots)
1677-
yield TOK.Punctuation(dots, normalized="…")
1678-
# TODO LAGA Hér ætti að safna áfram.
1679-
# TODO Was at the end of a word or by itself, should be ",".
1680-
# Won't correct automatically, check for M6
1681-
elif rt.txt == ",,":
1682-
punct, rt = rt.split(2)
1689+
punct, rt = rt.split(numcommas)
16831690
yield TOK.Punctuation(punct, normalized=",")
1684-
# TODO STILLING kommum í upphafi orðs breytt í gæsalappir
1685-
elif rtxt.startswith(",,"):
1686-
# Probably an idiot trying to type opening double quotes with commas
1687-
punct, rt = rt.split(2)
1688-
yield TOK.Punctuation(punct, normalized="„")
16891691
elif rtxt[0] in HYPHENS:
16901692
# Normalize all hyphens the same way
16911693
punct, rt = rt.split(1)
@@ -1715,6 +1717,16 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
17151717
# Return the @-sign and leave the rest
17161718
punct, rt = rt.split(1)
17171719
yield TOK.Punctuation(punct)
1720+
elif len(rtxt) >= 2 and frozenset(rtxt) <= EXCLAMATIONS:
1721+
# Possibly '???!!!' or something of the sort
1722+
numpunct = 2
1723+
for p in rtxt[2:]:
1724+
if p in EXCLAMATIONS:
1725+
numpunct += 1
1726+
else:
1727+
break
1728+
punct, rt = rt.split(numpunct)
1729+
yield TOK.Punctuation(punct, normalized=rtxt[0])
17181730
else:
17191731
punct, rt = rt.split(1)
17201732
yield TOK.Punctuation(punct)
@@ -2223,14 +2235,14 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
22232235
):
22242236
# Ordinal, i.e. whole number or Roman numeral followed by period:
22252237
# convert to an ordinal token
2226-
follow_token = token_stream[0]
2227-
if follow_token and not (
2228-
follow_token.kind in TOK.END
2229-
or follow_token.punctuation in {"„", '"'}
2238+
ord_token: Optional[Tok] = token_stream[0]
2239+
if ord_token and not (
2240+
ord_token.kind in TOK.END
2241+
or ord_token.punctuation in {"„", '"'}
22302242
or (
2231-
follow_token.kind == TOK.WORD
2232-
and follow_token.txt[0].isupper()
2233-
and month_for_token(follow_token, True) is None
2243+
ord_token.kind == TOK.WORD
2244+
and ord_token.txt[0].isupper()
2245+
and month_for_token(ord_token, True) is None
22342246
)
22352247
):
22362248
# OK: replace the number/Roman numeral and the period

test/test_detokenize.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
77
Tests for Tokenizer module
88
9-
Copyright (C) 2021 by Miðeind ehf.
9+
Copyright (C) 2022 by Miðeind ehf.
1010
Original author: Vilhjálmur Þorsteinsson
1111
1212
This software is licensed under the MIT License:
@@ -61,6 +61,7 @@ def should_be(s1: str, s2: str) -> None:
6161
"Páll var með \"netfangið\" palli@einn.i.heiminum.is.",
6262
"Páll var með „netfangið“ palli@einn.i.heiminum.is."
6363
)
64+
6465
# !!! BUG
6566
#should_be(
6667
# "Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").",

test/test_index_calculation.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
77
Tests for Tokenizer module
88
9-
Copyright (C) 2021 by Miðeind ehf.
9+
Copyright (C) 2022 by Miðeind ehf.
1010
1111
This software is licensed under the MIT License:
1212

test/test_tokenizer.py

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
66
Tests for Tokenizer module
77
8-
Copyright (C) 2021 by Miðeind ehf.
8+
Copyright (C) 2022 by Miðeind ehf.
99
Original author: Vilhjálmur Þorsteinsson
1010
1111
This software is licensed under the MIT License:
@@ -2346,6 +2346,31 @@ def test_normalization() -> None:
23462346
assert t.text_from_tokens(toklist) == 'Hann sagði : " Þú ert ágæt ! " .'
23472347
assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt ! “ ."
23482348

2349+
toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt!!??!".'))
2350+
assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt !!??! " .'
2351+
assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt ! “ ."
2352+
2353+
toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt??!?".'))
2354+
assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt ??!? " .'
2355+
assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt ? “ ."
2356+
2357+
toklist = list(t.tokenize('Jón,, farðu út.'))
2358+
assert t.text_from_tokens(toklist) == 'Jón ,, farðu út .'
2359+
assert t.normalized_text_from_tokens(toklist) == "Jón , farðu út ."
2360+
2361+
toklist = list(t.tokenize('Jón ,,farðu út.'))
2362+
assert t.text_from_tokens(toklist) == 'Jón ,, farðu út .'
2363+
assert t.normalized_text_from_tokens(toklist) == "Jón „ farðu út ."
2364+
2365+
toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt.....".'))
2366+
assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt ..... " .'
2367+
assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt … “ ."
2368+
2369+
toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt…..".'))
2370+
assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt ….. " .'
2371+
assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt … “ ."
2372+
2373+
23492374

23502375
def test_abbr_at_eos() -> None:
23512376
"""Test that 'Örn.' is not treated as an abbreviation here"""

test/test_tokenizer_tok.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
44
Tests for Tokenizer module
55
6-
Copyright (C) 2021 by Miðeind ehf.
6+
Copyright (C) 2022 by Miðeind ehf.
77
88
This software is licensed under the MIT License:
99

0 commit comments

Comments
 (0)