Merge pull request #38 from mideind/ospl

vthorsteinsson · web-flow · commit 54e2c0ae20b8 · 2022-03-10T12:20:28.000Z
Ospl
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2021 Miðeind ehf.
+Copyright (C) 2022 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
     Tokenizer for Icelandic text
 
-    Copyright (C) 2021 Miðeind ehf.
+    Copyright (C) 2022 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/Abbrev.conf b/src/tokenizer/Abbrev.conf
@@ -1,7 +1,7 @@
 #
 # List of abbreviations for tokenization of Icelandic text
 #
-# Copyright (C) 2021 Miðeind ehf.
+# Copyright (C) 2022 Miðeind ehf.
 # Original author: Vilhjálmur Þorsteinsson
 #
 # This software is licensed under the MIT License:
diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py
@@ -1,6 +1,6 @@
 """
 
-    Copyright(C) 2021 Miðeind ehf.
+    Copyright(C) 2022 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -62,7 +62,7 @@
 from .version import __version__
 
 __author__ = "Miðeind ehf"
-__copyright__ = "(C) 2021 Miðeind ehf."
+__copyright__ = "(C) 2022 Miðeind ehf."
 
 __all__ = (
     "__author__",
diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
@@ -2,7 +2,7 @@
 
     Abbreviations module for tokenization of Icelandic text
 
-    Copyright (C) 2021 Miðeind ehf.
+    Copyright (C) 2022 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
@@ -2,7 +2,7 @@
 
     Definitions used for tokenization of Icelandic text
 
-    Copyright (C) 2021 Miðeind ehf.
+    Copyright (C) 2022 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
@@ -3,7 +3,7 @@
 
     Tokenizer for Icelandic text
 
-    Copyright (C) 2021 Miðeind ehf.
+    Copyright (C) 2022 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -110,7 +110,10 @@
 )
 
 parser.add_argument(
-    "-o", "--original", action="store_true", help="Outputs original text of tokens",
+    "-o",
+    "--original",
+    action="store_true",
+    help="Outputs original text of tokens",
 )
 
 parser.add_argument(
@@ -152,27 +155,27 @@
 
 
 def main() -> None:
-    """ Main function, called when the tokenize command is invoked """
+    """Main function, called when the tokenize command is invoked"""
 
     args = parser.parse_args()
     options: Dict[str, bool] = dict()
 
     def quote(s: str) -> str:
-        """ Return the string s within double quotes, and with any contained
-            backslashes and double quotes escaped with a backslash """
+        """Return the string s within double quotes, and with any contained
+        backslashes and double quotes escaped with a backslash"""
         return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
 
     def spanquote(l: List[int]) -> str:
-        """ Return the list l as a string within double quotes """
+        """Return the list l as a string within double quotes"""
         return '"' + "-".join(str(x) for x in l) + '"'
 
     def gen(f: TextIO) -> Iterator[str]:
-        """ Generate the lines of text in the input file """
+        """Generate the lines of text in the input file"""
         for line in f:
             yield line
 
     def val(t: Tok, quote_word: bool = False) -> Any:
-        """ Return the value part of the token t """
+        """Return the value part of the token t"""
         if t.val is None:
             return None
         if t.kind == TOK.WORD:
@@ -252,8 +255,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
     # Configure our JSON dump function
     json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))
     curr_sent: List[str] = []
-    sep = "" if args.original else " "
-
+    tsep = "" if args.original else " "  # token separator
     for t in tokenize(gen(args.infile), **options):
         if args.csv:
             # Output the tokens in CSV format, one line per token
@@ -285,13 +287,18 @@ def val(t: Tok, quote_word: bool = False) -> Any:
                 d["s"] = t.origin_spans
             print(json_dumps(d), file=args.outfile)
         else:
-            # Normal shallow parse, one line per sentence,
+            # Normal shallow parse, sentences separated by newline by default,
             # tokens separated by spaces
+            if t.kind in TOK.END:
+                # End of sentence/paragraph
+                if curr_sent:
+                    print(tsep.join(curr_sent), file=args.outfile)
+                    curr_sent = []
             txt = to_text(t)
             if txt:
                 curr_sent.append(txt)
     if curr_sent:
-        print(sep.join(curr_sent), file=args.outfile)
+        print(tsep.join(curr_sent), file=args.outfile)
 
 
 if __name__ == "__main__":
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
@@ -2,7 +2,7 @@
 
     Tokenizer for Icelandic text
 
-    Copyright (C) 2021 Miðeind ehf.
+    Copyright (C) 2022 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -68,6 +68,11 @@
 _T = TypeVar("_T", bound="Tok")
 
 
+# Set of punctuation characters that are grouped into one
+# normalized exclamation
+EXCLAMATIONS = frozenset(("!", "?"))
+
+
 class Tok:
 
     """Information about a single token"""
@@ -1655,37 +1660,34 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
             elif rtxt.startswith("[…]"):
                 punct, rt = rt.split(3)
                 yield TOK.Punctuation(punct)
-            elif rtxt.startswith("..."):
-                # Treat ellipsis as one piece of punctuation
+            elif rtxt.startswith("...") or rtxt.startswith("…"):
+                # Treat >= 3 periods, or a run starting with "…", as one ellipsis
                 numdots = 0
                 for c in rtxt:
-                    if c == ".":
+                    if c == "." or c == "…":
                         numdots += 1
                     else:
                         break
                 dots, rt = rt.split(numdots)
                 yield TOK.Punctuation(dots, normalized="…")
-            elif rtxt.startswith("…"):
-                # Treat ellipsis as one piece of punctuation
-                numdots = 0
-                for c in rtxt:
-                    if c == "…":
-                        numdots += 1
+            elif rtxt.startswith(".."):
+                # Normalize two periods to one
+                dots, rt = rt.split(2)
+                yield TOK.Punctuation(dots, normalized=".")
+            elif rtxt.startswith(",,") and rtxt[2:3].isalpha():
+                # Probably someone trying to type opening double quotes with commas
+                punct, rt = rt.split(2)
+                yield TOK.Punctuation(punct, normalized="„")
+            elif rtxt.startswith(",,"):
+                # Coalesce multiple commas into one normalized comma
+                numcommas = 2
+                for c in rtxt[2:]:
+                    if c == ",":
+                        numcommas += 1
                     else:
                         break
-                dots, rt = rt.split(numdots)
-                yield TOK.Punctuation(dots, normalized="…")
-                # TODO LAGA Hér ætti að safna áfram.
-            # TODO Was at the end of a word or by itself, should be ",".
-            # Won't correct automatically, check for M6
-            elif rt.txt == ",,":
-                punct, rt = rt.split(2)
+                punct, rt = rt.split(numcommas)
                 yield TOK.Punctuation(punct, normalized=",")
-            # TODO STILLING kommum í upphafi orðs breytt í gæsalappir
-            elif rtxt.startswith(",,"):
-                # Probably an idiot trying to type opening double quotes with commas
-                punct, rt = rt.split(2)
-                yield TOK.Punctuation(punct, normalized="„")
             elif rtxt[0] in HYPHENS:
                 # Normalize all hyphens the same way
                 punct, rt = rt.split(1)
@@ -1715,6 +1717,16 @@ def parse(self, rt: Tok) -> Iterable[Tok]:
                     # Return the @-sign and leave the rest
                     punct, rt = rt.split(1)
                     yield TOK.Punctuation(punct)
+            elif len(rtxt) >= 2 and frozenset(rtxt) <= EXCLAMATIONS:
+                # Possibly '???!!!' or something of the sort
+                numpunct = 2
+                for p in rtxt[2:]:
+                    if p in EXCLAMATIONS:
+                        numpunct += 1
+                    else:
+                        break
+                punct, rt = rt.split(numpunct)
+                yield TOK.Punctuation(punct, normalized=rtxt[0])
             else:
                 punct, rt = rt.split(1)
                 yield TOK.Punctuation(punct)
@@ -2223,14 +2235,14 @@ def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
                 ):
                     # Ordinal, i.e. whole number or Roman numeral followed by period:
                     # convert to an ordinal token
-                    follow_token = token_stream[0]
-                    if follow_token and not (
-                        follow_token.kind in TOK.END
-                        or follow_token.punctuation in {"„", '"'}
+                    ord_token: Optional[Tok] = token_stream[0]
+                    if ord_token and not (
+                        ord_token.kind in TOK.END
+                        or ord_token.punctuation in {"„", '"'}
                         or (
-                            follow_token.kind == TOK.WORD
-                            and follow_token.txt[0].isupper()
-                            and month_for_token(follow_token, True) is None
+                            ord_token.kind == TOK.WORD
+                            and ord_token.txt[0].isupper()
+                            and month_for_token(ord_token, True) is None
                         )
                     ):
                         # OK: replace the number/Roman numeral and the period
diff --git a/test/test_detokenize.py b/test/test_detokenize.py
@@ -6,7 +6,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2021 by Miðeind ehf.
+    Copyright (C) 2022 by Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -61,6 +61,7 @@ def should_be(s1: str, s2: str) -> None:
         "Páll var með \"netfangið\" palli@einn.i.heiminum.is.",
         "Páll var með „netfangið“ palli@einn.i.heiminum.is."
     )
+
     # !!! BUG
     #should_be(
     #    "Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").",
diff --git a/test/test_index_calculation.py b/test/test_index_calculation.py
@@ -6,7 +6,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2021 by Miðeind ehf.
+    Copyright (C) 2022 by Miðeind ehf.
 
     This software is licensed under the MIT License:
 
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
@@ -5,7 +5,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2021 by Miðeind ehf.
+    Copyright (C) 2022 by Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -2346,6 +2346,31 @@ def test_normalization() -> None:
     assert t.text_from_tokens(toklist) == 'Hann sagði : " Þú ert ágæt ! " .'
     assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt ! “ ."
 
+    toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt!!??!".'))
+    assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt !!??! " .'
+    assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt ! “ ."
+
+    toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt??!?".'))
+    assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt ??!? " .'
+    assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt ? “ ."
+
+    toklist = list(t.tokenize('Jón,, farðu út.'))
+    assert t.text_from_tokens(toklist) == 'Jón ,, farðu út .'
+    assert t.normalized_text_from_tokens(toklist) == "Jón , farðu út ."
+
+    toklist = list(t.tokenize('Jón ,,farðu út.'))
+    assert t.text_from_tokens(toklist) == 'Jón ,, farðu út .'
+    assert t.normalized_text_from_tokens(toklist) == "Jón „ farðu út ."
+
+    toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt.....".'))
+    assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt ..... " .'
+    assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt … “ ."
+
+    toklist = list(t.tokenize('Hann sagði: ,,Þú ert ágæt…..".'))
+    assert t.text_from_tokens(toklist) == 'Hann sagði : ,, Þú ert ágæt ….. " .'
+    assert t.normalized_text_from_tokens(toklist) == "Hann sagði : „ Þú ert ágæt … “ ."
+
+
 
 def test_abbr_at_eos() -> None:
     """Test that 'Örn.' is not treated as an abbreviation here"""
diff --git a/test/test_tokenizer_tok.py b/test/test_tokenizer_tok.py
@@ -3,7 +3,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2021 by Miðeind ehf.
+    Copyright (C) 2022 by Miðeind ehf.
 
     This software is licensed under the MIT License:
 

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`#`
`2`	`2`	`# List of abbreviations for tokenization of Icelandic text`
`3`	`3`	`#`
`4`		`-# Copyright (C) 2021 Miðeind ehf.`
	`4`	`+# Copyright (C) 2022 Miðeind ehf.`
`5`	`5`	`# Original author: Vilhjálmur Þorsteinsson`
`6`	`6`	`#`
`7`	`7`	`# This software is licensed under the MIT License:`