
Commit 52cc1e6

Merge pull request #22 from mideind/onesentperline
Onesentperline
2 parents: 35bfb23 + 487725a · commit 52cc1e6

File tree

4 files changed: +189 −15 lines changed

README.rst

Lines changed: 38 additions & 5 deletions

@@ -104,11 +104,44 @@ on the command line:
 | | ``--json``      | Deep tokenization. Output token objects in JSON   |
 | |                 |   format, one per line.                           |
 +-------------------+---------------------------------------------------+
-| | ``--normalize`` | Normalize punctuation, causing e.g. quotes to be  |
-| |                 |   output in Icelandic form and hyphens to be      |
-| |                 |   regularized. This option is only applicable to  |
-| |                 |   shallow tokenization.                           |
-+-------------------+---------------------------------------------------+
+
+Other options can be specified on the command line:
+
++-----------------------------------+---------------------------------------------------+
+| | ``-n``                          | Normalize punctuation, causing e.g. quotes to be  |
+| | ``--normalize``                 |   output in Icelandic form and hyphens to be      |
+| |                                 |   regularized. This option is only applicable to  |
+| |                                 |   shallow tokenization.                           |
++-----------------------------------+---------------------------------------------------+
+| | ``-s``                          | Input contains strictly one sentence per line.    |
+| | ``--one_sent_per_line``         |                                                   |
++-----------------------------------+---------------------------------------------------+
+| | ``-m``                          | Degree signal in tokens denoting temperature      |
+| | ``--convert_measurements``      |   normalized (200° C -> 200 °C)                   |
++-----------------------------------+---------------------------------------------------+
+| | ``-a``                          | Additional annotation, usually handled by         |
+| | ``--with_annotation``           |   GreynirPackage, added to tokens.                |
++-----------------------------------+---------------------------------------------------+
+| | ``-p``                          | Numbers combined into one token with the          |
+| | ``--coalesce_percent``          |   following token denoting percentage word forms  |
+| |                                 |   (prósent, prósentustig, hundraðshlutar)         |
++-----------------------------------+---------------------------------------------------+
+| | ``-g``                          | Composite glyphs not replaced with a single       |
+| | ``--keep_composite_glyphs``     |   code point, so 'a' + '´' is not replaced by 'á' |
++-----------------------------------+---------------------------------------------------+
+| | ``-e``                          | HTML escape codes replaced,                       |
+| | ``--replace_html_escapes``      |   such as '&aacute;' -> 'á'                       |
++-----------------------------------+---------------------------------------------------+
+| | ``-c``                          | English-style decimal points and thousands        |
+| | ``--convert_numbers``           |   separators in numbers changed to Icelandic style|
++-----------------------------------+---------------------------------------------------+
+| | ``-k N``                        | Kludgy ordinal handling defined.                  |
+| | ``--handle_kludgy_ordinals N``  |   0: Returns the original mixed word form         |
+| |                                 |   1: Kludgy ordinals returned as pure word forms  |
+| |                                 |   2: Kludgy ordinals returned as pure numbers     |
++-----------------------------------+---------------------------------------------------+
+
+
 
 Type ``tokenize -h`` or ``tokenize --help`` to get a short help message.
 
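
As an aside (not part of the diff): when the package is used as a library, the flags in the table above correspond to keyword options of tokenizer.tokenize(). A minimal illustrative sketch, with invented sample text:

```python
# Illustrative sketch only: the CLI flags map onto keyword options of
# tokenizer.tokenize(); the sample text here is made up for demonstration.
import tokenizer

text = "Hitinn var 200° C og hækkaði um 12,3 prósent."
for tok in tokenizer.tokenize(
    text,
    convert_measurements=True,  # -m: normalize "200° C" to "200 °C"
    coalesce_percent=True,      # -p: merge "12,3 prósent" into one PERCENT token
):
    print(tok.kind, tok.txt)
```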

src/tokenizer/main.py

Lines changed: 78 additions & 1 deletion

@@ -85,7 +85,63 @@
 group.add_argument(
     "--json", help="Output one token per line in JSON format", action="store_true"
 )
-group.add_argument("--normalize", help="Normalize punctuation", action="store_true")
+
+parser.add_argument(
+    "-s",
+    "--one_sent_per_line",
+    action="store_true",
+    help="Input contains one sentence per line",
+)
+
+parser.add_argument(
+    "-m",
+    "--convert_measurements",
+    action="store_true",
+    help="Degree signal in temperature tokens normalized (200° C -> 200 °C)",
+)
+
+parser.add_argument(
+    "-p",
+    "--coalesce_percent",
+    action="store_true",
+    help="Numbers combined into one token with percentage word forms (prósent/prósentustig/hundraðshlutar)",
+)
+
+parser.add_argument(
+    "-n",
+    "--normalize",
+    action="store_true",
+    help="Outputs normalized value of punctuation tokens instead of original text",
+)
+
+parser.add_argument(
+    "-g",
+    "--keep_composite_glyphs",
+    action="store_true",
+    help="Composite glyphs not replaced with a single code point",
+)
+
+parser.add_argument(
+    "-e",
+    "--replace_html_escapes",
+    action="store_true",
+    help="Escape codes from HTML replaced",
+)
+
+parser.add_argument(
+    "-c",
+    "--convert_numbers",
+    action="store_true",
+    help="English-style decimal points and thousands separators in numbers changed to Icelandic style",
+)
+
+parser.add_argument(
+    "-k",
+    "--handle_kludgy_ordinals",
+    type=int,
+    default=0,
+    help="Kludgy ordinal handling defined. \n\t0: Returns the original word form. \n\t1: Ordinals returned as pure words. \n\t2: Ordinals returned as numbers.",
+)
 
 
 def main():

@@ -150,6 +206,27 @@ def val(t, quote_word=False):
     else:
         to_text = lambda t: t.txt
 
+    if args.convert_measurements:
+        options["convert_measurements"] = True
+
+    if args.coalesce_percent:
+        options["coalesce_percent"] = True
+
+    if args.keep_composite_glyphs:
+        options["replace_composite_glyphs"] = False  # True is the default in tokenizer.py
+
+    if args.replace_html_escapes:
+        options["replace_html_escapes"] = True
+
+    if args.convert_numbers:
+        options["convert_numbers"] = True
+
+    if args.one_sent_per_line:
+        options["one_sent_per_line"] = True
+
+    if args.handle_kludgy_ordinals:
+        options["handle_kludgy_ordinals"] = args.handle_kludgy_ordinals
+
     # Configure our JSON dump function
     json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))
     curr_sent = []
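
Worth noting about the second hunk: a key is written into the options dict only when its flag is actually set, so the defaults in tokenizer.py continue to apply otherwise, and -g inverts a default rather than setting one. A condensed restatement of that wiring (a sketch, not the committed code; build_options is a hypothetical helper):

```python
import argparse

def build_options(args: argparse.Namespace) -> dict:
    """Translate parsed CLI flags into a tokenize() options dict (sketch)."""
    options = {}
    # Plain boolean flags: copy into the dict only when set.
    for flag in (
        "convert_measurements",
        "coalesce_percent",
        "replace_html_escapes",
        "convert_numbers",
        "one_sent_per_line",
    ):
        if getattr(args, flag):
            options[flag] = True
    if args.keep_composite_glyphs:
        # -g inverts a default: replace_composite_glyphs is True in tokenizer.py
        options["replace_composite_glyphs"] = False
    if args.handle_kludgy_ordinals:
        options["handle_kludgy_ordinals"] = args.handle_kludgy_ordinals
    return options
```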

src/tokenizer/tokenizer.py

Lines changed: 17 additions & 8 deletions

@@ -698,7 +698,7 @@ def html_escape(match):
         return unicode_chr(int(g[1:]))
 
 
-def gen_from_string(txt, replace_composite_glyphs=True, replace_html_escapes=False):
+def gen_from_string(txt, replace_composite_glyphs=True, replace_html_escapes=False, one_sent_per_line=False):
     """ Generate rough tokens from a string """
     if replace_composite_glyphs:
         # Replace composite glyphs with single code points

@@ -712,18 +712,25 @@ def gen_from_string(txt, replace_composite_glyphs=True, replace_html_escapes=Fal
     # newlines separated only by whitespace), we interpret
     # them as hard sentence boundaries
     first = True
-    for span in re.split(r"\n\s*\n", txt):
+    if one_sent_per_line:
+        # We know there's a single sentence per line
+        # Only split on newline
+        splitter = re.split(r"\n", txt)
+    else:
+        splitter = re.split(r"\n\s*\n", txt)
+
+    for span in splitter:
         if first:
             first = False
         else:
             # Return a sentence splitting token in lieu of the
             # newline pair that separates the spans
             yield ""
         for w in span.split():
-            yield w
-
+            if w:
+                yield w
 
-def gen(text_or_gen, replace_composite_glyphs=True, replace_html_escapes=False):
+def gen(text_or_gen, replace_composite_glyphs=True, replace_html_escapes=False, one_sent_per_line=False):
     """ Generate rough tokens from a string or a generator """
     if text_or_gen is None:
         return

@@ -741,7 +748,7 @@ def gen(text_or_gen, replace_composite_glyphs=True, replace_html_escapes=False):
         txt = make_str(txt)
         # Yield the contained rough tokens
         for w in gen_from_string(
-            txt, replace_composite_glyphs, replace_html_escapes
+            txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line
         ):
             yield w
 

@@ -769,6 +776,7 @@ def parse_tokens(txt, **options):
     convert_numbers = options.get("convert_numbers", False)
     replace_composite_glyphs = options.get("replace_composite_glyphs", True)
     replace_html_escapes = options.get("replace_html_escapes", False)
+    one_sent_per_line = options.get("one_sent_per_line", False)
 
     # The default behavior for kludgy ordinals is to pass them
     # through as word tokens

@@ -795,7 +803,7 @@
     # 7) The process is repeated from step 4) until the current raw token is
     #    exhausted. At that point, we obtain the next token and start from 2).
 
-    for w in gen(txt, replace_composite_glyphs, replace_html_escapes):
+    for w in gen(txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line):
 
         # Handle each sequence w of non-whitespace characters
 

@@ -804,6 +812,7 @@
             yield TOK.Split_Sentence()
             continue
 
+
         if w.isalpha() or w in SI_UNITS:
             # Shortcut for most common case: pure word
             yield TOK.Word(w)

@@ -2022,7 +2031,7 @@ def convert_to_num(token):
             )
             next_token = next(token_stream)
         else:
-            # Check for [number] 'prósent/prósentustig/hundraðshluta'
+            # Check for [number] 'prósent/prósentustig/hundraðshlutar'
             if coalesce_percent:
                 percentage = match_stem_list(next_token, PERCENTAGES)
             else:
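
The heart of the change is the choice of splitting regex in gen_from_string: the default treats only a blank line (a newline pair, possibly with whitespace in between) as a hard sentence boundary, while the new branch splits on every newline. A standalone illustration of the two regexes, using the sample text from the new test below:

```python
import re

txt = "Hér er hestur\nmaður beit hund"

# Default: only a blank line is a hard boundary, so this stays one span.
print(re.split(r"\n\s*\n", txt))  # ['Hér er hestur\nmaður beit hund']

# With one_sent_per_line=True, every newline becomes a hard boundary.
print(re.split(r"\n", txt))       # ['Hér er hestur', 'maður beit hund']
```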

test/test_tokenizer.py

Lines changed: 56 additions & 1 deletion

@@ -540,6 +540,11 @@ def test_single_tokens():
         ("12,3 prósentustig", TOK.PERCENT),
     ]
 
+    TEST_CASES_CONVERT_MEASUREMENTS = [
+        ("200° C", [Tok(TOK.MEASUREMENT, "200 °C", ('K', 473.15))]),
+        ("80° F", [Tok(TOK.MEASUREMENT, "80 °F", ('K', 299.8166666666667))]),
+    ]
+
     def run_test(test_cases, **options):
         for test_case in test_cases:
             if len(test_case) == 3:

@@ -594,7 +599,10 @@ def run_test(test_cases, **options):
         TEST_CASES_COALESCE_PERCENT,
         coalesce_percent=True
     )
-
+    run_test(
+        TEST_CASES_CONVERT_MEASUREMENTS,
+        convert_measurements=True
+    )
 
 def test_sentences():
 

@@ -1280,6 +1288,8 @@ def gen(s):
         "Vigur kom með fullfermi að landi",
     ]
 
+    # Test onesentperline
+
 
 def test_normalization():
     toklist = list(t.tokenize("Hann sagði: \"Þú ert ágæt!\"."))

@@ -1356,6 +1366,51 @@ def test_html_escapes():
     assert toklist == correct
 
 
+def test_one_sent_per_line():
+    toklist = list(
+        t.tokenize(
+            "Hér er hestur\nmaður beit hund",
+            one_sent_per_line=True
+        )
+
+    )
+
+    correct = [
+        Tok(kind=11001, txt=None, val=(0, None)),
+        Tok(kind=6, txt='Hér', val=None),
+        Tok(kind=6, txt='er', val=None),
+        Tok(kind=6, txt='hestur', val=None),
+        Tok(kind=11002, txt=None, val=None),
+        Tok(kind=11001, txt=None, val=(0, None)),
+        Tok(kind=6, txt='maður', val=None),
+        Tok(kind=6, txt='beit', val=None),
+        Tok(kind=6, txt='hund', val=None),
+        Tok(kind=11002, txt=None, val=None),
+    ]
+    assert toklist == correct
+
+    # Test without option
+    toklist = list(
+        t.tokenize(
+            "Hér er hestur\nmaður beit hund",
+            one_sent_per_line=False
+        )
+
+    )
+
+    correct = [
+        Tok(kind=11001, txt=None, val=(0, None)),
+        Tok(kind=6, txt='Hér', val=None),
+        Tok(kind=6, txt='er', val=None),
+        Tok(kind=6, txt='hestur', val=None),
+        Tok(kind=6, txt='maður', val=None),
+        Tok(kind=6, txt='beit', val=None),
+        Tok(kind=6, txt='hund', val=None),
+        Tok(kind=11002, txt=None, val=None),
+    ]
+    assert toklist == correct
+
+
 if __name__ == "__main__":
 
     test_single_tokens()
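
The expected token streams pin down the behavior: with the option on, the single newline yields two S_BEGIN/S_END pairs (kind 11001/11002); with it off, only one. A sketch regrouping the same stream into sentence strings through the public API (assumes only the documented TOK constants):

```python
# Sketch of the scenario exercised by test_one_sent_per_line(), regrouping
# the token stream into sentences (TOK.S_BEGIN == 11001, TOK.S_END == 11002).
import tokenizer as t

sentences, current = [], []
for tok in t.tokenize("Hér er hestur\nmaður beit hund", one_sent_per_line=True):
    if tok.kind == t.TOK.S_BEGIN:
        current = []
    elif tok.kind == t.TOK.S_END:
        sentences.append(" ".join(current))
    elif tok.txt:
        current.append(tok.txt)

print(sentences)  # ['Hér er hestur', 'maður beit hund']
```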
