
Commit 52cc1e6

Merge pull request #22 from mideind/onesentperline
Onesentperline
2 parents: 35bfb23 + 487725a · commit 52cc1e6

File tree

4 files changed: +189 −15 lines changed

README.rst

Lines changed: 38 additions & 5 deletions

@@ -104,11 +104,44 @@ on the command line:
 | | ``--json``      | Deep tokenization. Output token objects in JSON   |
 | |                 |   format, one per line.                           |
 +-------------------+---------------------------------------------------+
-| | ``--normalize`` | Normalize punctuation, causing e.g. quotes to be  |
-| |                 |   output in Icelandic form and hyphens to be      |
-| |                 |   regularized. This option is only applicable to  |
-| |                 |   shallow tokenization.                           |
-+-------------------+---------------------------------------------------+
+
+Other options can be specified on the command line:
+
++-----------------------------------+---------------------------------------------------+
+| | ``-n``                          | Normalize punctuation, causing e.g. quotes to be  |
+| | ``--normalize``                 |   output in Icelandic form and hyphens to be      |
+| |                                 |   regularized. This option is only applicable to  |
+| |                                 |   shallow tokenization.                           |
++-----------------------------------+---------------------------------------------------+
+| | ``-s``                          | Input contains strictly one sentence per line.    |
+| | ``--one_sent_per_line``         |                                                   |
++-----------------------------------+---------------------------------------------------+
+| | ``-m``                          | Degree signal in tokens denoting temperature      |
+| | ``--convert_measurements``      |   normalized (200° C -> 200 °C)                   |
++-----------------------------------+---------------------------------------------------+
+| | ``-a``                          | Additional annotation, usually handled by         |
+| | ``--with_annotation``           |   GreynirPackage, added to tokens.                |
++-----------------------------------+---------------------------------------------------+
+| | ``-p``                          | Numbers combined into one token with the          |
+| | ``--coalesce_percent``          |   following token denoting percentage word forms  |
+| |                                 |   (prósent, prósentustig, hundraðshlutar)         |
++-----------------------------------+---------------------------------------------------+
+| | ``-g``                          | Composite glyphs not replaced with a single       |
+| | ``--keep_composite_glyphs``     |   code point, so 'a' + '´' is not replaced by 'á' |
++-----------------------------------+---------------------------------------------------+
+| | ``-e``                          | HTML escape codes replaced,                       |
+| | ``--replace_html_escapes``      |   such as '&aacute;' -> 'á'                       |
++-----------------------------------+---------------------------------------------------+
+| | ``-c``                          | English-style decimal points and thousands        |
+| | ``--convert_numbers``           |   separators in numbers changed to Icelandic style|
++-----------------------------------+---------------------------------------------------+
+| | ``-k N``                        | Kludgy ordinal handling defined.                  |
+| | ``--handle_kludgy_ordinals N``  |   0: Returns the original mixed word form         |
+| |                                 |   1: Kludgy ordinals returned as pure word forms  |
+| |                                 |   2: Kludgy ordinals returned as pure numbers     |
++-----------------------------------+---------------------------------------------------+
+
+
 
 Type ``tokenize -h`` or ``tokenize --help`` to get a short help message.
 
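
As an aside (not part of the diff): when the package is used as a library, the flags in the table above correspond to keyword options of tokenizer.tokenize(). A minimal illustrative sketch, with invented sample text:

```python
# Illustrative sketch only: the CLI flags map onto keyword options of
# tokenizer.tokenize(); the sample text here is made up for demonstration.
import tokenizer

text = "Hitinn var 200° C og hækkaði um 12,3 prósent."
for tok in tokenizer.tokenize(
    text,
    convert_measurements=True,  # -m: normalize "200° C" to "200 °C"
    coalesce_percent=True,      # -p: merge "12,3 prósent" into one PERCENT token
):
    print(tok.kind, tok.txt)
```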

src/tokenizer/main.py

Lines changed: 78 additions & 1 deletion

@@ -85,7 +85,63 @@
 group.add_argument(
     "--json", help="Output one token per line in JSON format", action="store_true"
 )
-group.add_argument("--normalize", help="Normalize punctuation", action="store_true")
+
+parser.add_argument(
+    "-s",
+    "--one_sent_per_line",
+    action="store_true",
+    help="Input contains one sentence per line",
+)
+
+parser.add_argument(
+    "-m",
+    "--convert_measurements",
+    action="store_true",
+    help="Degree signal in temperature tokens normalized (200° C -> 200 °C)",
+)
+
+parser.add_argument(
+    "-p",
+    "--coalesce_percent",
+    action="store_true",
+    help="Numbers combined into one token with percentage word forms (prósent/prósentustig/hundraðshlutar)",
+)
+
+parser.add_argument(
+    "-n",
+    "--normalize",
+    action="store_true",
+    help="Outputs normalized value of punctuation tokens instead of original text",
+)
+
+parser.add_argument(
+    "-g",
+    "--keep_composite_glyphs",
+    action="store_true",
+    help="Composite glyphs not replaced with a single code point",
+)
+
+parser.add_argument(
+    "-e",
+    "--replace_html_escapes",
+    action="store_true",
+    help="Escape codes from HTML replaced",
+)
+
+parser.add_argument(
+    "-c",
+    "--convert_numbers",
+    action="store_true",
+    help="English-style decimal points and thousands separators in numbers changed to Icelandic style",
+)
+
+parser.add_argument(
+    "-k",
+    "--handle_kludgy_ordinals",
+    type=int,
+    default=0,
+    help="Kludgy ordinal handling defined. \n\t0: Returns the original word form. \n\t1: Ordinals returned as pure words. \n\t2: Ordinals returned as numbers.",
+)
 
 
 def main():

@@ -150,6 +206,27 @@ def val(t, quote_word=False):
     else:
         to_text = lambda t: t.txt
 
+    if args.convert_measurements:
+        options["convert_measurements"] = True
+
+    if args.coalesce_percent:
+        options["coalesce_percent"] = True
+
+    if args.keep_composite_glyphs:
+        options["replace_composite_glyphs"] = False  # True is the default in tokenizer.py
+
+    if args.replace_html_escapes:
+        options["replace_html_escapes"] = True
+
+    if args.convert_numbers:
+        options["convert_numbers"] = True
+
+    if args.one_sent_per_line:
+        options["one_sent_per_line"] = True
+
+    if args.handle_kludgy_ordinals:
+        options["handle_kludgy_ordinals"] = args.handle_kludgy_ordinals
+
     # Configure our JSON dump function
     json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))
     curr_sent = []
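
Worth noting about the second hunk: a key is written into the options dict only when its flag is actually set, so the defaults in tokenizer.py continue to apply otherwise, and -g inverts a default rather than setting one. A condensed restatement of that wiring (a sketch, not the committed code; build_options is a hypothetical helper):

```python
import argparse

def build_options(args: argparse.Namespace) -> dict:
    """Translate parsed CLI flags into a tokenize() options dict (sketch)."""
    options = {}
    # Plain boolean flags: copy into the dict only when set.
    for flag in (
        "convert_measurements",
        "coalesce_percent",
        "replace_html_escapes",
        "convert_numbers",
        "one_sent_per_line",
    ):
        if getattr(args, flag):
            options[flag] = True
    if args.keep_composite_glyphs:
        # -g inverts a default: replace_composite_glyphs is True in tokenizer.py
        options["replace_composite_glyphs"] = False
    if args.handle_kludgy_ordinals:
        options["handle_kludgy_ordinals"] = args.handle_kludgy_ordinals
    return options
```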

src/tokenizer/tokenizer.py

Lines changed: 17 additions & 8 deletions

@@ -698,7 +698,7 @@ def html_escape(match):
         return unicode_chr(int(g[1:]))
 
 
-def gen_from_string(txt, replace_composite_glyphs=True, replace_html_escapes=False):
+def gen_from_string(txt, replace_composite_glyphs=True, replace_html_escapes=False, one_sent_per_line=False):
     """ Generate rough tokens from a string """
     if replace_composite_glyphs:
         # Replace composite glyphs with single code points

@@ -712,18 +712,25 @@ def gen_from_string(txt, replace_composite_glyphs=True, replace_html_escapes=Fal
     # newlines separated only by whitespace), we interpret
     # them as hard sentence boundaries
     first = True
-    for span in re.split(r"\n\s*\n", txt):
+    if one_sent_per_line:
+        # We know there's a single sentence per line
+        # Only split on newline
+        splitter = re.split(r"\n", txt)
+    else:
+        splitter = re.split(r"\n\s*\n", txt)
+
+    for span in splitter:
         if first:
             first = False
         else:
             # Return a sentence splitting token in lieu of the
             # newline pair that separates the spans
             yield ""
         for w in span.split():
-            yield w
-
+            if w:
+                yield w
 
-def gen(text_or_gen, replace_composite_glyphs=True, replace_html_escapes=False):
+def gen(text_or_gen, replace_composite_glyphs=True, replace_html_escapes=False, one_sent_per_line=False):
     """ Generate rough tokens from a string or a generator """
     if text_or_gen is None:
         return

@@ -741,7 +748,7 @@ def gen(text_or_gen, replace_composite_glyphs=True, replace_html_escapes=False):
         txt = make_str(txt)
         # Yield the contained rough tokens
         for w in gen_from_string(
-            txt, replace_composite_glyphs, replace_html_escapes
+            txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line
         ):
             yield w
 

@@ -769,6 +776,7 @@ def parse_tokens(txt, **options):
     convert_numbers = options.get("convert_numbers", False)
     replace_composite_glyphs = options.get("replace_composite_glyphs", True)
     replace_html_escapes = options.get("replace_html_escapes", False)
+    one_sent_per_line = options.get("one_sent_per_line", False)
 
     # The default behavior for kludgy ordinals is to pass them
     # through as word tokens

@@ -795,7 +803,7 @@
     # 7) The process is repeated from step 4) until the current raw token is
     #    exhausted. At that point, we obtain the next token and start from 2).
 
-    for w in gen(txt, replace_composite_glyphs, replace_html_escapes):
+    for w in gen(txt, replace_composite_glyphs, replace_html_escapes, one_sent_per_line):
 
         # Handle each sequence w of non-whitespace characters
 

@@ -804,6 +812,7 @@
             yield TOK.Split_Sentence()
             continue
 
+
         if w.isalpha() or w in SI_UNITS:
             # Shortcut for most common case: pure word
             yield TOK.Word(w)

@@ -2022,7 +2031,7 @@ def convert_to_num(token):
             )
             next_token = next(token_stream)
         else:
-            # Check for [number] 'prósent/prósentustig/hundraðshluta'
+            # Check for [number] 'prósent/prósentustig/hundraðshlutar'
             if coalesce_percent:
                 percentage = match_stem_list(next_token, PERCENTAGES)
             else:
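
The heart of the change is the choice of splitting regex in gen_from_string: the default treats only a blank line (a newline pair, possibly with whitespace in between) as a hard sentence boundary, while the new branch splits on every newline. A standalone illustration of the two regexes, using the sample text from the new test below:

```python
import re

txt = "Hér er hestur\nmaður beit hund"

# Default: only a blank line is a hard boundary, so this stays one span.
print(re.split(r"\n\s*\n", txt))  # ['Hér er hestur\nmaður beit hund']

# With one_sent_per_line=True, every newline becomes a hard boundary.
print(re.split(r"\n", txt))       # ['Hér er hestur', 'maður beit hund']
```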

test/test_tokenizer.py

Lines changed: 56 additions & 1 deletion

@@ -540,6 +540,11 @@ def test_single_tokens():
         ("12,3 prósentustig", TOK.PERCENT),
     ]
 
+    TEST_CASES_CONVERT_MEASUREMENTS = [
+        ("200° C", [Tok(TOK.MEASUREMENT, "200 °C", ('K', 473.15))]),
+        ("80° F", [Tok(TOK.MEASUREMENT, "80 °F", ('K', 299.8166666666667))]),
+    ]
+
     def run_test(test_cases, **options):
         for test_case in test_cases:
             if len(test_case) == 3:

@@ -594,7 +599,10 @@ def run_test(test_cases, **options):
         TEST_CASES_COALESCE_PERCENT,
         coalesce_percent=True
     )
-
+    run_test(
+        TEST_CASES_CONVERT_MEASUREMENTS,
+        convert_measurements=True
+    )
 
 def test_sentences():
 

@@ -1280,6 +1288,8 @@ def gen(s):
         "Vigur kom með fullfermi að landi",
     ]
 
+    # Test onesentperline
+
 
 def test_normalization():
     toklist = list(t.tokenize("Hann sagði: \"Þú ert ágæt!\"."))

@@ -1356,6 +1366,51 @@ def test_html_escapes():
     assert toklist == correct
 
 
+def test_one_sent_per_line():
+    toklist = list(
+        t.tokenize(
+            "Hér er hestur\nmaður beit hund",
+            one_sent_per_line=True
+        )
+
+    )
+
+    correct = [
+        Tok(kind=11001, txt=None, val=(0, None)),
+        Tok(kind=6, txt='Hér', val=None),
+        Tok(kind=6, txt='er', val=None),
+        Tok(kind=6, txt='hestur', val=None),
+        Tok(kind=11002, txt=None, val=None),
+        Tok(kind=11001, txt=None, val=(0, None)),
+        Tok(kind=6, txt='maður', val=None),
+        Tok(kind=6, txt='beit', val=None),
+        Tok(kind=6, txt='hund', val=None),
+        Tok(kind=11002, txt=None, val=None),
+    ]
+    assert toklist == correct
+
+    # Test without option
+    toklist = list(
+        t.tokenize(
+            "Hér er hestur\nmaður beit hund",
+            one_sent_per_line=False
+        )
+
+    )
+
+    correct = [
+        Tok(kind=11001, txt=None, val=(0, None)),
+        Tok(kind=6, txt='Hér', val=None),
+        Tok(kind=6, txt='er', val=None),
+        Tok(kind=6, txt='hestur', val=None),
+        Tok(kind=6, txt='maður', val=None),
+        Tok(kind=6, txt='beit', val=None),
+        Tok(kind=6, txt='hund', val=None),
+        Tok(kind=11002, txt=None, val=None),
+    ]
+    assert toklist == correct
+
+
 if __name__ == "__main__":
 
     test_single_tokens()
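
The expected token streams pin down the behavior: with the option on, the single newline yields two S_BEGIN/S_END pairs (kind 11001/11002); with it off, only one. A sketch regrouping the same stream into sentence strings through the public API (assumes only the documented TOK constants):

```python
# Sketch of the scenario exercised by test_one_sent_per_line(), regrouping
# the token stream into sentences (TOK.S_BEGIN == 11001, TOK.S_END == 11002).
import tokenizer as t

sentences, current = [], []
for tok in t.tokenize("Hér er hestur\nmaður beit hund", one_sent_per_line=True):
    if tok.kind == t.TOK.S_BEGIN:
        current = []
    elif tok.kind == t.TOK.S_END:
        sentences.append(" ".join(current))
    elif tok.txt:
        current.append(tok.txt)

print(sentences)  # ['Hér er hestur', 'maður beit hund']
```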
