
Commit 12fa6d5

Refactoring of tokenizer.py to shorten very long functions

Parent: 82d7605

2 files changed: +457 −354 lines


src/tokenizer/__init__.py

Lines changed: 61 additions & 17 deletions
@@ -27,16 +27,36 @@
 """

 from .definitions import (
-    TP_LEFT, TP_CENTER, TP_RIGHT, TP_NONE, TP_WORD,
-    EN_DASH, EM_DASH,
-    KLUDGY_ORDINALS_PASS_THROUGH, KLUDGY_ORDINALS_MODIFY, KLUDGY_ORDINALS_TRANSLATE,
-    BIN_Tuple, BIN_TupleList
+    TP_LEFT,
+    TP_CENTER,
+    TP_RIGHT,
+    TP_NONE,
+    TP_WORD,
+    EN_DASH,
+    EM_DASH,
+    KLUDGY_ORDINALS_PASS_THROUGH,
+    KLUDGY_ORDINALS_MODIFY,
+    KLUDGY_ORDINALS_TRANSLATE,
+    BIN_Tuple,
+    BIN_TupleList,
 )
 from .tokenizer import (
-    TOK, Tok, tokenize, tokenize_without_annotation, split_into_sentences,
-    parse_tokens, correct_spaces, detokenize, mark_paragraphs, paragraphs,
-    normalized_text, normalized_text_from_tokens, text_from_tokens,
-    calculate_indexes, generate_rough_tokens, TokenStream
+    TOK,
+    Tok,
+    tokenize,
+    tokenize_without_annotation,
+    split_into_sentences,
+    parse_tokens,
+    correct_spaces,
+    detokenize,
+    mark_paragraphs,
+    paragraphs,
+    normalized_text,
+    normalized_text_from_tokens,
+    text_from_tokens,
+    calculate_indexes,
+    generate_raw_tokens,
+    TokenStream,
 )
 from .abbrev import Abbreviations, ConfigError
 from .version import __version__
@@ -45,14 +65,38 @@
 __copyright__ = "(C) 2021 Miðeind ehf."

 __all__ = (
-    "TP_LEFT","TP_CENTER", "TP_RIGHT", "TP_NONE", "TP_WORD",
-    "EN_DASH", "EM_DASH",
-    "KLUDGY_ORDINALS_PASS_THROUGH", "KLUDGY_ORDINALS_MODIFY", "KLUDGY_ORDINALS_TRANSLATE",
-    "BIN_Tuple", "BIN_TupleList",
-    "TOK", "Tok", "tokenize", "tokenize_without_annotation", "split_into_sentences",
-    "parse_tokens", "correct_spaces", "detokenize", "mark_paragraphs", "paragraphs",
-    "normalized_text", "normalized_text_from_tokens", "text_from_tokens",
-    "calculate_indexes", "generate_rough_tokens", "TokenStream",
-    "Abbreviations", "ConfigError", "__version__", "__author__", "__copyright__"
+    "__author__",
+    "__copyright__",
+    "__version__",
+    "Abbreviations",
+    "BIN_Tuple",
+    "BIN_TupleList",
+    "calculate_indexes",
+    "ConfigError",
+    "correct_spaces",
+    "detokenize",
+    "EM_DASH",
+    "EN_DASH",
+    "generate_raw_tokens",
+    "KLUDGY_ORDINALS_MODIFY",
+    "KLUDGY_ORDINALS_PASS_THROUGH",
+    "KLUDGY_ORDINALS_TRANSLATE",
+    "mark_paragraphs",
+    "normalized_text_from_tokens",
+    "normalized_text",
+    "paragraphs",
+    "parse_tokens",
+    "split_into_sentences",
+    "text_from_tokens",
+    "Tok",
+    "TOK",
+    "tokenize_without_annotation",
+    "tokenize",
+    "TokenStream",
+    "TP_CENTER",
+    "TP_LEFT",
+    "TP_NONE",
+    "TP_RIGHT",
+    "TP_WORD",
 )
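Note: besides reformatting the import lists to one name per line and reordering __all__, this hunk renames the exported generate_rough_tokens to generate_raw_tokens, so downstream imports of the old name will break. A minimal caller sketch after this commit; since the signature of generate_raw_tokens is not shown in this diff, only the import rename plus the unchanged tokenize() entry point from the same export list are exercised:

    # Hypothetical downstream module, updated for this commit.
    # Old import, no longer exported:
    #   from tokenizer import generate_rough_tokens
    from tokenizer import generate_raw_tokens, tokenize  # new name

    # The rest of the public API is unchanged; tokenize() yields Tok
    # objects with .kind and .txt attributes.
    for tok in tokenize("Hello, world! This is a test."):
        print(tok.kind, tok.txt)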
