
Commit a24a30f

Version 3.3.2; minor fixes
1 parent: 12fa6d5

File tree

3 files changed: +6 lines, -5 lines

README.rst

Lines changed: 1 addition & 0 deletions
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
 Changelog
 ---------
 
+* Version 3.3.2: Internal refactoring; bug fixes in paragraph handling.
 * Version 3.3.1: Fixed bug where opening quotes at the start of paragraphs
   were sometimes incorrectly recognized and normalized.
 * Version 3.2.0: Numbers and amounts that consist of word tokens only ('sex hundruð')

src/tokenizer/version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "3.3.1"
+__version__ = "3.3.2"

test/test_tokenizer_tok.py

Lines changed: 4 additions & 4 deletions
@@ -254,29 +254,29 @@ def test_split_without_origin_tracking() -> None:
 
 def test_html_escapes_with_origin_tracking() -> None:
     test_string = "xyazáwab"
-    tokens = list(tokenizer.generate_rough_tokens(test_string, replace_html_escapes=True))
+    tokens = list(tokenizer.generate_raw_tokens(test_string, replace_html_escapes=True))
     assert len(tokens) == 1
     assert tokens[0] == Tok(kind=TOK.RAW, txt="xyazáwab", val=None, original=test_string, origin_spans=[0, 1, 2, 8, 9, 17, 18, 23])
 
 
 def test_unicode_escapes_with_origin_tracking() -> None:
     test_string = "xya" + ACCENT + "zu" + ACCENT + "wo" + UMLAUT + "b"
-    tokens = list(tokenizer.generate_rough_tokens(test_string, replace_composite_glyphs=True))
+    tokens = list(tokenizer.generate_raw_tokens(test_string, replace_composite_glyphs=True))
     assert len(tokens) == 1
     assert tokens[0] == Tok(kind=TOK.RAW, txt="xyázúwöb", val=None, original=test_string, origin_spans=[0, 1, 2, 4, 5, 7, 8, 10])
 
 
 def test_unicode_escapes_that_are_removed() -> None:
     test_string = "a\xadb\xadc"
-    tokens = list(tokenizer.generate_rough_tokens(test_string, replace_composite_glyphs=True))
+    tokens = list(tokenizer.generate_raw_tokens(test_string, replace_composite_glyphs=True))
     assert len(tokens) == 1
     assert tokens[0] == Tok(kind=TOK.RAW, txt="abc", val=None, original=test_string, origin_spans=[0, 2, 4])
 
 
 def test_html_unicode_mix() -> None:
     test_string = "xya" + ACCENT + "zu" + ACCENT + "wáo" + UMLAUT + "b"
     # 012 3 45 6 7890123456 7 8
-    tokens = list(tokenizer.generate_rough_tokens(test_string, replace_composite_glyphs=True, replace_html_escapes=True))
+    tokens = list(tokenizer.generate_raw_tokens(test_string, replace_composite_glyphs=True, replace_html_escapes=True))
     assert len(tokens) == 1
     assert tokens[0] == Tok(kind=TOK.RAW, txt="xyázúwáöb", val=None, original=test_string, origin_spans=[0, 1, 2, 4, 5, 7, 8, 16, 18])
 
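The only functional change in this file is the rename of the generator from generate_rough_tokens to generate_raw_tokens in the four test calls. As a rough usage sketch outside the test suite (assuming the package is imported as ``tokenizer``, as in these tests, and that Tok fields are readable as plain attributes), the renamed generator can be driven the same way the tests do:

    import tokenizer

    # Per test_unicode_escapes_that_are_removed above, soft hyphens (\xad)
    # are stripped when replace_composite_glyphs=True; origin_spans then maps
    # each character of txt back to its offset in the original input string.
    text = "a\xadb\xadc"
    for tok in tokenizer.generate_raw_tokens(text, replace_composite_glyphs=True):
        print(tok.txt, tok.origin_spans)  # per the test above: abc [0, 2, 4]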
