
Commit a24a30f

Version 3.3.2; minor fixes
1 parent: 12fa6d5

File tree

3 files changed: +6 lines, -5 lines

README.rst

Lines changed: 1 addition & 0 deletions
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
 Changelog
 ---------
 
+* Version 3.3.2: Internal refactoring; bug fixes in paragraph handling.
 * Version 3.3.1: Fixed bug where opening quotes at the start of paragraphs
   were sometimes incorrectly recognized and normalized.
 * Version 3.2.0: Numbers and amounts that consist of word tokens only ('sex hundruð')

src/tokenizer/version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "3.3.1"
+__version__ = "3.3.2"

test/test_tokenizer_tok.py

Lines changed: 4 additions & 4 deletions
@@ -254,29 +254,29 @@ def test_split_without_origin_tracking() -> None:
 
 def test_html_escapes_with_origin_tracking() -> None:
     test_string = "xyazáwab"
-    tokens = list(tokenizer.generate_rough_tokens(test_string, replace_html_escapes=True))
+    tokens = list(tokenizer.generate_raw_tokens(test_string, replace_html_escapes=True))
     assert len(tokens) == 1
     assert tokens[0] == Tok(kind=TOK.RAW, txt="xyazáwab", val=None, original=test_string, origin_spans=[0, 1, 2, 8, 9, 17, 18, 23])
 
 
 def test_unicode_escapes_with_origin_tracking() -> None:
     test_string = "xya" + ACCENT + "zu" + ACCENT + "wo" + UMLAUT + "b"
-    tokens = list(tokenizer.generate_rough_tokens(test_string, replace_composite_glyphs=True))
+    tokens = list(tokenizer.generate_raw_tokens(test_string, replace_composite_glyphs=True))
     assert len(tokens) == 1
     assert tokens[0] == Tok(kind=TOK.RAW, txt="xyázúwöb", val=None, original=test_string, origin_spans=[0, 1, 2, 4, 5, 7, 8, 10])
 
 
 def test_unicode_escapes_that_are_removed() -> None:
     test_string = "a\xadb\xadc"
-    tokens = list(tokenizer.generate_rough_tokens(test_string, replace_composite_glyphs=True))
+    tokens = list(tokenizer.generate_raw_tokens(test_string, replace_composite_glyphs=True))
     assert len(tokens) == 1
     assert tokens[0] == Tok(kind=TOK.RAW, txt="abc", val=None, original=test_string, origin_spans=[0, 2, 4])
 
 
 def test_html_unicode_mix() -> None:
     test_string = "xya" + ACCENT + "zu" + ACCENT + "wáo" + UMLAUT + "b"
     # 012 3 45 6 7890123456 7 8
-    tokens = list(tokenizer.generate_rough_tokens(test_string, replace_composite_glyphs=True, replace_html_escapes=True))
+    tokens = list(tokenizer.generate_raw_tokens(test_string, replace_composite_glyphs=True, replace_html_escapes=True))
     assert len(tokens) == 1
     assert tokens[0] == Tok(kind=TOK.RAW, txt="xyázúwáöb", val=None, original=test_string, origin_spans=[0, 1, 2, 4, 5, 7, 8, 16, 18])
 
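The only functional change in this file is the rename of the generator from generate_rough_tokens to generate_raw_tokens in the four test calls. As a rough usage sketch outside the test suite (assuming the package is imported as ``tokenizer``, as in these tests, and that Tok fields are readable as plain attributes), the renamed generator can be driven the same way the tests do:

    import tokenizer

    # Per test_unicode_escapes_that_are_removed above, soft hyphens (\xad)
    # are stripped when replace_composite_glyphs=True; origin_spans then maps
    # each character of txt back to its offset in the original input string.
    text = "a\xadb\xadc"
    for tok in tokenizer.generate_raw_tokens(text, replace_composite_glyphs=True):
        print(tok.txt, tok.origin_spans)  # per the test above: abc [0, 2, 4]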
