diff --git a/normality/cleaning.py b/normality/cleaning.py
index 2137709..6f04c2f 100644
--- a/normality/cleaning.py
+++ b/normality/cleaning.py
@@ -5,9 +5,11 @@
 from normality.constants import UNICODE_CATEGORIES, CONTROL_CODES, WS
 from normality.util import Categories, is_text
 
-COLLAPSE_RE = re.compile(r"\s+", re.U)
+COLLAPSE_RE = re.compile(r"[\s\u2028\u2029\u200b\u200c\u200d]+", re.U)
 BOM_RE = re.compile("^\ufeff", re.U)
-UNSAFE_RE = re.compile(r"^\ufeff|[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\x80-\x9f]|\u2028")
+UNSAFE_RE = re.compile(
+    r"^\ufeff|[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\x80-\x9f\u2028\u2029\u200b\u200c\u200d]"
+)
 QUOTES_RE = re.compile(r'^["\'](.*)["\']$')
 
 
diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py
new file mode 100644
index 0000000..9d2118b
--- /dev/null
+++ b/tests/test_cleaning.py
@@ -0,0 +1,23 @@
+from normality.cleaning import remove_unsafe_chars, collapse_spaces
+
+
+def test_remove_unsafe_chars():
+    assert remove_unsafe_chars(None) is None
+    assert remove_unsafe_chars("") == ""
+    assert remove_unsafe_chars(" ") == " "
+    assert remove_unsafe_chars("\u2028 ") == " "
+    assert remove_unsafe_chars("\ufeff ") == " "
+    assert remove_unsafe_chars("lalala\ufeff ") == "lalala\ufeff "
+    assert remove_unsafe_chars("lalala\u200bx") == "lalalax"
+
+
+def test_collapse_spaces():
+    assert collapse_spaces(None) is None
+    assert collapse_spaces("") == ""
+    assert collapse_spaces(" ") == ""
+    assert collapse_spaces(" ") == ""
+    assert collapse_spaces(" \n ") == ""
+    assert collapse_spaces(" \n\n ") == ""
+    assert collapse_spaces(" \njfshdhdfjk\n ") == "jfshdhdfjk"
+    assert collapse_spaces(" \n\u2028\u2029\u200b\u200c\n ") == ""
+    assert collapse_spaces("a\u200bx") == "a x"