Skip to content

Commit

Permalink
add zero-width characters to remove_unsafe and collapse_spaces
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Jan 9, 2025
1 parent 79d5c28 commit adef294
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
6 changes: 4 additions & 2 deletions normality/cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
from normality.constants import UNICODE_CATEGORIES, CONTROL_CODES, WS
from normality.util import Categories, is_text

# Matches a run of one or more whitespace characters.
# NOTE(review): this looks like the pre-change line of the diff; the
# assignment directly below rebinds COLLAPSE_RE - confirm which one is live.
COLLAPSE_RE = re.compile(r"\s+", re.U)
# Matches a run of one or more characters drawn from: anything `\s` matches,
# U+2028 (line separator), U+2029 (paragraph separator), U+200B (zero width
# space), U+200C (zero width non-joiner) and U+200D (zero width joiner).
COLLAPSE_RE = re.compile(r"[\s\u2028\u2029\u200b\u200c\u200d]+", re.U)
# Matches a byte order mark (U+FEFF), but only at the very start of the string.
BOM_RE = re.compile("^\ufeff", re.U)
# Matches a leading BOM, a control character, or U+2028.
# NOTE(review): this looks like the pre-change line of the diff; the
# assignment directly below rebinds UNSAFE_RE - confirm which one is live.
UNSAFE_RE = re.compile(r"^\ufeff|[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\x80-\x9f]|\u2028")
# Matches either a BOM at the start of the string, or any single character in
# the class: \x00-\x1f except tab (\x09), line feed (\x0a) and carriage return
# (\x0d); DEL (\x7f); \x80-\x9f; U+2028; U+2029; and the zero-width characters
# U+200B, U+200C and U+200D.
UNSAFE_RE = re.compile(
r"^\ufeff|[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\x80-\x9f\u2028\u2029\u200b\u200c\u200d]"
)
# Matches a string that starts and ends with a quote character (" or ');
# group 1 captures everything in between. The opening and closing quote are
# not required to be the same kind.
QUOTES_RE = re.compile(r'^["\'](.*)["\']$')


Expand Down
23 changes: 23 additions & 0 deletions tests/test_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from normality.cleaning import remove_unsafe_chars, collapse_spaces


def test_remove_unsafe_chars():
    """Pin down what ``remove_unsafe_chars`` strips and what it leaves alone."""
    # None and the empty string come back unchanged.
    assert remove_unsafe_chars(None) is None
    assert remove_unsafe_chars("") == ""
    # An ordinary space is not an unsafe character.
    assert remove_unsafe_chars(" ") == " "
    # U+2028 (line separator) is removed; the space after it is kept.
    assert remove_unsafe_chars("\u2028 ") == " "
    # A byte order mark is removed at the start of the string ...
    assert remove_unsafe_chars("\ufeff ") == " "
    # ... but is left in place when it appears later in the text.
    assert remove_unsafe_chars("lalala\ufeff ") == "lalala\ufeff "
    # U+200B (zero width space) is dropped without leaving a replacement.
    assert remove_unsafe_chars("lalala\u200bx") == "lalalax"


def test_collapse_spaces():
    """Pin down how ``collapse_spaces`` treats whitespace and zero-width characters."""
    # None and the empty string come back unchanged.
    assert collapse_spaces(None) is None
    assert collapse_spaces("") == ""
    # Input made up only of whitespace collapses to the empty string, whether
    # it is a single space, a run of spaces, or spaces mixed with newlines.
    assert collapse_spaces(" ") == ""
    assert collapse_spaces("  ") == ""
    assert collapse_spaces(" \n ") == ""
    assert collapse_spaces(" \n\n ") == ""
    # Surrounding whitespace is removed from around real content.
    assert collapse_spaces(" \njfshdhdfjk\n ") == "jfshdhdfjk"
    # Line/paragraph separators and zero-width characters count as whitespace.
    assert collapse_spaces(" \n\u2028\u2029\u200b\u200c\n ") == ""
    # Between two words, a zero-width space becomes a single ordinary space.
    assert collapse_spaces("a\u200bx") == "a x"

0 comments on commit adef294

Please sign in to comment.