Skip to content

Commit 89a6637

Browse files
committed
Update multilingual_policy_filter.py
1 parent cd8f1cd commit 89a6637

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

Diff for: src/datatrove/pipeline/filters/multilingual_policy_filter.py

+4 -4
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ class MultilingualPolicyFilter(BaseFilter):
2222
policy_strings: list of policy substrings to remove
2323
"""
2424

25-
name = " Multilingual Policy"
25+
name = "🌎 Multilingual Policy"
2626
_requires_dependencies = ["nltk"]
2727

2828
def __init__(
@@ -61,9 +61,9 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
6161
# check line has too long word
6262
line_l = line.lower()
6363
# lorem ipsum
64-
#if any(p in line_l for p in self.policy_strings):
65-
# self.stat_update("line-filter-policy")
66-
# continue
64+
if any(p in line_l for p in self.policy_strings):
65+
self.stat_update("line-filter-policy")
66+
continue
6767
num_sentences += len(sent_tokenize(line, language=self.language)) if self.split_paragraph else 1
6868
kept_lines.append(line)
6969
self.stat_update("line-kept")

0 commit comments

Comments
 (0)