Skip to content

Commit 6b95e5b

Browse files
committed
Adjust multilingual policy
1 parent bf4144c commit 6b95e5b

File tree

1 file changed

+4
-1
lines changed

1 file changed

+4
-1
lines changed

Diff for: src/datatrove/pipeline/filters/multilingual_policy_filter.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,14 @@ def __init__(
5353
language: str = "german",
5454
split_paragraph: bool = True, # default as used on c4. Set to "False" to split with sent_tokenize
5555
min_num_sentences: int = 5, # set to -1 to disable
56+
policy_strings: str = None
5657
):
5758
super().__init__(exclusion_writer)
5859
self.language = language
5960
self.split_paragraph = split_paragraph
6061
self.min_num_sentences = min_num_sentences
62+
self.policy_strings = policy_strings if policy_strings else POLICY_SUBSTRINGS[self.language]
63+
6164

6265
def filter(self, doc: Document) -> bool | tuple[bool, str]:
6366
from nltk.tokenize import sent_tokenize
@@ -78,7 +81,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
7881
# check line has too long word
7982
line_l = line.lower()
8083
# skip lines containing any of the configured policy substrings
81-
if any(p in line_l for p in POLICY_SUBSTRINGS[self.language]):
84+
if any(p in line_l for p in self.policy_strings):
8285
self.stat_update("line-filter-policy")
8386
continue
8487
num_sentences += len(sent_tokenize(line, language=self.language)) if self.split_paragraph else 1

0 commit comments

Comments
 (0)