Adjust multilingual policy

manuelbrack · manuelbrack · commit 6b95e5bfdd43 · 2024-04-26T10:12:49.000+02:00
diff --git a/src/datatrove/pipeline/filters/multilingual_policy_filter.py b/src/datatrove/pipeline/filters/multilingual_policy_filter.py
@@ -53,11 +53,14 @@ def __init__(
         language: str = "german",
         split_paragraph: bool = True,  # default as used on c4. Set to "False" to split with sent_tokenize
         min_num_sentences: int = 5,  # set to -1 to disableQ
+        policy_strings: str = None
     ):
         super().__init__(exclusion_writer)
         self.language = language
         self.split_paragraph = split_paragraph
         self.min_num_sentences = min_num_sentences
+        self.policy_strings = policy_strings if policy_strings else POLICY_SUBSTRINGS[self.language]
+
 
     def filter(self, doc: Document) -> bool | tuple[bool, str]:
         from nltk.tokenize import sent_tokenize
@@ -78,7 +81,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
             # check line has too long word
             line_l = line.lower()
             # lorem ipsum
-            if any(p in line_l for p in POLICY_SUBSTRINGS[self.language]):
+            if any(p in line_l for p in self.policy_strings):
                 self.stat_update("line-filter-policy")
                 continue
             num_sentences += len(sent_tokenize(line, language=self.language)) if self.split_paragraph else 1