Skip to content

Commit cd8f1cd

Browse files
committed
Test debug
1 parent 44f2c54 commit cd8f1cd

File tree

1 file changed

+7
-7
lines changed

1 file changed

+7
-7
lines changed

Diff for: src/datatrove/pipeline/filters/multilingual_policy_filter.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class MultilingualPolicyFilter(BaseFilter):
1919
Set to "False" to apply the filters to each sentence instead of to each line
2020
min_num_sentences: remove documents that do not have at least this number of sentences (after line filtering).
2121
set to -1 to disable
22+
policy_strings: list of policy substrings to remove
2223
"""
2324

2425
name = "⛰ Multilingual Policy"
@@ -30,18 +31,17 @@ def __init__(
3031
language: str = "german",
3132
split_paragraph: bool = True, # default as used on c4. Set to "False" to split with sent_tokenize
3233
min_num_sentences: int = 5, # set to -1 to disable
33-
#policy_strings: list[str] = ["terms of use",
34-
# "privacy policy",
35-
# "cookie policy",
36-
# "uses cookies",
37-
# "use of cookies",
38-
# "use cookies", ]
34+
policy_strings=None,
3935
):
4036
super().__init__(exclusion_writer)
37+
if policy_strings is None:
38+
policy_strings = ["terms of use", "privacy policy", "cookie policy", "uses cookies", "use of cookies",
39+
"use cookies", ]
4140
self.language = language
4241
self.split_paragraph = split_paragraph
4342
self.min_num_sentences = min_num_sentences
44-
#self.policy_strings = policy_strings
43+
self.policy_strings = policy_strings
44+
4545

4646
def filter(self, doc: Document) -> bool | tuple[bool, str]:
4747
from nltk.tokenize import sent_tokenize

0 commit comments

Comments
 (0)