Commit 44f2c54

Test debug
1 parent 5fe27d6 commit 44f2c54

1 file changed: +10 -10 lines

Diff for: src/datatrove/pipeline/filters/multilingual_policy_filter.py

+10 -10
@@ -30,18 +30,18 @@ def __init__(
         language: str = "german",
         split_paragraph: bool = True,  # default as used on c4. Set to "False" to split with sent_tokenize
         min_num_sentences: int = 5,  # set to -1 to disable
-        policy_strings: list[str] = ["terms of use",
-                                     "privacy policy",
-                                     "cookie policy",
-                                     "uses cookies",
-                                     "use of cookies",
-                                     "use cookies", ]
+        # policy_strings: list[str] = ["terms of use",
+        #                              "privacy policy",
+        #                              "cookie policy",
+        #                              "uses cookies",
+        #                              "use of cookies",
+        #                              "use cookies", ]
     ):
         super().__init__(exclusion_writer)
         self.language = language
         self.split_paragraph = split_paragraph
         self.min_num_sentences = min_num_sentences
-        self.policy_strings = policy_strings
+        # self.policy_strings = policy_strings
 
     def filter(self, doc: Document) -> bool | tuple[bool, str]:
         from nltk.tokenize import sent_tokenize
@@ -61,9 +61,9 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
             # check line has too long word
             line_l = line.lower()
             # lorem ipsum
-            if any(p in line_l for p in self.policy_strings):
-                self.stat_update("line-filter-policy")
-                continue
+            # if any(p in line_l for p in self.policy_strings):
+            #     self.stat_update("line-filter-policy")
+            #     continue
             num_sentences += len(sent_tokenize(line, language=self.language)) if self.split_paragraph else 1
             kept_lines.append(line)
             self.stat_update("line-kept")
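
For reference, the check this commit comments out drops any line whose lowercased text contains one of the boilerplate policy phrases, before the line is sentence-counted and kept. The standalone sketch below reproduces that logic outside the filter class; the constant POLICY_STRINGS mirrors the default policy_strings argument, and is_policy_line is a hypothetical helper, not part of the datatrove API.

# Sketch of the line-level policy-phrase check disabled by this commit.
# POLICY_STRINGS copies the default policy_strings list; is_policy_line is illustrative.
POLICY_STRINGS = [
    "terms of use",
    "privacy policy",
    "cookie policy",
    "uses cookies",
    "use of cookies",
    "use cookies",
]

def is_policy_line(line: str) -> bool:
    """Return True if the lowercased line contains any boilerplate policy phrase."""
    line_l = line.lower()
    return any(p in line_l for p in POLICY_STRINGS)

# Lines flagged this way were skipped (and counted under "line-filter-policy")
# instead of being appended to kept_lines.
assert is_policy_line("This site uses cookies to improve your experience.")
assert not is_policy_line("Berlin is the capital of Germany.")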
