@@ -30,18 +30,18 @@ def __init__(
         language: str = "german",
         split_paragraph: bool = True,  # default as used on c4. Set to False to split with sent_tokenize
         min_num_sentences: int = 5,  # set to -1 to disable
-        policy_strings: list[str] = ["terms of use",
-                                     "privacy policy",
-                                     "cookie policy",
-                                     "uses cookies",
-                                     "use of cookies",
-                                     "use cookies", ]
+        # policy_strings: list[str] = ["terms of use",
+        #                              "privacy policy",
+        #                              "cookie policy",
+        #                              "uses cookies",
+        #                              "use of cookies",
+        #                              "use cookies", ]
     ):
         super().__init__(exclusion_writer)
         self.language = language
         self.split_paragraph = split_paragraph
         self.min_num_sentences = min_num_sentences
-        self.policy_strings = policy_strings
+        # self.policy_strings = policy_strings

     def filter(self, doc: Document) -> bool | tuple[bool, str]:
         from nltk.tokenize import sent_tokenize
@@ -61,9 +61,9 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
             # check if the line has a too-long word
             line_l = line.lower()
             # lorem ipsum
-            if any(p in line_l for p in self.policy_strings):
-                self.stat_update("line-filter-policy")
-                continue
+            # if any(p in line_l for p in self.policy_strings):
+            #     self.stat_update("line-filter-policy")
+            #     continue
             num_sentences += len(sent_tokenize(line, language=self.language)) if self.split_paragraph else 1
             kept_lines.append(line)
             self.stat_update("line-kept")
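
If the goal is only to switch the policy check off, an alternative to commenting the code out is to keep the parameter and let None (or an empty list) disable it. The following is a minimal, self-contained sketch of that idea; PolicyLineFilter and keep_line are hypothetical stand-ins for the real filter class, not the upstream API:

class PolicyLineFilter:
    # Hypothetical stand-in; only the per-line policy check is sketched.
    def __init__(self, policy_strings: list[str] | None = None):
        # None (the default here) disables the check; a non-empty list
        # restores the behaviour removed in this diff. A None default also
        # avoids the mutable-default pitfall of the original
        # `policy_strings: list[str] = [...]` signature.
        self.policy_strings = policy_strings or []

    def keep_line(self, line: str) -> bool:
        line_l = line.lower()
        # drop the line if it mentions any policy string
        return not any(p in line_l for p in self.policy_strings)

# Disabled (default): every line is kept, matching the commented-out code.
assert PolicyLineFilter().keep_line("This site uses cookies.")

# Enabled with the original defaults: policy lines are dropped.
f = PolicyLineFilter(["terms of use", "privacy policy", "uses cookies"])
assert not f.keep_line("By continuing you accept our privacy policy.")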