@@ -6,29 +6,6 @@
 from datatrove.pipeline.writers.disk_base import DiskWriter
 
 
-POLICY_SUBSTRINGS = {
-    "german": [
-        "benutzungsbedingungen",
-        "nutzungsbedingungen",
-        "nutzungsbestimmungen",
-        "datenschutzerklärung",
-        "datenschutzbestimmungen",
-        "datenschutzrichtlinie",
-        "cookie-richtlinie",
-        "verwendet cookies",
-        "benutzt cookies",
-        "cookies verwendet",
-        "verwendung von cookies",
-        "einsatz von cookies",
-        "nutzung von cookies",
-        "verwenden cookies",
-        "benutzen cookies"
-    ]
-}
-
-
-
-
 class MultilingualPolicyFilter(BaseFilter):
     """Applies C4 Policy filter for other languages
 
@@ -44,23 +21,27 @@ class MultilingualPolicyFilter(BaseFilter):
         set to -1 to disable
     """
 
-    name = "⛰ C4 Quality"
+    name = "⛰ Multilingual Policy"
     _requires_dependencies = ["nltk"]
 
     def __init__(
-        self,
-        exclusion_writer: DiskWriter = None,
-        language: str = "german",
-        split_paragraph: bool = True,  # default as used on c4. Set to "False" to split with sent_tokenize
-        min_num_sentences: int = 5,  # set to -1 to disable
-        policy_strings: str = None
+        self,
+        exclusion_writer: DiskWriter = None,
+        language: str = "german",
+        split_paragraph: bool = True,  # default as used on c4. Set to "False" to split with sent_tokenize
+        min_num_sentences: int = 5,  # set to -1 to disable
+        policy_strings: list[str] = ["terms of use",
+                                     "privacy policy",
+                                     "cookie policy",
+                                     "uses cookies",
+                                     "use of cookies",
+                                     "use cookies", ]
     ):
         super().__init__(exclusion_writer)
         self.language = language
         self.split_paragraph = split_paragraph
         self.min_num_sentences = min_num_sentences
-        self.policy_strings = policy_strings if policy_strings is not None else POLICY_SUBSTRINGS[self.language]
-
+        self.policy_strings = policy_strings
 
     def filter(self, doc: Document) -> bool | tuple[bool, str]:
         from nltk.tokenize import sent_tokenize
@@ -76,7 +57,6 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
 
         for line in lines:
             line = line.strip()
-            words = line.split()
             self.stat_update("line-total")
             # check line has too long word
             line_l = line.lower()
@@ -115,8 +95,8 @@ def paragraph_filter(self, page):
         # Filter out docs that don't have at least three "paragraphs"
         # (lines >= `min_paragraph_len` chars).
         if (
-                len(lines) < self.min_paragraphs
-                or min(heapq.nlargest(3, [len(line) for line in lines])) < self.min_paragraph_len
+            len(lines) < self.min_paragraphs
+            or min(heapq.nlargest(3, [len(line) for line in lines])) < self.min_paragraph_len
         ):
             return False
         return True
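
For orientation, here is a minimal sketch of how the changed filter might be wired into a pipeline. The import path for `MultilingualPolicyFilter`, the folder paths, and the reader/writer choices are illustrative assumptions, not part of this diff. The key behavioral change to note: with the `POLICY_SUBSTRINGS` lookup table removed, callers targeting a non-English language such as German must now pass the policy substrings explicitly.

```python
# Hypothetical usage sketch; import path for MultilingualPolicyFilter,
# folder paths, and the surrounding pipeline are assumptions, not part of this PR.
from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.filters import MultilingualPolicyFilter  # assumed export location
from datatrove.pipeline.readers import JsonlReader
from datatrove.pipeline.writers import JsonlWriter

# With POLICY_SUBSTRINGS removed, German substrings must be supplied by the caller
# (a few examples taken from the deleted table above).
german_policy_strings = [
    "nutzungsbedingungen",
    "datenschutzerklärung",
    "cookie-richtlinie",
    "verwendet cookies",
]

executor = LocalPipelineExecutor(
    pipeline=[
        JsonlReader("data/input/"),  # assumed input location
        MultilingualPolicyFilter(
            language="german",
            min_num_sentences=5,  # drop docs left with fewer than 5 sentences
            policy_strings=german_policy_strings,
            exclusion_writer=JsonlWriter("data/removed/"),  # keep rejected docs for inspection
        ),
        JsonlWriter("data/output/"),
    ],
)
executor.run()
```

Since the class declares `_requires_dependencies = ["nltk"]` and `filter` imports `sent_tokenize`, NLTK (with its sentence tokenizer data) needs to be available at run time, particularly when `split_paragraph=False`.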