Skip to content

Commit 5fe27d6

Browse files
committed
Adjust multilingual policy
1 parent 21587e7 commit 5fe27d6

File tree

1 file changed

+15
-35
lines changed

1 file changed

+15
-35
lines changed

Diff for: src/datatrove/pipeline/filters/multilingual_policy_filter.py

+15-35
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,6 @@
66
from datatrove.pipeline.writers.disk_base import DiskWriter
77

88

9-
POLICY_SUBSTRINGS = {
10-
"german": [
11-
"benutzungsbedingungen",
12-
"nutzungsbedingungen",
13-
"nutzungsbestimmungen",
14-
"datenschutzerklärung",
15-
"datenschutzbestimmungen",
16-
"datenschutzrichtlinie",
17-
"cookie-richtlinie",
18-
"verwendet cookies",
19-
"benutzt cookies",
20-
"cookies verwendet",
21-
"verwendung von cookies",
22-
"einsatz von cookies",
23-
"nutzung von cookies",
24-
"verwenden cookies",
25-
"benutzen cookies"
26-
]
27-
}
28-
29-
30-
31-
329
class MultilingualPolicyFilter(BaseFilter):
3310
"""Applies C4 Policy filter for other languages
3411
@@ -44,23 +21,27 @@ class MultilingualPolicyFilter(BaseFilter):
4421
set to -1 to disable
4522
"""
4623

47-
name = "⛰ C4 Quality"
24+
name = "⛰ Multilingual Policy"
4825
_requires_dependencies = ["nltk"]
4926

5027
def __init__(
51-
self,
52-
exclusion_writer: DiskWriter = None,
53-
language: str = "german",
54-
split_paragraph: bool = True, # default as used on c4. Set to "False" to split with sent_tokenize
55-
min_num_sentences: int = 5, # set to -1 to disable
56-
policy_strings: str = None
28+
self,
29+
exclusion_writer: DiskWriter = None,
30+
language: str = "german",
31+
split_paragraph: bool = True, # default as used on c4. Set to "False" to split with sent_tokenize
32+
min_num_sentences: int = 5, # set to -1 to disable
33+
policy_strings: list[str] = ["terms of use",
34+
"privacy policy",
35+
"cookie policy",
36+
"uses cookies",
37+
"use of cookies",
38+
"use cookies", ]
5739
):
5840
super().__init__(exclusion_writer)
5941
self.language = language
6042
self.split_paragraph = split_paragraph
6143
self.min_num_sentences = min_num_sentences
62-
self.policy_strings = policy_strings if policy_strings is not None else POLICY_SUBSTRINGS[self.language]
63-
44+
self.policy_strings = policy_strings
6445

6546
def filter(self, doc: Document) -> bool | tuple[bool, str]:
6647
from nltk.tokenize import sent_tokenize
@@ -76,7 +57,6 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
7657

7758
for line in lines:
7859
line = line.strip()
79-
words = line.split()
8060
self.stat_update("line-total")
8161
# check line has too long word
8262
line_l = line.lower()
@@ -115,8 +95,8 @@ def paragraph_filter(self, page):
11595
# Filter out docs that don't have at least three "paragraphs"
11696
# (lines >= `min_paragraph_len` chars).
11797
if (
118-
len(lines) < self.min_paragraphs
119-
or min(heapq.nlargest(3, [len(line) for line in lines])) < self.min_paragraph_len
98+
len(lines) < self.min_paragraphs
99+
or min(heapq.nlargest(3, [len(line) for line in lines])) < self.min_paragraph_len
120100
):
121101
return False
122102
return True

0 commit comments

Comments
 (0)