Skip to content

Commit f7edd25

Browse files
committed
Add MultilingualPolicyFilter
1 parent e096851 commit f7edd25

File tree

2 files changed

+122
-0
lines changed

2 files changed

+122
-0
lines changed

Diff for: src/datatrove/pipeline/filters/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99
from .sampler_filter import SamplerFilter
1010
from .unigram_log_probs import UnigramLogProbFilter
1111
from .url_filter import URLFilter
12+
from .multilingual_policy_filter import MultilingualPolicyFilter
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import heapq
2+
import re
3+
4+
from datatrove.data import Document
5+
from datatrove.pipeline.filters.base_filter import BaseFilter
6+
from datatrove.pipeline.writers.disk_base import DiskWriter
7+
8+
9+
# Per-language lists of lowercase substrings that flag policy/cookie boilerplate
# lines (terms of use, privacy policy, cookie notices). MultilingualPolicyFilter
# lowercases each line and drops it when any of these substrings occurs in it.
POLICY_SUBSTRINGS = {
    "german": [
        "benutzungsbedingungen",
        "nutzungsbedingungen",
        "nutzungsbestimmungen",
        "datenschutzerklärung",
        "datenschutzbestimmungen",
        "datenschutzrichtlinie",
        "cookie-richtlinie",
        "verwendet cookies",
        "benutzt cookies",
        "cookies verwendet",
        "verwendung von cookies",
        "einsatz von cookies",
        "nutzung von cookies",
        "verwenden cookies",
        "benutzen cookies"
    ]
}
28+
29+
30+
31+
32+
class MultilingualPolicyFilter(BaseFilter):
    """Applies C4 Policy filter for other languages

    - Remove lines with cookies and terms of use keywords

    Reference implementation: https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/text/c4_utils.py#L197
    Args:
        exclusion_writer: optionally pass in a writer that will save the dropped documents
        language: used to determine policy strings and for language specific punkt tokenizer from nltk
        split_paragraph: if True, treat each newline-separated line as a unit and count its
            sentences with nltk; if False, tokenize the whole document into sentences first
        min_num_sentences: remove documents that do not have at least this number of sentences (after line filtering).
            set to -1 to disable
    """

    # NOTE(review): renamed from "⛰ C4 Quality" (copied from the filter this was
    # derived from) so stats from the two filters are distinguishable.
    name = "⛰ Multilingual Policy"
    _requires_dependencies = ["nltk"]

    def __init__(
        self,
        exclusion_writer: DiskWriter = None,
        language: str = "german",
        split_paragraph: bool = True,  # was read by filter() but never set: AttributeError
        min_num_sentences: int = 5,  # set to -1 to disable
    ):
        super().__init__(exclusion_writer)
        self.language = language
        self.split_paragraph = split_paragraph
        self.min_num_sentences = min_num_sentences

    def filter(self, doc: Document) -> bool | tuple[bool, str]:
        # nltk imported lazily so the dependency is only required when the filter runs
        from nltk.tokenize import sent_tokenize

        lines = (
            doc.text.splitlines()
            if self.split_paragraph
            else sent_tokenize(doc.text, language=self.language)
        )

        num_sentences = 0
        kept_lines = []

        for line in lines:
            line = line.strip()
            self.stat_update("line-total")
            line_l = line.lower()
            # drop boilerplate policy/cookie lines for the configured language
            if any(p in line_l for p in POLICY_SUBSTRINGS[self.language]):
                self.stat_update("line-filter-policy")
                continue
            # self.tokenizer_language was never set anywhere; the docstring says the
            # `language` arg drives the punkt tokenizer, so use it directly
            num_sentences += len(sent_tokenize(line, language=self.language)) if self.split_paragraph else 1
            kept_lines.append(line)
            self.stat_update("line-kept")
        if num_sentences < self.min_num_sentences:
            return False, "too_few_sentences"

        doc.text = ("\n" if self.split_paragraph else " ").join(kept_lines).strip()
        return True
90+
91+
class C4ParagraphFilter(BaseFilter):
    """Applies paragraph filtering from mC4

    https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/text/c4_utils.py#L551
    """

    name = "⛰ C4 Paragraph"

    def __init__(self, exclusion_writer: DiskWriter = None):
        super().__init__(exclusion_writer)
        # thresholds from the mC4 reference implementation
        self.min_paragraphs = 3
        self.min_paragraph_len = 200
        self.line_delimiter = "\n"

    def paragraph_filter(self, page):
        """Returns False iff a page has too few or too short paragraphs."""
        paragraphs = page.split(self.line_delimiter)
        # need at least `min_paragraphs` "paragraphs" (delimiter-separated lines)
        if len(paragraphs) < self.min_paragraphs:
            return False
        # the 3 longest paragraphs must each reach `min_paragraph_len` chars
        top_lengths = heapq.nlargest(3, (len(p) for p in paragraphs))
        return min(top_lengths) >= self.min_paragraph_len

    def filter(self, doc: Document) -> bool | tuple[bool, str]:
        if not self.paragraph_filter(doc.text):
            return False, f"< {self.min_paragraphs} paragraphs"
        return True

0 commit comments

Comments
 (0)