Skip to content

Commit 76cecc3

Browse files
committed
Add MultilingualPolicyFilter
1 parent f7edd25 commit 76cecc3

File tree

1 file changed

+4
-1
lines changed

1 file changed

+4
-1
lines changed

Diff for: src/datatrove/pipeline/filters/multilingual_policy_filter.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ class MultilingualPolicyFilter(BaseFilter):
3838
Args:
3939
exclusion_writer: optionally pass in a writer that will save the dropped documents
4040
language: used to determine policy strings and for language specific punkt tokenizer from nltk
41+
split_paragraph: if True (the default, as in the paper), split the text into lines on "\n".
42+
Set to False to apply the filters to each sentence instead of to each line
4143
min_num_sentences: remove documents that do not have at least this number of sentences (after line filtering).
4244
set to -1 to disable
4345
"""
@@ -49,11 +51,12 @@ def __init__(
4951
self,
5052
exclusion_writer: DiskWriter = None,
5153
language: str = "german",
54+
split_paragraph: bool = True, # default as used on c4. Set to "False" to split with sent_tokenize
5255
min_num_sentences: int = 5, # set to -1 to disable
5356
):
5457
super().__init__(exclusion_writer)
5558
self.language = language
56-
59+
self.split_paragraph = split_paragraph
5760
self.min_num_sentences = min_num_sentences
5861

5962
def filter(self, doc: Document) -> bool | tuple[bool, str]:

0 commit comments

Comments
 (0)