File tree 1 file changed +4
-1
lines changed
src/datatrove/pipeline/filters
1 file changed +4
-1
lines changed Original file line number Diff line number Diff line change @@ -38,6 +38,8 @@ class MultilingualPolicyFilter(BaseFilter):
38
38
Args:
39
39
exclusion_writer: optionally pass in a writer that will save the dropped documents
40
40
language: used to determine policy strings and for language specific punkt tokenizer from nltk
41
+ split_paragraph: by default (as in the paper) split on "\n".
42
+ Set to "False" to apply the filters to each sentence instead of to each line
41
43
min_num_sentences: remove documents that do not have at least this number of sentences (after line filtering).
42
44
set to -1 to disable
43
45
"""
@@ -49,11 +51,12 @@ def __init__(
49
51
self ,
50
52
exclusion_writer : DiskWriter = None ,
51
53
language : str = "german" ,
54
+ split_paragraph : bool = True , # default as used on c4. Set to "False" to split with sent_tokenize
52
55
min_num_sentences : int = 5 , # set to -1 to disable
53
56
):
54
57
super ().__init__ (exclusion_writer )
55
58
self .language = language
56
-
59
+ self . split_paragraph = split_paragraph
57
60
self .min_num_sentences = min_num_sentences
58
61
59
62
def filter (self , doc : Document ) -> bool | tuple [bool , str ]:
You can’t perform that action at this time.
0 commit comments