File tree 1 file changed +5
-2
lines changed
src/datatrove/pipeline/filters
1 file changed +5
-2
lines changed Original file line number Diff line number Diff line change @@ -17,6 +17,7 @@ def __init__(
17
17
languages : tuple = (Languages .english ,),
18
18
language_threshold : float = 0.65 ,
19
19
exclusion_writer : DiskWriter = None ,
20
+ upate_metadata = True
20
21
):
21
22
"""
22
23
filters if the predicted language is not among the given languages or if the language score is below language
@@ -31,6 +32,7 @@ def __init__(
31
32
self .language_threshold = language_threshold
32
33
self .languages = languages
33
34
self ._model = None
35
+ self .upate_metadata = upate_metadata
34
36
35
37
@property
36
38
def model (self ):
@@ -57,6 +59,7 @@ def filter(self, doc: Document) -> bool:
57
59
language , score = self .model .predict (doc .text .replace ("\n" , "" ))
58
60
# language label is given in the form __label__<language_id>
59
61
language = language [0 ].split ("__" )[2 ]
60
- doc .metadata ["language" ] = language
61
- doc .metadata ["language_score" ] = score [0 ]
62
+ if self .upate_metadata :
63
+ doc .metadata ["language" ] = language
64
+ doc .metadata ["language_score" ] = score [0 ]
62
65
return score > self .language_threshold and language in self .languages
You can’t perform that action at this time.
0 commit comments