Skip to content

Commit 979a0c7

Browse files
committed
Add non-documenting option to language filter
1 parent f2e1974 commit 979a0c7

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

Diff for: src/datatrove/pipeline/filters/language_filter.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ def __init__(
1717
languages: tuple = (Languages.english,),
1818
language_threshold: float = 0.65,
1919
exclusion_writer: DiskWriter = None,
20+
upate_metadata = True
2021
):
2122
"""
2223
filters if the predicted language is not among given language or if the language score is below language
@@ -31,6 +32,7 @@ def __init__(
3132
self.language_threshold = language_threshold
3233
self.languages = languages
3334
self._model = None
35+
self.upate_metadata = upate_metadata
3436

3537
@property
3638
def model(self):
@@ -57,6 +59,7 @@ def filter(self, doc: Document) -> bool:
5759
language, score = self.model.predict(doc.text.replace("\n", ""))
5860
# language label is given in the form __label__<language_id>
5961
language = language[0].split("__")[2]
60-
doc.metadata["language"] = language
61-
doc.metadata["language_score"] = score[0]
62+
if self.upate_metadata:
63+
doc.metadata["language"] = language
64+
doc.metadata["language_score"] = score[0]
6265
return score > self.language_threshold and language in self.languages

0 commit comments

Comments
 (0)