We read every piece of feedback and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 76cecc3, commit 7ec7d62 (Copy full SHA for 7ec7d62)
src/datatrove/pipeline/filters/multilingual_policy_filter.py
@@ -81,7 +81,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
81
if any(p in line_l for p in POLICY_SUBSTRINGS[self.language]):
82
self.stat_update("line-filter-policy")
83
continue
84
- num_sentences += len(sent_tokenize(line, language=self.tokenizer_language)) if self.split_paragraph else 1
+ num_sentences += len(sent_tokenize(line, language=self.langauge)) if self.split_paragraph else 1
85
kept_lines.append(line)
86
self.stat_update("line-kept")
87
if num_sentences < self.min_num_sentences:
0 commit comments