We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent efec1c4, commit 09ddae1 (Copy full SHA for 09ddae1)
src/datatrove/pipeline/filters/oscar_filter.py
@@ -56,6 +56,6 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
56
return False, 'kenlm_min_harmful_ppl'
57
if doc.metadata['harmful_pp'] and doc.metadata['harmful_pp'] > self.max_harmful_ppl:
58
return False, 'kenlm_max_harmful_ppl'
59
- if doc['medatdata']['oscar_categories'] and len(set(doc['medatdata']['oscar_categories']) & self.exclude_categories) > 0:
+ if doc.metadata['oscar_categories'] and len(set(doc.metadata['oscar_categories']) & self.exclude_categories) > 0:
60
return False, 'oscar_category'
61
return True
0 commit comments