We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 0d70701, commit 33be756 (Copy full SHA for 33be756)
src/datatrove/pipeline/filters/oscar_filter.py
@@ -26,20 +26,19 @@
26
class OSCARFilter(BaseFilter):
27
name = "🗑 OSCAR"
28
29
- def __init__(self, regex_exp: str,
+ def __init__(self,
30
exclusion_writer: DiskWriter = None,
31
min_harmful_ppl: float = DEFAULT_OSCAR_MIN_HARMFUL_PP,
32
max_harmful_ppl: float = DEFAULT_OSCAR_MAX_HARMFUL_PP,
33
exclude_categories: set = DEFAULT_EXCLUDE_CATEGORIES):
34
"""
35
- filters if regex finds at least one match
+ filters data based on OSCAR annotations
36
37
Args:
38
regex_exp: regex expression
39
exclusion_writer:
40
41
super().__init__(exclusion_writer)
42
- self.regex = re.compile(regex_exp)
43
self.min_harmful_ppl = min_harmful_ppl
44
self.max_harmful_ppl = max_harmful_ppl
45
self.exclude_categories = exclude_categories
0 commit comments