We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 07b12ca, commit 57e6109 (Copy full SHA for 57e6109)
src/datatrove/pipeline/filters/fineweb_quality_filter.py
@@ -28,7 +28,7 @@ def __init__(
28
def filter(self, doc) -> bool | tuple[bool, str]:
29
from nltk import word_tokenize
30
31
- stop_chars = (".", "'", '"', "!", "?")
+ stop_chars = (".", "'", '"', "!", "?", ";")
32
33
lines = doc.text.split("\n")
34
ratio = sum(1 for line in lines if line.endswith(stop_chars)) / len(lines)
0 commit comments