
Commit a5ce855

Merge branch 'huggingface:main' into main
2 parents: 9dbd432 + 6d06210


41 files changed: +1801 -285 lines

Diff for: .github/workflows/pypi-release.yml

+61 lines

@@ -0,0 +1,61 @@
name: PyPI release
on:
  workflow_dispatch:

jobs:
  testing:
    uses: ./.github/workflows/testing.yml
  release:
    needs: testing
    runs-on: ubuntu-latest
    env:
      TWINE_USERNAME: __token__

    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -U twine build

      - name: Build the dist files
        run: python -m build .

      - name: Publish to the test PyPI
        env:
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
        run: twine upload dist/* --repository=testpypi

      - name: Test installing from test PyPI and running tests
        run: |
          pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple datatrove[testing]
          python -m nltk.downloader punkt
          make test

      - name: Get tag name
        id: get_tag_name
        run: |
          echo TAG_NAME=$(grep '^version' pyproject.toml | head -1 | cut -d '"' -f 2) >> $GITHUB_OUTPUT

      - name: Tag the release
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: 'refs/tags/v${{ steps.get_tag_name.outputs.TAG_NAME }}',
              sha: context.sha
            })

      - name: Publish to PyPI
        env:
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: twine upload dist/* --repository=pypi
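The "Get tag name" step scrapes the version field out of pyproject.toml with grep/cut and the next step tags the release as v<version>. A minimal Python sketch of the same extraction, assuming Python 3.11+ for the standard tomllib module and that pyproject.toml sits in the working directory:

import tomllib  # stdlib TOML parser, Python 3.11+

# Read [project].version, the same value the workflow's grep/cut pipeline extracts,
# and build the tag ref that the github-script step creates.
with open("pyproject.toml", "rb") as f:
    version = tomllib.load(f)["project"]["version"]

print(f"refs/tags/v{version}")  # e.g. refs/tags/v0.2.0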

Diff for: .github/workflows/ci.yml renamed to .github/workflows/testing.yml

+2 -1 lines

@@ -1,4 +1,4 @@
-name: CI
+name: Test & Check Code Quality

on:
  pull_request:
@@ -7,6 +7,7 @@ on:
  push:
    branches:
      - main
+  workflow_call:

jobs:
  check_code_quality:

Diff for: examples/fineweb.py

+175 lines

@@ -0,0 +1,175 @@
"""
This file contains the code used to process and create the
FineWeb dataset (https://huggingface.co/datasets/HuggingFaceFW/fineweb)
"""
from datatrove.executor.slurm import SlurmPipelineExecutor
from datatrove.pipeline.dedup import MinhashDedupCluster, MinhashDedupFilter, MinhashDedupSignature
from datatrove.pipeline.dedup.minhash import MinhashConfig, MinhashDedupBuckets
from datatrove.pipeline.extractors import Trafilatura
from datatrove.pipeline.filters import (
    C4QualityFilter,
    FineWebQualityFilter,
    GopherQualityFilter,
    GopherRepetitionFilter,
    LanguageFilter,
    URLFilter,
)
from datatrove.pipeline.formatters import PIIFormatter
from datatrove.pipeline.readers import JsonlReader, WarcReader
from datatrove.pipeline.tokens import TokensCounter
from datatrove.pipeline.writers.jsonl import JsonlWriter


"""
we first ran the following pipeline for each dump
"""
DUMP_TO_PROCESS = "CC-MAIN-2023-50"  # example

MAIN_OUTPUT_PATH = "s3://some_s3_bucket"
FILTERING_OUTPUT_PATH = f"{MAIN_OUTPUT_PATH}/base_processing"

main_processing_executor = SlurmPipelineExecutor(
    job_name=f"cc_{DUMP_TO_PROCESS}",
    pipeline=[
        WarcReader(
            f"s3://commoncrawl/crawl-data/{DUMP_TO_PROCESS}/segments/",
            glob_pattern="*/warc/*",  # we want the warc files
            default_metadata={"dump": DUMP_TO_PROCESS},
        ),
        URLFilter(exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/1_url/{DUMP_TO_PROCESS}")),
        Trafilatura(favour_precision=True),
        LanguageFilter(
            exclusion_writer=JsonlWriter(
                f"{FILTERING_OUTPUT_PATH}/2_non_english/",
                output_filename="${language}/" + DUMP_TO_PROCESS + "/${rank}.jsonl.gz",
                # folder structure: language/dump/file
            )
        ),
        GopherRepetitionFilter(
            exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/3_gopher_rep/{DUMP_TO_PROCESS}")
        ),
        GopherQualityFilter(
            exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/4_gopher_qual/{DUMP_TO_PROCESS}")
        ),
        C4QualityFilter(
            filter_no_terminal_punct=False,
            exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/5_c4/{DUMP_TO_PROCESS}"),
        ),
        FineWebQualityFilter(
            exclusion_writer=JsonlWriter(f"{FILTERING_OUTPUT_PATH}/removed/6_fineweb_qual/{DUMP_TO_PROCESS}")
        ),
        JsonlWriter(f"{FILTERING_OUTPUT_PATH}/output/{DUMP_TO_PROCESS}"),
    ],
    tasks=8000,
    time="10:00:00",
    logging_dir=f"{MAIN_OUTPUT_PATH}/logs/base_processing/{DUMP_TO_PROCESS}",
    slurm_logs_folder=f"logs/base_processing/{DUMP_TO_PROCESS}/slurm_logs",  # must be local
    randomize_start=True,  # don't hit the bucket all at once with the list requests
    mem_per_cpu_gb=2,
    partition="hopper-cpu",
)
main_processing_executor.run()

"""
we then applied minhash deduplication to each individual dump
"""

# you can also change ngrams or the number of buckets and their size here
minhash_config = MinhashConfig(
    use_64bit_hashes=True,  # better precision -> fewer false positives (collisions)
    num_buckets=14,
    hashes_per_bucket=8,
    n_grams=5,
)

S3_MINHASH_BASE_PATH = f"{MAIN_OUTPUT_PATH}/minhash"

S3_LOGS_FOLDER = f"{MAIN_OUTPUT_PATH}/logs/minhash"
LOCAL_LOGS_FOLDER = "logs/minhash"

TOTAL_TASKS = 1000

# this is the original data that we want to deduplicate
INPUT_READER = JsonlReader(
    f"{FILTERING_OUTPUT_PATH}/output/{DUMP_TO_PROCESS}"
)  # this is the output from the first part

# stage 1 computes minhash signatures for each task (each task gets a set of files)
stage1 = SlurmPipelineExecutor(
    job_name=f"mh1_{DUMP_TO_PROCESS}",
    pipeline=[
        INPUT_READER,
        MinhashDedupSignature(
            output_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/signatures", config=minhash_config
        ),
    ],
    tasks=TOTAL_TASKS,
    time="5:00:00",
    partition="hopper-cpu",
    logging_dir=f"{S3_LOGS_FOLDER}/signatures",
    slurm_logs_folder=f"{LOCAL_LOGS_FOLDER}/signatures/slurm_logs",
    randomize_start=True,
    depends=main_processing_executor,  # only start after the first one completes
)

stage2 = SlurmPipelineExecutor(
    job_name=f"mh2_{DUMP_TO_PROCESS}",
    pipeline=[
        MinhashDedupBuckets(
            input_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/signatures",
            output_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/buckets",
            config=MinhashConfig(use_64bit_hashes=True),
        ),
    ],
    tasks=minhash_config.num_buckets * 50,  # the code supports parallelizing each bucket. here we run 50
    # workers per bucket
    randomize_start=True,
    logging_dir=f"{S3_LOGS_FOLDER}/buckets",
    partition="hopper-cpu",
    time="02:00:00",
    mem_per_cpu_gb=4,
    cpus_per_task=3,  # you can run more (smaller) tasks if you do not have a lot of memory
    depends=stage1,
)


stage3 = SlurmPipelineExecutor(
    job_name=f"mh3_{DUMP_TO_PROCESS}",
    pipeline=[
        MinhashDedupCluster(
            input_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/buckets",
            output_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/remove_ids",
            config=minhash_config,
        ),
    ],
    tasks=1,  # this step runs on a single task
    logging_dir=f"{S3_LOGS_FOLDER}/clustering",
    partition="hopper-cpu",
    time="30:00:00",  # and can also be quite slow. Usually not this slow though
    mem_per_cpu_gb=25,
    cpus_per_task=8,  # if you dedup a full dump, you do need a lot of memory for this one
    depends=stage2,
)


stage4 = SlurmPipelineExecutor(
    job_name=f"mh4_{DUMP_TO_PROCESS}",
    pipeline=[
        INPUT_READER,
        TokensCounter(),  # you can remove this one, it's just a nice way to know how many tokens we have
        # before and after dedup
        MinhashDedupFilter(input_folder=f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/remove_ids"),
        # run the PII removal
        PIIFormatter(),
        JsonlWriter(f"{S3_MINHASH_BASE_PATH}/{DUMP_TO_PROCESS}/deduped_output"),
    ],
    tasks=TOTAL_TASKS,
    logging_dir=f"{S3_LOGS_FOLDER}/filtering",
    partition="hopper-cpu",
    time="5:00:00",
    mem_per_cpu_gb=4,
    depends=stage3,
)

# launch dedup pipelines
stage4.run()
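Before launching the full 8000-task Slurm job, the filtering stage can be smoke-tested on a handful of files. A minimal sketch using LocalPipelineExecutor (imported the same way as in examples/url_deduplication.py); the local folders and the limit value are placeholders, and it assumes WarcReader accepts a limit argument like the JsonlReader used above:

from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.extractors import Trafilatura
from datatrove.pipeline.filters import LanguageFilter, URLFilter
from datatrove.pipeline.readers import WarcReader
from datatrove.pipeline.writers.jsonl import JsonlWriter

# Run a small slice of the filtering stage on a local machine to check the
# pipeline wiring before submitting the Slurm jobs above.
local_smoke_test = LocalPipelineExecutor(
    pipeline=[
        WarcReader("warc_sample/", limit=100),  # placeholder folder with a few WARC files
        URLFilter(),
        Trafilatura(favour_precision=True),
        LanguageFilter(),
        JsonlWriter("smoke_test_output/"),  # placeholder output folder
    ],
    tasks=2,
)
local_smoke_test.run()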

Diff for: examples/tokenize_from_hf_to_s3.py

+1 -1 lines

@@ -78,7 +78,7 @@
            output_folder=WORKING_DIR,
            local_working_dir=LOCAL_WORKING_DIR,
            save_filename=f"{DATASET_NAME}_tokenized",
-            tokenizer_name=args.tokenizer,
+            tokenizer_name_or_path=args.tokenizer,
        ),
    ],
    # If you have a very small dataset, feel free to set this to "1" and remove the merge_executor

Diff for: examples/url_deduplication.py

+80 lines

@@ -0,0 +1,80 @@
import argparse

import numpy as np

from datatrove.executor.base import PipelineExecutor
from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.dedup.url_dedup import (
    UrlDedupConfig,
    UrlDedupFilter,
    UrlDedupSignature,
    UrlFindDedups,
)
from datatrove.pipeline.readers import JsonlReader
from datatrove.pipeline.writers.jsonl import JsonlWriter


"""
Example of how to use URL deduplication.
To run URL deduplication we need to run three different pipelines (same as sentence dedup).
"""


# modify url dedup hyper params here
url_dedup_config = UrlDedupConfig(
    # this will keep the longest document for each url
    document_priority=lambda doc: min(np.iinfo(np.uint16).max, len(doc.text) // 4),
    url_normalizer=lambda url: url.lower(),
)

FINDER_WORKERS = 4  # this will speed up/parallelize step 2

LIMIT = -1  # for testing


def run_example(args):
    pipeline_1 = [
        JsonlReader(args.input_folder, limit=LIMIT, progress=True),
        UrlDedupSignature(
            output_folder=f"{args.sigs_dup_folder}/sigs",
            config=url_dedup_config,
            finder_workers=FINDER_WORKERS,
        ),
    ]

    pipeline_2 = [
        UrlFindDedups(
            data_folder=f"{args.sigs_dup_folder}/sigs",
            output_folder=f"{args.sigs_dup_folder}/dups",
            config=url_dedup_config,
        )
    ]

    pipeline_3 = [
        JsonlReader(data_folder=args.input_folder, limit=LIMIT, progress=True),
        UrlDedupFilter(
            data_folder=f"{args.sigs_dup_folder}/dups",
            config=url_dedup_config,
            exclusion_writer=JsonlWriter(output_folder=f"{args.base_output_folder}/removed"),
        ),
        JsonlWriter(output_folder=f"{args.base_output_folder}/output"),
    ]

    executor_1: PipelineExecutor = LocalPipelineExecutor(pipeline=pipeline_1, tasks=4)

    executor_2: PipelineExecutor = LocalPipelineExecutor(pipeline=pipeline_2, tasks=FINDER_WORKERS)

    executor_3: PipelineExecutor = LocalPipelineExecutor(pipeline=pipeline_3, tasks=4)

    print(executor_1.run())
    print(executor_2.run())
    print(executor_3.run())


parser = argparse.ArgumentParser(description="URL Deduplication")
parser.add_argument("input_folder", help="Input folder path")
parser.add_argument("base_output_folder", help="Base output folder path")
parser.add_argument("sigs_dup_folder", help="sigs-dup folder path")
if __name__ == "__main__":
    args = parser.parse_args()
    run_example(args)
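The script takes three positional folder arguments (input, output and signature folders). A hedged usage sketch that calls run_example from the script above directly, with placeholder local paths standing in for the command-line arguments:

from argparse import Namespace

# Equivalent to: python examples/url_deduplication.py <input_folder> <base_output_folder> <sigs_dup_folder>
args = Namespace(
    input_folder="jsonl_input/",             # .jsonl(.gz) files to deduplicate by URL
    base_output_folder="url_dedup_output/",  # kept documents go to output/, dropped ones to removed/
    sigs_dup_folder="url_dedup_sigs/",       # intermediate signatures (sigs/) and duplicate ids (dups/)
)
run_example(args)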

Diff for: pyproject.toml

+5 -2 lines

@@ -1,6 +1,6 @@
[project]
name = "datatrove"
-version = "0.0.1" # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+version = "0.2.0" # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description = "HuggingFace library to process and filter large amounts of webdata"
readme = "README.md"
authors = [
@@ -52,8 +52,11 @@ processing = [
    "inscriptis",
    # "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
    "tldextract",
-    "trafilatura",
+    "trafilatura>=1.8.0",
    "tokenizers",
+    "ftfy",
+    "fasteners",
+    "xxhash"
]
quality = [
    "ruff>=0.1.5"

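The processing extra now requires trafilatura>=1.8.0 and adds ftfy, fasteners and xxhash. A quick sanity check of the resolved versions after installing that extra, a sketch using the standard importlib.metadata module (it assumes the packages are already installed in the current environment):

from importlib.metadata import version

# Print the versions of the dependencies touched by this change,
# e.g. to confirm trafilatura resolved to >=1.8.0.
for package in ("trafilatura", "ftfy", "fasteners", "xxhash"):
    print(package, version(package))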