
Commit 341e95a

prevent rate limiting to hf when using dispatch batches (axolotl-ai-cloud#2536) [skip ci]

1 parent: b882dfb
2 files changed: +23 −3
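
Context for the sft.py hunk below: when `accelerator_config.dispatch_batches` is enabled, Accelerate iterates the dataloader only on the main process and dispatches slices of each batch to the other ranks, so only the local main process actually needs to stream the pretraining dataset from the Hugging Face Hub. Having every rank open its own streaming connection is what previously triggered Hub rate limiting. Below is a minimal sketch of that rank guard, using a hypothetical `needs_real_dataset` helper that is not part of axolotl:

```python
# Minimal sketch, not axolotl code: which ranks need the real streaming dataset
# when Accelerate dispatches batches from the main process.
def needs_real_dataset(dispatch_batches: bool, local_rank: int) -> bool:
    """Only the local main process must stream from the HF Hub when batches
    are dispatched; every other rank can make do with a tiny placeholder."""
    return not (dispatch_batches and local_rank != 0)


if __name__ == "__main__":
    # With dispatch_batches enabled, ranks 1..N-1 skip the Hub entirely,
    # which is what keeps N-way streaming from tripping rate limits.
    for rank in range(4):
        print(rank, needs_real_dataset(dispatch_batches=True, local_rank=rank))
```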

src/axolotl/utils/data/sft.py (+22 −3)

@@ -3,6 +3,7 @@
 import functools
 import logging
 import os
+import tempfile
 from pathlib import Path
 from typing import List, Optional, Tuple, Union

@@ -117,9 +118,27 @@ def prepare_dataset(cfg, tokenizer, processor=None, preprocess_iterable=None):
             cfg.pretraining_dataset[0]["type"] or "pretrain",
         )

-        iter_ds = load_dataset(
-            path, streaming=True, split=split, name=name, data_files=data_files
-        )
+        # when letting accelerator dispatch batches from the main process, we don't need to load the dataset from
+        # other ranks, we just need to present a fake dataset
+        if (
+            cfg.accelerator_config
+            and cfg.accelerator_config.dispatch_batches
+            and not is_local_main_process()
+        ):
+            with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f:
+                f.write("text\n")
+                f.write("lorem ipsum dolor sit amet\n")
+                # rewind the file pointer to the beginning so we can read it again
+                f.seek(0)
+                iter_ds = load_dataset(
+                    "csv", data_files=f.name, split="train", streaming=True
+                )
+        else:
+            if is_local_main_process():
+                iter_ds = load_dataset(
+                    path, streaming=True, split=split, name=name, data_files=data_files
+                )
+
         if skip:
             LOG.info(f"Skipping {skip} samples from the dataset")
             iter_ds = iter_ds.skip(skip)

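The placeholder itself is just a two-line CSV written to a temporary file and streamed back through the `datasets` "csv" loader, so a non-main rank still holds an iterable dataset object without ever contacting the Hub. A standalone sketch of that trick (illustration only, not the axolotl code path; `fake_ds` is a name chosen here):

```python
import tempfile

from datasets import load_dataset

# Write a header plus one dummy row, then stream it back as an IterableDataset.
with tempfile.NamedTemporaryFile(mode="w+", suffix=".csv", delete=False) as f:
    f.write("text\n")
    f.write("lorem ipsum dolor sit amet\n")
    f.flush()  # make sure the bytes are on disk before the loader reads the file
    fake_ds = load_dataset("csv", data_files=f.name, split="train", streaming=True)

print(next(iter(fake_ds)))  # {'text': 'lorem ipsum dolor sit amet'}
```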
src/axolotl/utils/schemas/config.py (+1)

@@ -660,6 +660,7 @@ def check_evals(cls, data):
             data.get("val_set_size") == 0
             and (data.get("eval_steps") or data.get("eval_strategy"))
             and not data.get("test_datasets")
+            and data.get("eval_strategy") != "no"
         ):
             raise ValueError(
                 "eval_steps and eval_strategy are not supported with val_set_size == 0"

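The schema change relaxes the eval validation so that an explicit `eval_strategy: "no"` no longer trips the `val_set_size == 0` check. A minimal sketch of the tightened condition, with `data` standing in for the raw config dict as in the validator above:

```python
def check_evals(data: dict) -> None:
    # Sketch of the validator's condition; the real check lives in
    # src/axolotl/utils/schemas/config.py.
    if (
        data.get("val_set_size") == 0
        and (data.get("eval_steps") or data.get("eval_strategy"))
        and not data.get("test_datasets")
        # new clause: eval_strategy explicitly set to "no" just confirms that
        # evaluation is disabled, so it should not be rejected
        and data.get("eval_strategy") != "no"
    ):
        raise ValueError(
            "eval_steps and eval_strategy are not supported with val_set_size == 0"
        )


check_evals({"val_set_size": 0, "eval_strategy": "no"})  # passes after this commit
```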