Commit fe28543

optimize the iteration when tokenizing large datasets (axolotl-ai-cloud#332)
1 parent 0d2e34f commit fe28543

1 file changed: +10 −1 lines changed

src/axolotl/utils/data.py

+10 −1

@@ -1,5 +1,6 @@
 """Module containing data utilities"""
 import functools
+import itertools
 import logging
 from hashlib import md5
 from pathlib import Path
@@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets(
         LOG.info("tokenizing, merging, and shuffling master dataset")
 
         samples: List[int] = []
+        chunk_size = 1000
         for d in datasets:
-            samples = samples + list(d)
+            d_iter = iter(d)
+            while True:
+                chunk = list(itertools.islice(d_iter, chunk_size))
+                if not chunk:
+                    break
+                samples.extend(chunk)
+
+        LOG.info("shuffle")
         dataset = Dataset.from_list(samples).shuffle(seed=seed)
         if cfg.local_rank == 0:
             LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
