We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 0d2e34f · commit fe28543 · Copy full SHA for fe28543
src/axolotl/utils/data.py
@@ -1,5 +1,6 @@
1
"""Module containing data utilities"""
2
import functools
3
+import itertools
4
import logging
5
from hashlib import md5
6
from pathlib import Path
@@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets(
264
265
LOG.info("tokenizing, merging, and shuffling master dataset")
266
267
samples: List[int] = []
268
+ chunk_size = 1000
269
for d in datasets:
- samples = samples + list(d)
270
+ d_iter = iter(d)
271
+ while True:
272
+ chunk = list(itertools.islice(d_iter, chunk_size))
273
+ if not chunk:
274
+ break
275
+ samples.extend(chunk)
276
+
277
+ LOG.info("shuffle")
278
dataset = Dataset.from_list(samples).shuffle(seed=seed)
279
if cfg.local_rank == 0:
280
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
0 commit comments