Skip to content

Commit 28fd429

Browse files
authored
Merge pull request axolotl-ai-cloud#293 from NanoCode012/fix/tokenize-speed
Fix(tokenizing): Use multi-core
2 parents edd6980 + 45ac7c4 commit 28fd429

File tree

1 file changed

+11
-12
lines changed

1 file changed

+11
-12
lines changed

Diff for: src/axolotl/datasets.py

+11-12
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
"""Module containing Dataset functionality"""
22

33
import logging
4+
import os
45
from typing import List
56

67
import torch
78
from datasets import IterableDataset
89

9-
from .prompt_tokenizers import InvalidDataException, PromptTokenizingStrategy
10+
from .prompt_tokenizers import PromptTokenizingStrategy
1011

1112
# We want this to be a wrapper for an existing dataset that we have loaded
1213
# lets use the concept of middlewares to wrap each dataset, for example
@@ -34,17 +35,15 @@ def __init__( # pylint: disable=super-init-not-called
3435
self.dataset = dataset
3536

3637
def __iter__(self):
    """Tokenize the wrapped dataset in parallel and iterate the result.

    Runs ``self.prompt_tokenizer.tokenize_prompt`` over every example via
    ``Dataset.map`` with one worker per available CPU core, dropping the
    original raw columns so only the tokenized fields remain.

    Returns:
        An iterator over the tokenized examples.
    """
    # ``map`` expects a list of column names for ``remove_columns``;
    # materialize the keys view so we pass a concrete list.
    features = list(self.dataset.features.keys())
    # ``os.cpu_count()`` may return None when the core count cannot be
    # determined; fall back to a single process rather than passing None.
    num_proc = os.cpu_count() or 1
    return iter(
        self.dataset.map(
            self.prompt_tokenizer.tokenize_prompt,
            num_proc=num_proc,
            remove_columns=features,
        )
    )
4847

4948

5049
# TODO this isn't the best since it can't interleave datasets

0 commit comments

Comments
 (0)