Skip to content

Commit c3088c9

Browse files
thomasthomas
thomas
authored and
thomas
committed
update
1 parent cc42bd7 commit c3088c9

File tree

1 file changed

+6
-9
lines changed

1 file changed

+6
-9
lines changed

src/lightning/data/README.md

+6 -9
Original file line number | Diff line number | Diff line change
@@ -218,24 +218,20 @@ You can easily experiment with dataset mixtures using the CombinedStreamingDatas
218218
```python
219219
from lightning.data import StreamingDataset, CombinedStreamingDataset
220220
from lightning.data.streaming.item_loader import TokensLoader
221-
import os
222221
from tqdm import tqdm
222+
import os
223223
from torch.utils.data import DataLoader
224224

225-
# Increase by one because we need the next word as well
226-
effective_block_size = 2048 + 1
227-
228-
input_dir = "tinyllama-data"
229225
train_datasets = [
230226
StreamingDataset(
231-
input_dir="tinyllama-data/slimpajama/train",
232-
item_loader=TokensLoader(block_size=effective_block_size),
227+
input_dir="s3://tinyllama-template/slimpajama/train/",
228+
item_loader=TokensLoader(block_size=2048 + 1), # Optimized loader for tokens used by LLMs
233229
shuffle=True,
234230
drop_last=True,
235231
),
236232
StreamingDataset(
237-
input_dir="tinyllama-data/starcoder",
238-
item_loader=TokensLoader(block_size=effective_block_size),
233+
input_dir="s3://tinyllama-template/starcoder/",
234+
item_loader=TokensLoader(block_size=2048 + 1), # Optimized loader for tokens used by LLMs
239235
shuffle=True,
240236
drop_last=True,
241237
),
@@ -244,6 +240,7 @@ train_datasets = [
244240
# Mix SlimPajama data and Starcoder data with these proportions:
245241
weights = (0.693584, 0.306416)
246242
combined_dataset = CombinedStreamingDataset(datasets=train_datasets, seed=42, weights=weights)
243+
247244
train_dataloader = DataLoader(combined_dataset, batch_size=8, pin_memory=True, num_workers=os.cpu_count())
248245

249246
# Iterate over the combined datasets

0 commit comments

Comments
 (0)