@@ -218,24 +218,20 @@ You can easily experiment with dataset mixtures using the CombinedStreamingDataset
218
218
``` python
219
219
from lightning.data import StreamingDataset, CombinedStreamingDataset
220
220
from lightning.data.streaming.item_loader import TokensLoader
221
- import os
222
221
from tqdm import tqdm
222
+ import os
223
223
from torch.utils.data import DataLoader
224
224
225
- # Increase by one because we need the next word as well
226
- effective_block_size = 2048 + 1
227
-
228
- input_dir = "tinyllama-data"
229
225
train_datasets = [
230
226
StreamingDataset(
231
- input_dir = "tinyllama-data/slimpajama/train" ,
232
- item_loader = TokensLoader(block_size = effective_block_size),
227
+ input_dir = "s3://tinyllama-template/slimpajama/train/" ,
228
+ item_loader = TokensLoader(block_size = 2048 + 1 ), # Optimized loader for tokens used by LLMs
233
229
shuffle = True ,
234
230
drop_last = True ,
235
231
),
236
232
StreamingDataset(
237
- input_dir = "tinyllama-data/starcoder" ,
238
- item_loader = TokensLoader(block_size = effective_block_size),
233
+ input_dir = "s3://tinyllama-template/starcoder/" ,
234
+ item_loader = TokensLoader(block_size = 2048 + 1 ), # Optimized loader for tokens used by LLMs
239
235
shuffle = True ,
240
236
drop_last = True ,
241
237
),
@@ -244,6 +240,7 @@ train_datasets = [
244
240
# Mix SlimPajama data and Starcoder data with these proportions:
245
241
weights = (0.693584 , 0.306416 )
246
242
combined_dataset = CombinedStreamingDataset(datasets = train_datasets, seed = 42 , weights = weights)
243
+
247
244
train_dataloader = DataLoader(combined_dataset, batch_size = 8 , pin_memory = True , num_workers = os.cpu_count())
248
245
249
246
# Iterate over the combined datasets
0 commit comments