Commit 50c338e 1 parent dfbf0e5 commit 50c338e Copy full SHA for 50c338e
File tree 1 file changed +3
-3
lines changed
1 file changed +3
-3
lines changed Original file line number Diff line number Diff line change @@ -342,19 +342,19 @@ from lightning.data import optimize
342
342
from tokenizer import Tokenizer
343
343
from functools import partial
344
344
345
- # 2 . Define a function to convert the text within the parquet files into tokens
345
+ # 1 . Define a function to convert the text within the parquet files into tokens
346
346
def tokenize_fn(filepath, tokenizer=None):
    """Yield encoded token sequences for every row of a parquet file.

    Streams the file batch-by-batch instead of loading it whole, so
    memory stays bounded, and encodes each entry of the "content"
    column with *tokenizer* (EOS appended, no BOS).
    """
    parquet_file = pq.ParquetFile(filepath)
    # Read in fixed-size batches to reduce RAM usage.
    for record_batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]):
        contents = record_batch.to_pandas()["content"]
        for document in contents:
            yield tokenizer.encode(document, bos=False, eos=True)
352
352
353
- # 3 . Generate the inputs (we are going to optimize all the parquet files from StarCoder dataset )
353
+ # 2 . Generate the inputs
354
354
# Root directory of the S3 data connection; parquet shards are expected
# under <input_dir>/starcoderdata (see the rglob below).
input_dir = " /teamspace/s3_connections/tinyllama-template"
355
355
# Recursively collect every parquet file path (as plain strings) to feed
# into the optimize() call.
# NOTE(review): the string literals here carry scrape-injected spaces
# (e.g. a leading space inside the quotes) — confirm against the
# original snippet before running.
inputs = [str (file ) for file in Path(f " { input_dir} /starcoderdata " ).rglob(" *.parquet" )]
356
356
357
- # 4 . Store the optimized data wherever you want under "/teamspace/datasets" or "/teamspace/s3_connections"
357
+ # 3 . Store the optimized data wherever you want under "/teamspace/datasets" or "/teamspace/s3_connections"
358
358
outputs = optimize(
359
359
fn = partial(tokenize_fn, tokenizer = Tokenizer(f " { input_dir} /checkpoints/Llama-2-7b-hf " )), # Note: You can use HF tokenizer or any others
360
360
inputs = inputs,
You can’t perform that action at this time.
0 commit comments