Commit 50c338e

update

thomas authored and thomas committed
1 parent dfbf0e5 commit 50c338e
File tree: 1 file changed, +3 -3 lines

src/lightning/data/README.md (+3 -3)
@@ -342,19 +342,19 @@ from lightning.data import optimize
 from tokenizer import Tokenizer
 from functools import partial

-# 2. Define a function to convert the text within the parquet files into tokens
+# 1. Define a function to convert the text within the parquet files into tokens
 def tokenize_fn(filepath, tokenizer=None):
     parquet_file = pq.ParquetFile(filepath)
     # Process per batch to reduce RAM usage
     for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]):
         for text in batch.to_pandas()["content"]:
             yield tokenizer.encode(text, bos=False, eos=True)

-# 3. Generate the inputs (we are going to optimize all the parquet files from StarCoder dataset )
+# 2. Generate the inputs
 input_dir = "/teamspace/s3_connections/tinyllama-template"
 inputs = [str(file) for file in Path(f"{input_dir}/starcoderdata").rglob("*.parquet")]

-# 4. Store the optimized data wherever you want under "/teamspace/datasets" or "/teamspace/s3_connections"
+# 3. Store the optimized data wherever you want under "/teamspace/datasets" or "/teamspace/s3_connections"
 outputs = optimize(
     fn=partial(tokenize_fn, tokenizer=Tokenizer(f"{input_dir}/checkpoints/Llama-2-7b-hf")), # Note: You can use HF tokenizer or any others
     inputs=inputs,
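The hunk ends mid-call, so the remaining arguments to optimize are not shown in this commit. As a rough sketch only, assuming the signature of lightning.data's optimize (which accepts an output location and a chunking hint), the call might be closed along these lines; the output_dir and chunk_size values below are illustrative assumptions, not the file's actual continuation:

outputs = optimize(
    fn=partial(tokenize_fn, tokenizer=Tokenizer(f"{input_dir}/checkpoints/Llama-2-7b-hf")),
    inputs=inputs,
    # Assumed continuation, not part of this diff hunk:
    output_dir="/teamspace/datasets/starcoderdata",  # hypothetical destination under /teamspace/datasets
    chunk_size=(2049 * 8012),  # illustrative: number of tokens packed into each optimized chunk
)

Note that tokenize_fn is a generator: optimize treats each yielded token sequence as one sample and packs the samples into chunk files under the output location, which is why the function yields per document instead of materializing the whole parquet file in memory.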
