Skip to content

Commit 6084145

Browse files
committed
add gcp push
1 parent c7f97b5 commit 6084145

File tree

4 files changed

+319
-88
lines changed

4 files changed

+319
-88
lines changed

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dependencies = [
1111
"tqdm",
1212
"antlr4-python3-runtime==4.11",
1313
"pydantic_config @ git+https://github.com/samsja/pydantic_config.git@74c94ee",
14+
"google-cloud-storage",
1415
]
1516

1617
[project.optional-dependencies]

src/genesys/generate.py

+25-9
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,51 @@
11
import itertools
2+
from pydantic import model_validator
23
from pydantic_config import BaseConfig, parse_argv
34
import sglang as sgl
45
from datasets import load_dataset
56
from tqdm import tqdm
67
from transformers import AutoTokenizer
7-
8-
from genesys.utils import repeat_elements, save_batch_results
8+
from genesys.utils import GcpBucket, repeat_elements, save_batch_results
99

1010
SYSTEM_PROMPT = "Solve the following math problem efficiently and clearly. Think carefully and step by step about your response and reason before providing a final response. Conclude your response with: \n\nTherefore, the final answer is: $\\boxed{answer}$. I hope it is correct.\n\nWhere [answer] is just the final number or expression that solves the problem. If the question is a multiple choice question, [answer] should be the letter indicating your correct response (e.g. \\text{A} or \\text{B})."
1111

1212

1313
class Config(BaseConfig):
    """Command-line configuration for the generation run."""

    name_model: str = "Qwen/QwQ-32B-Preview"
    num_responses_per_question: int = 1
    num_gpus: int = 8
    temperature: float = 0.9
    batch_size: int = 10_000
    max_samples: int | None = None
    # Optional: when set, each completed output file (of `sample_per_file` samples) is pushed to this GCP bucket.
    gcp_bucket: str | None = None
    # Number of samples each output file contains.
    sample_per_file: int = 10_000

    @model_validator(mode="after")
    def check_batch_size(self):
        """Reject size settings that would make file rotation impossible."""
        # A file must hold at least one full batch.
        if self.sample_per_file < self.batch_size:
            raise ValueError("sample_per_file must be greater than or equal to batch_size")
        # A capped run must still produce at least one full file.
        if self.max_samples is not None:
            if self.max_samples < self.sample_per_file:
                raise ValueError("max_samples must be greater than or equal to sample_per_file")
        return self
2130

2231

2332
def main(config: Config):
33+
if config.gcp_bucket is not None:
34+
gcp_bucket = GcpBucket(config.gcp_bucket)
35+
2436
llm = sgl.Engine(model_path=config.name_model, tp_size=config.num_gpus)
2537
tokenizer = AutoTokenizer.from_pretrained(config.name_model)
2638

27-
math_dataset = load_dataset("Primegenesys/NuminaMath-groundtruth")["train"]
39+
math_dataset = load_dataset("PrimeIntellect/NuminaMath-groundtruth")["train"]
2840
math_dataset = math_dataset.add_column("problem_id", range(len(math_dataset)))
2941

3042
sampling_params = dict(temperature=config.temperature, max_new_tokens=8192, stop=["<|eot_id|>"])
3143

32-
open(config.out_file_name, "w").close()
33-
3444
max_samples = config.max_samples if config.max_samples is not None else len(math_dataset)
3545

46+
all_results = []
47+
file_counter = 0
48+
3649
for i in tqdm(range(0, min(max_samples, len(math_dataset)), config.batch_size), desc="Generating data"):
3750
batch = math_dataset[i : min(i + config.batch_size, len(math_dataset))]
3851
batch_ids = list(
@@ -50,7 +63,6 @@ def main(config: Config):
5063
batch_inputs = tokenizer.apply_chat_template(batch_messages, tokenize=False, add_generation_prompt=True)
5164
batch_output = llm.generate(batch_inputs, sampling_params)
5265

53-
all_results = []
5466
for j, out in enumerate(batch_output):
5567
result = dict()
5668
result["prompt"] = batch_messages[j][1]["content"]
@@ -60,7 +72,11 @@ def main(config: Config):
6072

6173
all_results.append(result)
6274

63-
save_batch_results(all_results, config.out_file_name)
75+
if len(all_results) >= config.sample_per_file:
76+
file_name = f"out_{file_counter}.jsonl"
77+
save_batch_results(all_results, file_name, gcp_bucket)
78+
all_results = []
79+
file_counter += 1
6480

6581

6682
if __name__ == "__main__":

src/genesys/utils.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,45 @@
11
import json
2+
import os
3+
from google.cloud import storage
24

35

46
def repeat_elements(lst, n):
    """Return a new list where each element of `lst` appears `n` consecutive times."""
    repeated = []
    for item in lst:
        repeated.extend([item] * n)
    return repeated
68

79

8-
def save_batch_results(batch_results, results_file):
10+
class GcpBucket:
    """Uploads local files into a fixed folder of a Google Cloud Storage bucket.

    Args:
        gcp_path: Destination in the form "gs://bucket-name[/folder/path]".
    """

    def __init__(self, gcp_path: str):
        # Strip only a *leading* scheme; str.replace("gs://", "") would also
        # clobber the substring anywhere later in the path.
        path = gcp_path.removeprefix("gs://")
        # Everything before the first "/" is the bucket; the rest (possibly
        # empty) is the destination folder — same split as the original
        # split("/")[0] / "/".join(...[1:]) pair, done in one pass.
        self.bucket_name, _, self.destination_folder = path.partition("/")

        # NOTE(review): storage.Client() relies on application-default
        # credentials being configured in the environment — confirm at deploy.
        self.client = storage.Client()
        self.bucket = self.client.bucket(self.bucket_name)
        print(f"Initialized GCP bucket: {self.bucket_name}, folder: {self.destination_folder}")

    def push(self, file_name: str):
        """Upload `file_name` to the bucket under the configured folder."""
        # Join with "/" explicitly: GCS object names always use "/", whereas
        # os.path.join would insert "\\" on Windows and corrupt the blob name.
        base_name = os.path.basename(file_name)
        if self.destination_folder:
            destination_blob_name = f"{self.destination_folder}/{base_name}"
        else:
            destination_blob_name = base_name
        print(f"Uploading {file_name} to gs://{self.bucket_name}/{destination_blob_name}")

        # Upload the file
        blob = self.bucket.blob(destination_blob_name)
        blob.upload_from_filename(file_name)
30+
31+
32+
def save_batch_results(batch_results, results_file, gcp_bucket: GcpBucket | None = None):
    """Append `batch_results` to `results_file` as JSON lines, then optionally push the file to GCP."""
    # Serialize every record first, then append them with a single writelines call.
    lines = [json.dumps(record) + "\n" for record in batch_results]
    with open(results_file, "a") as out:
        out.writelines(lines)

    # Best-effort upload: a failed push is reported but never aborts generation.
    if gcp_bucket is None:
        return
    try:
        gcp_bucket.push(results_file)
        print(f"Successfully uploaded {results_file} to GCP bucket")
    except Exception as e:
        print(f"Error uploading to GCP: {str(e)}")

0 commit comments

Comments
 (0)