Commit b9c668d

add latest experiments

saridormi committed Jan 30, 2025
1 parent 977d200 commit b9c668d
Showing 64 changed files with 5,839 additions and 3,268 deletions.
7 changes: 7 additions & 0 deletions commit_message_generation/Makefile
@@ -0,0 +1,7 @@
format:
	poetry run ruff format --config pyproject.toml
	poetry run ruff check --fix --config pyproject.toml
lint:
	poetry run ruff format --check --config pyproject.toml
	poetry run ruff check --config pyproject.toml
	poetry run mypy --config-file pyproject.toml .
27 changes: 25 additions & 2 deletions commit_message_generation/README.md
@@ -32,14 +32,22 @@ This baseline consists of the following configurable components:
<summary>💛 Click here to view the currently supported options for each component.</summary>

* **Models / Backbones:**
  * Models from [OpenAI API](https://platform.openai.com/docs/overview): implemented as [`OpenAIBackbone`](src/backbones/openai_backbone.py) class
  * Models from 🤗 [HuggingFace Hub](https://huggingface.co/): implemented as [`HuggingFaceBackbone`](src/backbones/hf_backbone.py) class
  * Models from [Together API](https://www.together.ai/): implemented as [`TogetherBackbone`](src/backbones/together_backbone.py) class
  * Models from [DeepSeek API](https://platform.deepseek.com/): implemented as [`DeepSeekBackbone`](src/backbones/deepseek_backbone.py) class
* **Preprocessors:**
  * Simple preprocessing: implemented as [`SimpleCMGPreprocessor`](src/preprocessors/simple_diff_preprocessor.py) class
  * Simple preprocessing + truncation: implemented as [`TruncationCMGPreprocessor`](src/preprocessors/truncation_diff_preprocessor.py) class
  * BM25 retrieval: implemented as [`RetrievalCMGPreprocessor`](src/preprocessors/retrieval_preprocessor.py) class
  * Full contents of modified files instead of diffs: implemented as [`FullFilesCMGPreprocessor`](src/preprocessors/full_files_preprocessor.py) class
  * Utility preprocessor that loads prebuilt contexts from an HF dataset: implemented as [`LoadFromDatasetPreprocessor`](src/preprocessors/load_from_dataset_preprocessor.py) class
* **Prompts:**
  * Plain zero-shot prompt: implemented as [`SimpleCMGPrompt`](src/prompts/prompts.py) class
  * Detailed zero-shot prompt: implemented as [`DetailedCMGPrompt`](src/prompts/prompts.py) class
  * Detailed zero-shot prompt for Diff + BM25 setting: implemented as [`DetailedCMGPromptWContext`](src/prompts/prompts.py) class
  * Detailed zero-shot prompt for Full File setting: implemented as [`DetailedCMGPromptForFullFiles`](src/prompts/prompts.py) class
</details>

We also provide several `.yaml` configs as examples (see [Available Examples](#available-examples) section).
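
For orientation, here is a minimal sketch of how these components compose in a single config; it mirrors [`configs/async/DeepSeek-R1.yaml`](configs/async/DeepSeek-R1.yaml) added in this commit (the inline comments are ours, not part of the original file):

```yaml
backbone:                                   # model / backbone component
  _target_: src.backbones.TogetherBackbone
  model_name: deepseek-ai/DeepSeek-R1
  prompt:                                   # prompt component, nested under the backbone
    _target_: src.prompts.DetailedCMGPrompt
preprocessor:                               # diff preprocessing component
  _target_: src.preprocessors.SimpleCMGPreprocessor
  include_path: true
```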
@@ -55,12 +63,27 @@ The main running script is [`run_baseline.py`](run_baseline.py).

In both cases, you can also add command-line arguments using [Hydra's override feature](https://hydra.cc/docs/advanced/override_grammar/basic/) (see [Available Examples](#available-examples) section for examples).

> There is also an asynchronous version, [`arun_baseline.py`](arun_baseline.py). The main difference is that it expects a YAML config stored under [`configs/async`](configs/async) and doesn't support Hydra's override feature.
>
> * If you use Poetry, run: `poetry run python arun_baseline.py --config-name {your-config-basename}`
> * Otherwise, run: `python arun_baseline.py --config-name {your-config-basename}`
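>
> For example, to launch the DeepSeek-R1 config added in this commit (judging by how [`arun_baseline.py`](arun_baseline.py) derives its results directory, the basename includes the `.yaml` extension): `poetry run python arun_baseline.py --config-name DeepSeek-R1.yaml`
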
Additionally, there is a script that launches only the preprocessor and saves the preprocessed contexts for each commit.
It is useful when preprocessing is too sophisticated to run on the fly (e.g., with retrieval).

* If you use Poetry, run: `poetry run python run_preprocessor.py`
* Otherwise, run: `python run_preprocessor.py`

In both cases, you can also add command-line arguments using [Hydra's override feature](https://hydra.cc/docs/advanced/override_grammar/basic/) (see [Available Examples](#available-examples) section for examples).
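
For example, a hypothetical override, assuming `run_preprocessor.py` reads the same `preprocessor` config section as the main scripts: `poetry run python run_preprocessor.py preprocessor.include_path=false`.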

# Available examples

Together with the dataset, we release the results for several models.
They were obtained using this repository, and we provide the exact commands for each of them, as well as example `.yaml` configs, under the [`configs/examples`](configs/examples) folder.

Note that exact commands are not provided for the newer models from our experiments. See also [`configs/async`](configs/async) for models that were launched with [`arun_baseline.py`](arun_baseline.py) and [`configs/retrieval`](configs/retrieval) for runs that saved preprocessed contexts via [`run_preprocessor.py`](run_preprocessor.py).

**Note.** The configs and the commands are provided for a single seed value, which is controlled by `backbone.parameters.seed` for OpenAI models and `backbone.seed` for models from HuggingFace Hub. We averaged the results across three seeds. For convenience, you can use [Hydra's multi-run functionality](https://hydra.cc/docs/tutorials/basic/running_your_app/multi-run/) to launch three successive runs with different seeds.
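
For example, a minimal sketch of such a multi-run launch for an OpenAI model (the seed values here are illustrative): `poetry run python run_baseline.py --multirun backbone.parameters.seed=42,43,44`.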

## OpenAI models
140 changes: 140 additions & 0 deletions commit_message_generation/arun_baseline.py
@@ -0,0 +1,140 @@
import asyncio
import json
import logging
import os
import random
import sys
from argparse import ArgumentParser
from typing import Any, Dict

import hydra
import jsonlines
import pandas as pd # type: ignore[import-untyped]
import wandb
from dotenv import load_dotenv
from hydra import compose, initialize
from omegaconf import OmegaConf
from tqdm import tqdm # type: ignore[import-untyped]

from configs import BaselineConfig
from src import CMGBackbone, CMGBaseline, CMGMetrics

load_dotenv()

root = logging.getLogger()
root.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter("[%(asctime)s][%(name)s][%(levelname)s] - %(message)s")
handler.setFormatter(formatter)
root.addHandler(handler)


def init_baseline(cfg: BaselineConfig) -> CMGBaseline:
    # init backbone
    backbone: CMGBackbone = hydra.utils.instantiate(cfg.backbone)

    # init preprocessor
    preprocessor = hydra.utils.instantiate(
        cfg.preprocessor, model_name=cfg.backbone.model_name, model_provider=backbone.name
    )

    return CMGBaseline(backbone=backbone, preprocessor=preprocessor)


async def get_predictions(
    baseline: CMGBaseline, cfg: BaselineConfig, predictions_path: str = "predictions.jsonl"
) -> str:
    # init iterator (either over local file or over HuggingFace dataset)
    if hasattr(cfg.data_src, "path"):
        cfg.data_src.path = hydra.utils.to_absolute_path(cfg.data_src.path)  # type: ignore[attr-defined]
    reader = hydra.utils.instantiate(cfg.data_src)

    async def _get_prediction(line: Dict[str, Any]) -> None:
        baseline_output = await baseline.agenerate_msg(commit=line)  # type: ignore[arg-type]
        cur_example = {"reference": line["message"], "hash": line["hash"], "repo": line["repo"]}
        cur_example.update(baseline_output)

        with jsonlines.open(predictions_path, "a") as writer:
            writer.write(cur_example)

        return None

    # get predictions for all input examples:
    # truncate any existing predictions file, then launch all requests
    # concurrently; each task appends its own result as it completes
    open(predictions_path, "w").close()
    tasks = [_get_prediction(line) for line in reader]
    await asyncio.gather(*tasks)
    return predictions_path


def compute_metrics(predictions_path: str) -> Dict[str, float]:
    metrics = CMGMetrics()
    with jsonlines.open(predictions_path, "r") as reader:
        for example in tqdm(reader, desc="Computing metrics"):
            metrics.update(predictions=[example["prediction"]], references=[example["reference"]])
    computed_metrics = metrics.compute()
    print("=== METRICS ===")
    print(computed_metrics)
    return computed_metrics


async def main(config_name: str) -> None:
    initialize(version_base="1.1", config_path="configs/async")
    cfg_dict = compose(config_name=config_name)
    cfg = BaselineConfig(**cfg_dict)  # type: ignore

    os.makedirs(f"results/{config_name[: -len('.yaml')]}", exist_ok=True)

    if hasattr(cfg.backbone, "seed") and cfg.backbone.seed is None:
        cfg.backbone.seed = random.randint(1, 2**32)
        logging.warning(f"Using random seed {cfg.backbone.seed}.")

    # init W&B (optional)
    if cfg.logger.use_wandb:
        wandb.init(
            project=cfg.logger.project,
            name=cfg.logger.name,
            config=OmegaConf.to_container(cfg, resolve=True),  # type: ignore[arg-type]
            job_type="eval",
        )

    # init baseline
    baseline = init_baseline(cfg)

    # obtain predictions
    predictions_path = await get_predictions(
        cfg=cfg, baseline=baseline, predictions_path=f"results/{config_name[: -len('.yaml')]}/predictions.jsonl"
    )

    # log predictions to W&B (optional)
    if cfg.logger.use_wandb:
        artifact = wandb.Artifact(
            f"{cfg.backbone.model_name.replace('/', '__')}_{cfg.preprocessor._target_.split('.')[-1]}_{cfg.logger.name + '_' if cfg.logger.name else ''}predictions",
            type="dataset",
        )
        if cfg.logger.local_artifact:
            artifact.add_reference(f"file:///{os.path.abspath(predictions_path)}")
        else:
            test_table = wandb.Table(dataframe=pd.read_json(predictions_path, orient="records", lines=True))
            artifact.add(test_table, "predictions")
        wandb.log_artifact(artifact)

    # compute metrics
    computed_metrics = compute_metrics(predictions_path)
    with open(f"results/{config_name[: -len('.yaml')]}/metrics.json", "w") as f:
        json.dump(computed_metrics, f)

    # log metrics to W&B (optional)
    if cfg.logger.use_wandb:
        wandb.log(computed_metrics)


if __name__ == "__main__":
    parser = ArgumentParser(
        description="Launch a commit message generation model for Long Code Arena dataset asynchronously."
    )
    parser.add_argument(
        "--config-name", type=str, help="Which config under `configs/async` directory to use.", required=True
    )
    args = parser.parse_args()

    asyncio.run(main(args.config_name))
23 changes: 23 additions & 0 deletions commit_message_generation/configs/async/DeepSeek-R1.yaml
@@ -0,0 +1,23 @@
backbone:
  _target_: src.backbones.TogetherBackbone
  prompt:
    _target_: src.prompts.DetailedCMGPrompt
  model_name: deepseek-ai/DeepSeek-R1
  api_key: null
  parameters:
    temperature: 0.8
preprocessor:
  _target_: src.preprocessors.SimpleCMGPreprocessor
  include_path: true
logger:
  use_wandb: false
  name: null
  project: null
  local_artifact: null
data_src:
  _target_: src.data_sources.HFDataSource
  cache_dir: null
  hub_name: JetBrains-Research/lca-commit-message-generation
  configs:
    - default
  split: test
26 changes: 26 additions & 0 deletions commit_message_generation/configs/async/DeepSeek-V3-16k-files.yaml
@@ -0,0 +1,26 @@
backbone:
  _target_: src.backbones.TogetherBackbone
  prompt:
    _target_: src.prompts.DetailedCMGPromptForFullFiles
  model_name: deepseek-ai/DeepSeek-V3
  api_key: null
  parameters:
    temperature: 0.8
preprocessor:
  _target_: src.preprocessors.LoadFromDatasetPreprocessor
  include_path: true
  hf_repo_id: "JetBrains-Research/lca-commit-message-generation"
  hf_repo_config: "full_files"
  hf_repo_split: "16k"
logger:
  use_wandb: false
  name: null
  project: null
  local_artifact: null
data_src:
  _target_: src.data_sources.HFDataSource
  cache_dir: null
  hub_name: JetBrains-Research/lca-commit-message-generation
  configs:
    - default
  split: test
26 changes: 26 additions & 0 deletions commit_message_generation/configs/async/DeepSeek-V3-16k.yaml
@@ -0,0 +1,26 @@
backbone:
  _target_: src.backbones.TogetherBackbone
  prompt:
    _target_: src.prompts.DetailedCMGPromptWContext
  model_name: deepseek-ai/DeepSeek-V3
  api_key: null
  parameters:
    temperature: 0.8
preprocessor:
  _target_: src.preprocessors.LoadFromDatasetPreprocessor
  include_path: true
  hf_repo_id: "JetBrains-Research/lca-commit-message-generation"
  hf_repo_config: "retrieval_bm25"
  hf_repo_split: "16k"
logger:
  use_wandb: false
  name: null
  project: null
  local_artifact: null
data_src:
  _target_: src.data_sources.HFDataSource
  cache_dir: null
  hub_name: JetBrains-Research/lca-commit-message-generation
  configs:
    - default
  split: test
26 changes: 26 additions & 0 deletions commit_message_generation/configs/async/DeepSeek-V3-32k.yaml
@@ -0,0 +1,26 @@
backbone:
  _target_: src.backbones.TogetherBackbone
  prompt:
    _target_: src.prompts.DetailedCMGPromptWContext
  model_name: deepseek-ai/DeepSeek-V3
  api_key: null
  parameters:
    temperature: 0.8
preprocessor:
  _target_: src.preprocessors.LoadFromDatasetPreprocessor
  include_path: true
  hf_repo_id: "JetBrains-Research/lca-commit-message-generation"
  hf_repo_config: "retrieval_bm25"
  hf_repo_split: "32k"
logger:
  use_wandb: false
  name: null
  project: null
  local_artifact: null
data_src:
  _target_: src.data_sources.HFDataSource
  cache_dir: null
  hub_name: JetBrains-Research/lca-commit-message-generation
  configs:
    - default
  split: test
26 changes: 26 additions & 0 deletions commit_message_generation/configs/async/DeepSeek-V3-4k-files.yaml
@@ -0,0 +1,26 @@
backbone:
  _target_: src.backbones.TogetherBackbone
  prompt:
    _target_: src.prompts.DetailedCMGPromptForFullFiles
  model_name: deepseek-ai/DeepSeek-V3
  api_key: null
  parameters:
    temperature: 0.8
preprocessor:
  _target_: src.preprocessors.LoadFromDatasetPreprocessor
  include_path: true
  hf_repo_id: "JetBrains-Research/lca-commit-message-generation"
  hf_repo_config: "full_files"
  hf_repo_split: "4k"
logger:
  use_wandb: false
  name: null
  project: null
  local_artifact: null
data_src:
  _target_: src.data_sources.HFDataSource
  cache_dir: null
  hub_name: JetBrains-Research/lca-commit-message-generation
  configs:
    - default
  split: test
26 changes: 26 additions & 0 deletions commit_message_generation/configs/async/DeepSeek-V3-4k.yaml
@@ -0,0 +1,26 @@
backbone:
  _target_: src.backbones.TogetherBackbone
  prompt:
    _target_: src.prompts.DetailedCMGPromptWContext
  model_name: deepseek-ai/DeepSeek-V3
  api_key: null
  parameters:
    temperature: 0.8
preprocessor:
  _target_: src.preprocessors.LoadFromDatasetPreprocessor
  include_path: true
  hf_repo_id: "JetBrains-Research/lca-commit-message-generation"
  hf_repo_config: "retrieval_bm25"
  hf_repo_split: "4k"
logger:
  use_wandb: false
  name: null
  project: null
  local_artifact: null
data_src:
  _target_: src.data_sources.HFDataSource
  cache_dir: null
  hub_name: JetBrains-Research/lca-commit-message-generation
  configs:
    - default
  split: test
26 changes: 26 additions & 0 deletions commit_message_generation/configs/async/DeepSeek-V3-64k.yaml
@@ -0,0 +1,26 @@
backbone:
  _target_: src.backbones.TogetherBackbone
  prompt:
    _target_: src.prompts.DetailedCMGPromptWContext
  model_name: deepseek-ai/DeepSeek-V3
  api_key: null
  parameters:
    temperature: 0.8
preprocessor:
  _target_: src.preprocessors.LoadFromDatasetPreprocessor
  include_path: true
  hf_repo_id: "JetBrains-Research/lca-commit-message-generation"
  hf_repo_config: "retrieval_bm25"
  hf_repo_split: "64k"
logger:
  use_wandb: false
  name: null
  project: null
  local_artifact: null
data_src:
  _target_: src.data_sources.HFDataSource
  cache_dir: null
  hub_name: JetBrains-Research/lca-commit-message-generation
  configs:
    - default
  split: test
