# Support multiple datasets from MBPP; Fix missing commas in python list; Fix doc typos (#291)

Status: Open. Wants to merge 1 commit into `main`.
### `bigcode_eval/tasks/multiple.py` (15 additions, 7 deletions)
```diff
@@ -41,7 +41,7 @@
 LANGUAGES = [
     "py",
     "sh",
-    "clj"
+    "clj",
     "cpp",
     "cs",
     "d",
@@ -53,7 +53,7 @@
     "js",
     "jl",
     "lua",
-    "ml"
+    "ml",
     "pl",
     "php",
     "r",
```
```diff
@@ -71,13 +71,19 @@ def create_all_tasks():
     :return: {task_name: task}
         e.g. {multiple-py: Task, multiple-java: Task}
     """
-    return {f"multiple-{language}": create_task(language) for language in LANGUAGES}
+    # The root dataset is HumanEval
+    tasks = {f"multiple-{language}": create_task("humaneval", language) for language in LANGUAGES}
+
+    # The root dataset is MBPP
+    for language in LANGUAGES:
+        tasks[f"multiple-{language}-mbpp"] = create_task("mbpp", language)
+
+    return tasks


-def create_task(language):
+def create_task(source, language):
     class MultiPLE(GeneralMultiPLE):
         def __init__(self):
-            super().__init__(language)
+            super().__init__(source, language)

     return MultiPLE
```
```diff
@@ -91,9 +97,9 @@ class GeneralMultiPLE(Task):
     DATASET_NAME = None
     DATASET_REVISION = "ff5c146da05f10bc69b9ce393b77f381b3825d1b"

-    def __init__(self, language):
+    def __init__(self, source, language):
         self.language = language
-        self.DATASET_NAME = f"humaneval-{language}"
+        self.DATASET_NAME = f"{source}-{language}"
         # we need the dataset to get stop words for each language
         self.dataset = load_dataset(
             GeneralMultiPLE.DATASET_PATH,
```
```diff
@@ -194,3 +200,5 @@ def process_results(self, generations, references):
             if k <= len(generations[0])
         }
         return results
+
+print(create_all_tasks().keys())
```
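The task-factory pattern in this diff can be sketched standalone. The sketch below mirrors the PR's `create_task`/`create_all_tasks` structure; `StubMultiPLE` is a hypothetical stand-in for the harness's `GeneralMultiPLE` (which also loads the dataset), and the `LANGUAGES` list is abbreviated:

```python
# Minimal sketch of the two-source task factory from the diff above.
# StubMultiPLE is a made-up stand-in for GeneralMultiPLE.
LANGUAGES = ["py", "sh", "clj", "cpp"]  # abbreviated for the example


class StubMultiPLE:
    """Hypothetical base class; the real one also loads the dataset."""

    def __init__(self, source, language):
        self.language = language
        self.DATASET_NAME = f"{source}-{language}"


def create_task(source, language):
    # Bind source and language into a fresh subclass, as the PR does.
    class MultiPLE(StubMultiPLE):
        def __init__(self):
            super().__init__(source, language)

    return MultiPLE


def create_all_tasks():
    # HumanEval-rooted tasks keep the original "multiple-<lang>" names...
    tasks = {f"multiple-{lang}": create_task("humaneval", lang) for lang in LANGUAGES}
    # ...while MBPP-rooted tasks get a "-mbpp" suffix.
    for lang in LANGUAGES:
        tasks[f"multiple-{lang}-mbpp"] = create_task("mbpp", lang)
    return tasks


tasks = create_all_tasks()
print(tasks["multiple-py-mbpp"]().DATASET_NAME)  # → mbpp-py
print(tasks["multiple-cpp"]().DATASET_NAME)  # → humaneval-cpp
```

Closing over `source` and `language` in a subclass keeps the harness's convention that each registered task is a zero-argument class.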
### `docs/README.md` (2 additions, 2 deletions)
````diff
@@ -184,7 +184,7 @@ For [StarChat-Beta](https://huggingface.co/HuggingFaceH4/starchat-beta) for example
 [MBPP](https://huggingface.co/datasets/mbpp): consists of around 1,000 crowd-sourced Python programming problems,
 designed to be solvable by entry-level programmers. Each problem consists of a task description in English, a code solution and 3 automated test cases. We evaluate on the test set of samples from index 11 to 511.

-* Prompts and generation: We use a few-shot setting in InCoder style prompt: we feed the prompt to the model as a doctring and only include one solution, to help the model catch the function name which is required in the unit tests.
+* Prompts and generation: We use a few-shot setting in InCoder style prompt: we feed the prompt to the model as a doctring and only include one test case, to help the model catch the function name which is required in the unit tests.
 ```python
 prompt = f'"""\n{description}\n{test_example}\n"""\n'
 ```
````
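The one-line prompt format in that snippet can be exercised directly. The `description` and `test_example` values below are made up for illustration, not taken from MBPP:

```python
# Build an InCoder-style MBPP prompt: the task description plus one
# test case, wrapped in a docstring (made-up example problem).
description = "Write a function to add two numbers."
test_example = "assert add(2, 3) == 5"

prompt = f'"""\n{description}\n{test_example}\n"""\n'
print(prompt)
```

Including the test case in the docstring is what lets the model see the expected function name (`add` here) before generating.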
```diff
@@ -207,7 +207,7 @@ accelerate launch main.py \
 Low temperatures generally work better for small $k$ in pass@k.

 ### MBPP+
-[MBPP+](https://huggingface.co/datasets/evalplus/mbppplus): MBPP with additional unit tests (35x of the original MBPP) for each of the 164 problems.
+[MBPP+](https://huggingface.co/datasets/evalplus/mbppplus): MBPP with additional unit tests (35x of the original MBPP) for each of the problems.

 The generation and evaluation follows the same approach as [MBPP](#mbpp). One only needs to change the task name to `mbppplus` to run the evaluation on MBPP+, such as:
```
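For reference, the pass@k metric mentioned in the README excerpt above is usually computed with the unbiased estimator from the Codex paper. This is a sketch of that standard formula, not necessarily the harness's exact implementation:

```python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: probability that at least one of k samples
    drawn without replacement from n generations (c of them correct)
    passes. Computed as 1 - C(n-c, k) / C(n, k).
    """
    if n - c < k:
        # Fewer than k incorrect samples: some draw must include a pass.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# With 10 generations and 3 correct, pass@1 reduces to c/n:
print(round(pass_at_k(10, 3, 1), 6))  # → 0.3
```

Low sampling temperatures help small `k` because pass@1 rewards putting the single most likely completion right, while higher temperatures trade per-sample accuracy for the diversity that benefits large `k`.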