diff --git a/bigcode_eval/tasks/multiple.py b/bigcode_eval/tasks/multiple.py
index d9ac8cae5..580e1f031 100644
--- a/bigcode_eval/tasks/multiple.py
+++ b/bigcode_eval/tasks/multiple.py
@@ -41,7 +41,7 @@
 LANGUAGES = [
     "py",
     "sh",
-    "clj"
+    "clj",
     "cpp",
     "cs",
     "d",
@@ -53,7 +53,7 @@
     "js",
     "jl",
     "lua",
-    "ml"
+    "ml",
     "pl",
     "php",
     "r",
@@ -71,13 +71,19 @@ def create_all_tasks():
     :return: {task_name: task}
         e.g. {multiple-py: Task, multiple-java: Task}
     """
-    return {f"multiple-{language}": create_task(language) for language in LANGUAGES}
+    # The root dataset is HumanEval
+    tasks = {f"multiple-{language}": create_task("humaneval", language) for language in LANGUAGES}
+
+    # The root dataset is MBPP
+    for language in LANGUAGES:
+        tasks[f"multiple-{language}-mbpp"] = create_task("mbpp", language)
+    return tasks
 
 
-def create_task(language):
+def create_task(source, language):
     class MultiPLE(GeneralMultiPLE):
         def __init__(self):
-            super().__init__(language)
+            super().__init__(source, language)
 
     return MultiPLE
 
@@ -91,9 +97,9 @@ class GeneralMultiPLE(Task):
     DATASET_NAME = None
     DATASET_REVISION = "ff5c146da05f10bc69b9ce393b77f381b3825d1b"
 
-    def __init__(self, language):
+    def __init__(self, source, language):
         self.language = language
-        self.DATASET_NAME = f"humaneval-{language}"
+        self.DATASET_NAME = f"{source}-{language}"
         # we need the dataset to get stop words for each language
         self.dataset = load_dataset(
             GeneralMultiPLE.DATASET_PATH,
@@ -194,3 +200,6 @@ def process_results(self, generations, references):
             if k <= len(generations[0])
         }
         return results
+
+if __name__ == "__main__":
+    print(create_all_tasks().keys())
diff --git a/docs/README.md b/docs/README.md
index f3413c354..8610979cd 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -184,7 +184,7 @@ For [StarChat-Beta](https://huggingface.co/HuggingFaceH4/starchat-beta) for exam
 
 [MBPP](https://huggingface.co/datasets/mbpp): consists of around 1,000 crowd-sourced Python programming problems, designed to be solvable by entry-level programmers. Each problem consists of a task description in English, a code solution and 3 automated test cases. We evaluate on the test set of samples from index 11 to 511.
 
-* Prompts and generation: We use a few-shot setting in InCoder style prompt: we feed the prompt to the model as a doctring and only include one solution, to help the model catch the function name which is required in the unit tests.
+* Prompts and generation: We use a few-shot setting in an InCoder-style prompt: we feed the prompt to the model as a docstring and include only one test case, to help the model catch the function name, which is required in the unit tests.
 ```python
 prompt = f'"""\n{description}\n{test_example}\n"""\n'
 ```
@@ -207,7 +207,7 @@ accelerate launch main.py \
 Low temperatures generally work better for small $k$ in pass@k.
 
 ### MBPP+
-[MBPP+](https://huggingface.co/datasets/evalplus/mbppplus): MBPP with additional unit tests (35x of the original MBPP) for each of the 164 problems.
+[MBPP+](https://huggingface.co/datasets/evalplus/mbppplus): MBPP with additional unit tests (35x of the original MBPP) for each problem.
 The generation and evaluation follows the same approach as [MBPP](#mbpp). One only needs to change the task name to `mbppplus` to run the evaluation on MBPP+, such as:
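For reference, a hedged sketch (not part of the patch) of what the new `f"{source}-{language}"` config names resolve to when the MultiPL-E dataset is loaded directly. The `nuprl/MultiPL-E` path is an assumption here, since `DATASET_PATH` lies outside the hunks above; the revision hash is the one pinned in the diff.

```python
# Sketch only: load one of the MBPP-derived MultiPL-E configs directly.
# "nuprl/MultiPL-E" is assumed (DATASET_PATH is not shown in the hunks above);
# the revision is the hash pinned in the diff.
from datasets import load_dataset

ds = load_dataset(
    "nuprl/MultiPL-E",                                    # assumed DATASET_PATH
    "mbpp-lua",                                           # f"{source}-{language}" with source="mbpp"
    revision="ff5c146da05f10bc69b9ce393b77f381b3825d1b",  # pinned in the diff
    split="test",
)
print(ds[0]["stop_tokens"])  # the per-language stop words the task reads from the dataset
```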
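Likewise, a small illustrative sketch of the InCoder-style MBPP prompt described in the `docs/README.md` hunk. The sample record below is hypothetical; the `text`/`test_list` field names follow the MBPP schema as an assumption, and the f-string is the one shown in the README.

```python
# Illustrative construction of the few-shot InCoder-style MBPP prompt.
# The record is a made-up MBPP-like row; field names are an assumption.
sample = {
    "text": "Write a function to find the shared elements from the given two lists.",
    "test_list": ["assert similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) == (4, 5)"],
}

description = sample["text"]
test_example = sample["test_list"][0]  # one test case exposes the expected function name
prompt = f'"""\n{description}\n{test_example}\n"""\n'  # verbatim from the README
print(prompt)
```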