diff --git a/opencompass/configs/datasets/aime2024/README.md b/opencompass/configs/datasets/aime2024/README.md new file mode 100644 index 000000000..b75c9dbb9 --- /dev/null +++ b/opencompass/configs/datasets/aime2024/README.md @@ -0,0 +1,13 @@ +### Description + +Math dataset composed of problems from AIME2024 (American Invitational Mathematics Examination 2024). + +### Performance + +| Qwen2.5-Math-72B-Instruct | Qwen2.5-Math-7B-Instruct | Qwen2-Math-7B-Instruct | Qwen2-Math-1.5B-Instruct | internlm2-math-7b | +| ----------- | ----------- | ----------- | ----------- | ----------- | +| 20.00 | 16.67 | 16.67 | 13.33 | 3.33 | + +| Qwen2.5-72B-Instruct | Qwen2.5-7B-Instruct | internlm2_5-7b-chat | +| ----------- | ----------- | ----------- | +| 20.00 | 16.67 | 6.67 | \ No newline at end of file diff --git a/opencompass/configs/datasets/aime2024/aime2024_gen.py b/opencompass/configs/datasets/aime2024/aime2024_gen.py new file mode 100644 index 000000000..84aef3874 --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .aime2024_gen_6e39a4 import aime2024_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/aime2024/aime2024_gen_6e39a4.py b/opencompass/configs/datasets/aime2024/aime2024_gen_6e39a4.py new file mode 100644 index 000000000..305a4ec5c --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_gen_6e39a4.py @@ -0,0 +1,39 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2 + + +aime2024_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +aime2024_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + 
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048) +) + +aime2024_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2) +) + +aime2024_datasets = [ + dict( + abbr='aime2024', + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + infer_cfg=aime2024_infer_cfg, + eval_cfg=aime2024_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/cmo_fib/README.md b/opencompass/configs/datasets/cmo_fib/README.md new file mode 100644 index 000000000..9f397f8b8 --- /dev/null +++ b/opencompass/configs/datasets/cmo_fib/README.md @@ -0,0 +1,13 @@ +### Description + +Math dataset composed of problems from CMO (Chinese Mathematical Olympiad) 2009-2022. + +### Performance + +| Qwen2.5-Math-72B-Instruct | Qwen2.5-Math-7B-Instruct | Qwen2-Math-7B-Instruct | Qwen2-Math-1.5B-Instruct | internlm2-math-7b | +| ----------- | ----------- | ----------- | ----------- | ----------- | +| 46.15 | 42.79 | 31.73 | 23.56 | 3.37 | + +| Qwen2.5-72B-Instruct | Qwen2.5-7B-Instruct | internlm2_5-7b-chat | +| ----------- | ----------- | ----------- | +| 31.25 | 26.44 | 9.13 | \ No newline at end of file diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py new file mode 100644 index 000000000..aa12cd51f --- /dev/null +++ b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .cmo_fib_gen_ace24b import cmo_fib_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_ace24b.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_ace24b.py new file mode 100644 index 000000000..0fc523e1b --- /dev/null +++ 
b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_ace24b.py @@ -0,0 +1,39 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2 + + +cmo_fib_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +cmo_fib_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\n请一步一步地推理,并将最终答案写入\\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048) +) + +cmo_fib_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2) +) + +cmo_fib_datasets = [ + dict( + abbr='cmo_fib', + type=CMOFibDataset, + path='opencompass/cmo_fib', + reader_cfg=cmo_fib_reader_cfg, + infer_cfg=cmo_fib_infer_cfg, + eval_cfg=cmo_fib_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 85111139b..4b64d77b9 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -1,6 +1,7 @@ from .advglue import * # noqa: F401, F403 from .afqmcd import * # noqa: F401, F403 from .agieval import * # noqa: F401, F403 +from .aime2024 import * # noqa: F401, F403 from .anli import AnliDataset # noqa: F401, F403 from .anthropics_evals import * # noqa: F401, F403 from .apps import * # noqa: F401, F403 @@ -24,6 +25,7 @@ from .cmb import * # noqa: F401, F403 from .cmmlu import * # noqa: F401, F403 from .cmnli import * # noqa: F401, F403 +from .cmo_fib import * # noqa: F401, F403 from .cmrc import * # noqa: F401, F403 from .commonsenseqa import * # noqa: F401, F403 from .commonsenseqa_cn import * # noqa: F401, F403 diff --git a/opencompass/datasets/aime2024.py 
b/opencompass/datasets/aime2024.py new file mode 100644 index 000000000..92a1ba794 --- /dev/null +++ b/opencompass/datasets/aime2024.py @@ -0,0 +1,32 @@ +import json + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class Aime2024Dataset(BaseDataset): + """Loader for the AIME 2024 problems stored as a JSON Lines file.""" + + @staticmethod + def load(path): + """Read the JSON Lines file at ``path`` into a ``Dataset``. + + Each record's ``origin_prompt`` is exposed as ``question`` and its + ``gold_answer`` as ``answer``; the original keys are kept as well. + """ + path = get_data_path(path) + dataset = [] + # Pin UTF-8 so decoding does not depend on the platform locale. + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + origin_prompt = line['origin_prompt'] + line['question'] = origin_prompt[:] + line['answer'] = line['gold_answer'] + dataset.append(line) + return Dataset.from_list(dataset) diff --git a/opencompass/datasets/cmo_fib.py b/opencompass/datasets/cmo_fib.py new file mode 100644 index 000000000..10a7d1866 --- /dev/null +++ b/opencompass/datasets/cmo_fib.py @@ -0,0 +1,32 @@ +import json + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class CMOFibDataset(BaseDataset): + """Loader for the CMO problem set stored as a JSON Lines file.""" + + @staticmethod + def load(path): + """Read the JSON Lines file at ``path`` into a ``Dataset``. + + Each record's ``origin_prompt`` is exposed as ``question`` and its + ``gold_answer`` as ``answer``; the original keys are kept as well. + """ + path = get_data_path(path) + dataset = [] + # Pin UTF-8 so decoding does not depend on the platform locale. + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + origin_prompt = line['origin_prompt'] + line['question'] = origin_prompt[:] + line['answer'] = line['gold_answer'] + dataset.append(line) + return Dataset.from_list(dataset) diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py index 8306e89e3..4ce37f2a0 100644 --- a/opencompass/runners/local.py +++ b/opencompass/runners/local.py @@ -136,7 +136,7 @@ def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]: task.run() else: tmp_logs = f'tmp/{os.getpid()}_debug.log' - get_logger().debug( + get_logger().warning( f'Debug mode, log will be saved to {tmp_logs}') with open(tmp_logs, 'a') as log_file: subprocess.run(cmd, diff --git 
a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index cffd46162..01dc1f21b 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -291,6 +291,16 @@ "ms_id": "", "hf_id": "", "local": "./data/test_generation", + }, + "opencompass/aime2024": { + "ms_id": "", + "hf_id": "", + "local": "./data/aime.jsonl", + }, + "opencompass/cmo_fib": { + "ms_id": "", + "hf_id": "", + "local": "./data/cmo.jsonl", } } @@ -455,4 +465,12 @@ "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/test_generation.zip", "md5": "918a6ea2b1eee6f2b1314db3c21cb4c7", }, + "/aime": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", + "md5": "fbe2d0577fc210962a549f8cea1a00c8" + }, + "/cmo": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", + "md5": "fad52c81290506a8ca74f46b5400d8fc" + } }