Skip to content

Commit 972dee4

Browse files
Add Qwen2.5 math (#1863)
Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com>
1 parent 4b3dd3b commit 972dee4

File tree

6 files changed

+85
-2
lines changed

6 files changed

+85
-2
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ Every model is written from scratch to maximize performance and remove layers of
138138
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
139139
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
140140
| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
141+
| Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [Yang, An et al. 2024](https://arxiv.org/abs/2409.12122) |
141142
| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
142143
| Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |
143144
| StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |

litgpt/config.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2033,6 +2033,74 @@ def norm_class(self) -> Type:
20332033

20342034
qwen_2_5.extend(qwen_2_5_coder)
20352035

2036+
qwen_2_5_math = [
2037+
# https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/config.json
2038+
dict(
2039+
name="Qwen2.5-Math-1.5B{}",
2040+
hf_config=dict(org="Qwen", name="Qwen2.5-Math-1.5B{}"),
2041+
block_size=4096,
2042+
vocab_size=151643,
2043+
padded_vocab_size=151936,
2044+
n_layer=28,
2045+
n_head=12,
2046+
n_embd=1536,
2047+
n_query_groups=2,
2048+
rotary_percentage=1.0,
2049+
parallel_residual=False,
2050+
bias=False,
2051+
attn_bias=True,
2052+
norm_class_name="RMSNorm",
2053+
mlp_class_name="LLaMAMLP",
2054+
intermediate_size=8960,
2055+
norm_eps=1e-6,
2056+
rope_base=10000
2057+
),
2058+
# https://huggingface.co/Qwen/Qwen2.5-Math-7B/blob/main/config.json
2059+
dict(
2060+
name="Qwen2.5-Math-7B{}",
2061+
hf_config=dict(org="Qwen", name="Qwen2.5-Math-7B{}"),
2062+
block_size=4096,
2063+
vocab_size=151643,
2064+
padded_vocab_size=152064,
2065+
n_layer=28,
2066+
n_head=28,
2067+
n_embd=3584,
2068+
n_query_groups=4,
2069+
rotary_percentage=1.0,
2070+
parallel_residual=False,
2071+
bias=False,
2072+
attn_bias=True,
2073+
norm_class_name="RMSNorm",
2074+
mlp_class_name="LLaMAMLP",
2075+
intermediate_size=18944,
2076+
norm_eps=1e-6,
2077+
rope_base=10000
2078+
),
2079+
# https://huggingface.co/Qwen/Qwen2.5-Math-72B/blob/main/config.json
2080+
dict(
2081+
name="Qwen2.5-Math-72B{}",
2082+
hf_config=dict(org="Qwen", name="Qwen2.5-Math-72B{}"),
2083+
block_size=4096,
2084+
vocab_size=151643,
2085+
padded_vocab_size=152064,
2086+
n_layer=80,
2087+
n_head=64,
2088+
n_embd=8192,
2089+
n_query_groups=8,
2090+
rotary_percentage=1.0,
2091+
parallel_residual=False,
2092+
bias=False,
2093+
attn_bias=True,
2094+
norm_class_name="RMSNorm",
2095+
mlp_class_name="LLaMAMLP",
2096+
intermediate_size=29568,
2097+
norm_eps=1e-5,
2098+
rope_base=10000
2099+
),
2100+
]
2101+
2102+
qwen_2_5.extend(qwen_2_5_math)
2103+
20362104
for c in qwen_2_5:
20372105
for kind in ("", "-Instruct"):
20382106
copy = deepcopy(c)

litgpt/prompts.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,10 @@ def apply(self, prompt: str, **kwargs: str) -> str:
284284
system_message = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
285285
return f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
286286

287+
class Qwen2_5_Math(PromptStyle):
288+
def apply(self, prompt: str, **kwargs: str) -> str:
289+
system_message = "Please reason step by step, and put your final answer within \\boxed{}."
290+
return f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
287291

288292
class QwQ(PromptStyle):
289293
def apply(self, prompt: str, **kwargs: str) -> str:
@@ -320,6 +324,7 @@ def apply(self, prompt: str, **kwargs: str) -> str:
320324
"llama3": Llama3,
321325
"olmo": OLMo,
322326
"qwen2.5": Qwen2_5,
327+
"qwen2.5-math": Qwen2_5_Math,
323328
"qwq": QwQ,
324329
"salamandra": Salamandra,
325330
}
@@ -360,6 +365,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle:
360365
return Gemma()
361366
if re.search(r"OLMo.*-hf", model_name):
362367
return OLMo()
368+
if re.search(r"Qwen2\.5-Math-.*", model_name):
369+
return Qwen2_5_Math()
363370
if re.search(r"Qwen2\.5-.*", model_name):
364371
return Qwen2_5()
365372
if re.search(r"QwQ-.*", model_name):

tests/test_convert_lit_checkpoint.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,7 @@ def test_check_conversion_supported_lora():
524524
check_conversion_supported(lit_weights=lit_weights)
525525

526526
@torch.inference_mode()
527-
@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "QwQ-32B-Preview"))
527+
@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview"))
528528
@pytest.mark.parametrize(
529529
("device", "dtype"),
530530
[

tests/test_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,7 @@ def test_against_original_gemma_2(model_name, device, dtype):
792792

793793

794794
@torch.inference_mode()
795-
@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "QwQ-32B-Preview"))
795+
@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview"))
796796
@pytest.mark.parametrize(
797797
("device", "dtype"),
798798
[

tutorials/download_model_weights.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
3737
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
3838
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
3939
| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
40+
| Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [Yang, An et al. 2024](https://arxiv.org/abs/2409.12122) |
4041
| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
4142
| RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
4243
| StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
@@ -200,6 +201,12 @@ Qwen/Qwen2.5-Coder-14B
200201
Qwen/Qwen2.5-Coder-14B-Instruct
201202
Qwen/Qwen2.5-Coder-32B
202203
Qwen/Qwen2.5-Coder-32B-Instruct
204+
Qwen/Qwen2.5-Math-1.5B
205+
Qwen/Qwen2.5-Math-1.5B-Instruct
206+
Qwen/Qwen2.5-Math-7B
207+
Qwen/Qwen2.5-Math-7B-Instruct
208+
Qwen/Qwen2.5-Math-72B
209+
Qwen/Qwen2.5-Math-72B-Instruct
203210
Qwen/QwQ-32B-Preview
204211
stabilityai/FreeWilly2
205212
stabilityai/stable-code-3b

0 commit comments

Comments (0)