[Bugfix] Fix Phi3.5 mini and MoE LoRA inference (vllm-project#8571)

garg-amit · LeiWang1999 · commit 8785c7d09ed9 · 2025-03-26T10:14:33.000Z
Signed-off-by: LeiWang1999 &lt;leiwang1999@outlook.com&gt;
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
@@ -50,7 +50,7 @@
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
     "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
-    "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
     "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
@@ -0,0 +1,17 @@
+# coding=utf-8
+# Adapted from llama.py
+"""Inference-only Phi3 model code inherit from Llama.py"""
+
+from vllm.model_executor.models.llama import LlamaForCausalLM
+
+
+class Phi3ForCausalLM(LlamaForCausalLM):
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "qkv_proj",
+        ],
+        "gate_up_proj": [
+            "gate_up_proj",
+        ],
+    }
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
@@ -491,6 +491,10 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA):
         "o_proj",
         "embed_tokens",
         "lm_head",
+        "w1",
+        "w2",
+        "w3",
+        "gate",
     ]
     embedding_modules = {
         "embed_tokens": "input_embeddings",