From b31355d8cac7b515522d3690ad3e7cc4a71f3191 Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Sat, 5 Apr 2025 19:15:37 -0700
Subject: [PATCH 1/2] fix model test

fix model init for TeleChat2ForCausalLM and llama4 V0

Signed-off-by: Lu Fang
---
 vllm/attention/backends/flash_attn.py   | 5 +++++
 vllm/model_executor/models/telechat2.py | 8 ++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 27bd292b51f..8f03d443e32 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -617,10 +617,15 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "FlashAttention does not support block-sparse attention.")
+        if use_irope:
+            logger.warning(
+                "Using irope in V0 is not supported yet, it will fall back to global attention for long context, which could impact accuracy"
+            )
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index a38035e37ec..062b1c2cf5f 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -19,7 +19,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Iterable, Set, Tuple
+from typing import Iterable, Set, Tuple, Type

 import torch

@@ -27,6 +27,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaForCausalLM, LlamaModel

+from .llama import LlamaDecoderLayer
 from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
                     is_pp_missing_parameter)

@@ -120,7 +121,10 @@ class TeleChat2ForCausalLM(LlamaForCausalLM):
         },
     )

-    def _init_model(self, vllm_config: VllmConfig, prefix: str = ""):
+    def _init_model(self,
+                    vllm_config: VllmConfig,
+                    prefix: str = "",
+                    layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer):
         return TeleChat2Model(vllm_config=vllm_config, prefix=prefix)

     def load_weights(self, weights: Iterable[Tuple[str,

From 3b5ccf8979dfa70fd4cd3cc508a86d0799d8e75d Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Sat, 5 Apr 2025 20:02:19 -0700
Subject: [PATCH 2/2] fix lint

Signed-off-by: Lu Fang
---
 vllm/attention/backends/flash_attn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 8f03d443e32..c0a572b4aae 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -624,8 +624,8 @@ def __init__(
                 "FlashAttention does not support block-sparse attention.")
         if use_irope:
             logger.warning(
-                "Using irope in V0 is not supported yet, it will fall back to global attention for long context, which could impact accuracy"
-            )
+                "Using irope in V0 is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)