diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 27bd292b51f..c0a572b4aae 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -617,10 +617,15 @@ def __init__(
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
                 "FlashAttention does not support block-sparse attention.")
+        if use_irope:
+            logger.warning(
+                "Using irope in V0 is not supported yet, it will fall back "
+                "to global attention for long context.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py
index a38035e37ec..062b1c2cf5f 100644
--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -19,7 +19,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Iterable, Set, Tuple
+from typing import Iterable, Set, Tuple, Type
 
 import torch
 
@@ -27,6 +27,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaForCausalLM, LlamaModel
 
+from .llama import LlamaDecoderLayer
 from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
                     is_pp_missing_parameter)
 
@@ -120,7 +121,10 @@ class TeleChat2ForCausalLM(LlamaForCausalLM):
         },
     )
 
-    def _init_model(self, vllm_config: VllmConfig, prefix: str = ""):
+    def _init_model(self,
+                    vllm_config: VllmConfig,
+                    prefix: str = "",
+                    layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer):
         return TeleChat2Model(vllm_config=vllm_config, prefix=prefix)
 
     def load_weights(self, weights: Iterable[Tuple[str,
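The telechat2.py hunk changes only the signature of `_init_model`, not its behavior: the override accepts the new `layer_type` keyword so it stays compatible with the base-class call, but it ignores it and still builds `TeleChat2Model`. A minimal, self-contained sketch of that pattern follows; the class names here are hypothetical stand-ins, not vLLM's real classes, and the actual base-class call site may pass additional arguments.

```python
# Illustrative sketch only: mirrors the telechat2.py hunk, where the override
# keeps the parent's `layer_type` keyword for signature compatibility but
# ignores it and always builds its own model class.
from typing import Type


class DecoderLayer:                      # stands in for LlamaDecoderLayer
    pass


class BaseForCausalLM:                   # stands in for LlamaForCausalLM
    def __init__(self) -> None:
        # The parent call site forwards layer_type, so every override's
        # signature must accept it.
        self.model = self._init_model(prefix="model",
                                      layer_type=DecoderLayer)

    def _init_model(self,
                    prefix: str = "",
                    layer_type: Type[DecoderLayer] = DecoderLayer):
        return ("base_model", layer_type.__name__)


class TeleChat2Like(BaseForCausalLM):    # stands in for TeleChat2ForCausalLM
    def _init_model(self,
                    prefix: str = "",
                    layer_type: Type[DecoderLayer] = DecoderLayer):
        # layer_type is accepted but unused, matching the patched override,
        # which still returns TeleChat2Model(vllm_config=..., prefix=...).
        return ("telechat2_model",)


print(TeleChat2Like().model)             # ('telechat2_model',)
```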