From 7fd38bf90f82cbd74aca2c6d04ba37937bf50a21 Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Wed, 28 May 2025 22:35:14 +0800
Subject: [PATCH] [Bugfix][V1] Fix deepseek with v1

Signed-off-by: Mengqing Cao
---
 tests/multicard/test_offline_inference_distributed.py | 3 ---
 vllm_ascend/attention/mla_v1.py                       | 6 ++----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py
index f399ea652..941055cf7 100644
--- a/tests/multicard/test_offline_inference_distributed.py
+++ b/tests/multicard/test_offline_inference_distributed.py
@@ -22,7 +22,6 @@
 """
 import os
 
-import pytest
 import vllm  # noqa: F401
 
 from tests.conftest import VllmRunner
@@ -47,8 +46,6 @@ def test_models_distributed_QwQ():
             vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
-                    reason="deepseek v2 lite is not supported on v1")
 def test_models_distributed_DeepSeek():
     example_prompts = [
         "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index d39a1499f..054bd953c 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -239,10 +239,8 @@ def build(self,
         # it blocks on all previous kernels.
         device = self.runner.device
 
-        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
-        )
-        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-            block_table[:num_reqs])
+        block_table = (self.runner.input_batch.block_table[0].
+                       get_device_tensor()[:num_reqs])
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
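
For context on the mla_v1.py hunk: the removed lines fetched the full device
block table, copied its first num_reqs rows onto themselves (a no-op
self-assignment), and left the full-size table in use, while the new code
simply slices out the rows belonging to the active requests. Below is a
minimal sketch of the two behaviors, assuming PyTorch tensors; the shapes and
values are illustrative, and names other than num_reqs and
max_num_blocks_per_req are not taken from the diff:

    import torch

    # Hypothetical full block table: one row per request slot in the batch.
    num_total_reqs, max_num_blocks_per_req = 8, 4
    block_table_full = torch.arange(
        num_total_reqs * max_num_blocks_per_req).reshape(
            num_total_reqs, max_num_blocks_per_req)

    num_reqs = 3  # requests actually active this step

    # Old behavior: copy the first num_reqs rows onto themselves (changes
    # nothing), then keep referring to the full-size table afterwards.
    old = block_table_full.clone()
    old[:num_reqs, :max_num_blocks_per_req] = old[:num_reqs]

    # New behavior: take a view limited to the active requests, so downstream
    # consumers see a table with exactly num_reqs rows.
    new = block_table_full[:num_reqs]

    assert torch.equal(old[:num_reqs], new)  # same data, no redundant copy
    assert new.shape[0] == num_reqs          # only active rows are passed on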