Commit b4d6672

[BugFix] Fix chunked prefill bugs in engine v1 (#844)
### What this PR does / why we need it?
Fix the bugs hit when running the deepseek model in engine v1.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with newly added and existing tests.

---------

Signed-off-by: rjg-lyh <1318825571@qq.com>
1 parent a73bd6c commit b4d6672

File tree

2 files changed: +14 -3 lines changed

2 files changed

+14
-3
lines changed

vllm_ascend/platform.py

Lines changed: 3 additions & 0 deletions
@@ -204,6 +204,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 "ascend_scheduler_config", None) is not None:
             additional_scheduler_config = additional_config.get(
                 "ascend_scheduler_config")
+            if vllm_config.scheduler_config.enable_chunked_prefill:
+                additional_scheduler_config[
+                    "enable_chunked_prefill"] = True
             from vllm_ascend.core.schedule_config import \
                 AscendSchedulerConfig
             ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
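
To make the platform.py change concrete, here is a minimal standalone sketch of the propagation it performs; propagate_chunked_prefill is a hypothetical helper name, but the dictionary keys match the diff. The engine-level enable_chunked_prefill flag is mirrored into the ascend_scheduler_config dict before AscendSchedulerConfig.initialize_from_config() consumes it:

# Hedged sketch of the hunk above; the helper name is illustrative,
# not part of the repo.
def propagate_chunked_prefill(additional_config: dict,
                              enable_chunked_prefill: bool) -> None:
    scheduler_cfg = additional_config.get("ascend_scheduler_config")
    if scheduler_cfg is not None and enable_chunked_prefill:
        # Mirror the engine flag so the Ascend scheduler also chunks prefills.
        scheduler_cfg["enable_chunked_prefill"] = True

# Example: the engine flag becomes visible to the Ascend scheduler config.
cfg = {"ascend_scheduler_config": {}}
propagate_chunked_prefill(cfg, enable_chunked_prefill=True)
assert cfg["ascend_scheduler_config"] == {"enable_chunked_prefill": True}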

vllm_ascend/worker/model_runner_v1.py

Lines changed: 11 additions & 3 deletions
@@ -120,6 +120,13 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
 
+        additional_config = vllm_config.additional_config
+        if additional_config and additional_config.get(
+                "ascend_scheduler_config", None) is not None:
+            self.use_v0_scheduler = True
+        else:
+            self.use_v0_scheduler = False
+
         self.graph_block_tables = np.zeros(
             (self.vllm_config.scheduler_config.max_num_seqs,
              (self.model_config.max_model_len + self.block_size - 1) //
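
As a quick sketch of the new flag's semantics (detect_v0_scheduler is a hypothetical name), the runner treats any non-None "ascend_scheduler_config" entry, even an empty dict, as opting into the V0-style Ascend scheduler:

def detect_v0_scheduler(additional_config) -> bool:
    # True whenever additional_config carries an "ascend_scheduler_config"
    # entry, mirroring the use_v0_scheduler branch added above.
    return bool(additional_config
                and additional_config.get("ascend_scheduler_config") is not None)

assert detect_v0_scheduler({"ascend_scheduler_config": {}}) is True
assert detect_v0_scheduler({}) is False
assert detect_v0_scheduler(None) is False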
@@ -545,13 +552,14 @@ def _process_reqs(
             block_offsets,
             out=self.slot_mapping_np[:total_num_scheduled_tokens])
 
-        if self.chunked_prefill_enabled:
-            attn_state = AscendAttentionState.ChunkedPrefill
-        elif np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
+        if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
             attn_state = AscendAttentionState.PrefillNoCache
         # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
         elif np.all(num_scheduled_tokens == 1):
             attn_state = AscendAttentionState.DecodeOnly
+        # splitfuse
+        elif not self.use_v0_scheduler or self.chunked_prefill_enabled:
+            attn_state = AscendAttentionState.ChunkedPrefill
         else:
             attn_state = AscendAttentionState.PrefillCacheHit
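
The second hunk reorders the attention-state dispatch: previously, enabling chunked prefill short-circuited every batch to ChunkedPrefill, while after the fix the exact full-prefill and decode-only matches are classified first and ChunkedPrefill becomes the splitfuse fallback. A runnable sketch under that reading (the function name and enum values are illustrative; the branch logic mirrors the diff):

import numpy as np
from enum import Enum

# Names mirror AscendAttentionState in the diff; enum values are illustrative.
class AscendAttentionState(Enum):
    PrefillNoCache = 0
    PrefillCacheHit = 1
    DecodeOnly = 2
    ChunkedPrefill = 3

def select_attn_state(seq_lens: np.ndarray,
                      num_scheduled_tokens: np.ndarray,
                      use_v0_scheduler: bool,
                      chunked_prefill_enabled: bool) -> AscendAttentionState:
    # Post-fix ordering: exact prefill/decode matches are classified first;
    # ChunkedPrefill is only the splitfuse fallback.
    if np.array_equal(seq_lens, num_scheduled_tokens):
        return AscendAttentionState.PrefillNoCache
    if np.all(num_scheduled_tokens == 1):
        return AscendAttentionState.DecodeOnly
    if not use_v0_scheduler or chunked_prefill_enabled:
        return AscendAttentionState.ChunkedPrefill
    return AscendAttentionState.PrefillCacheHit

# With the old ordering, chunked_prefill_enabled=True forced ChunkedPrefill
# even for a pure decode batch; now that batch is classified as DecodeOnly.
print(select_attn_state(np.array([10, 12]), np.array([1, 1]),
                        use_v0_scheduler=False, chunked_prefill_enabled=True))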
557565
