Skip to content

Commit 93952cb

Browse files
committed
fallback v1 FA to chunked_prefill_paged_decode
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
1 parent 6ee8222 commit 93952cb

File tree

1 file changed

+4
-3
lines changed

1 file changed

+4
-3
lines changed

vllm/v1/attention/backends/triton_attn.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,11 +153,12 @@ def forward(
         # performance to make sure it does not introduce any overhead.

         num_queries_per_kv = query.shape[1] // key.shape[1]
-        use_paged_attn = (num_queries_per_kv & (num_queries_per_kv - 1)) != 0
+        use_prefill_decode_attn = (num_queries_per_kv &
+                                   (num_queries_per_kv - 1)) != 0

         num_actual_tokens = attn_metadata.num_actual_tokens

-        if use_paged_attn:
+        if use_prefill_decode_attn:
             key_cache, value_cache = PagedAttention.split_kv_cache(
                 kv_cache, self.num_kv_heads, self.head_size)

@@ -219,7 +220,7 @@ def forward(
         max_seqlen_k = attn_metadata.max_seq_len
         block_table = attn_metadata.block_table

-        if use_paged_attn:
+        if use_prefill_decode_attn:
             # Compute attention and update output up to `num_actual_tokens`.
             chunked_prefill_paged_decode(query=query[:num_actual_tokens],
                                          key=key[:num_actual_tokens],

0 commit comments

Comments (0)