vllm/v1/attention/backends (1 file changed: +4 −3 lines)

@@ -153,11 +153,12 @@ def forward(
         # performance to make sure it does not introduce any overhead.

         num_queries_per_kv = query.shape[1] // key.shape[1]
-        use_paged_attn = (num_queries_per_kv & (num_queries_per_kv - 1)) != 0
+        use_prefill_decode_attn = (num_queries_per_kv &
+                                   (num_queries_per_kv - 1)) != 0

         num_actual_tokens = attn_metadata.num_actual_tokens

-        if use_paged_attn:
+        if use_prefill_decode_attn:
             key_cache, value_cache = PagedAttention.split_kv_cache(
                 kv_cache, self.num_kv_heads, self.head_size)

@@ -219,7 +220,7 @@ def forward(
         max_seqlen_k = attn_metadata.max_seq_len
         block_table = attn_metadata.block_table

-        if use_paged_attn:
+        if use_prefill_decode_attn:
             # Compute attention and update output up to `num_actual_tokens`.
             chunked_prefill_paged_decode(query=query[:num_actual_tokens],
                                          key=key[:num_actual_tokens],
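
For context on the condition the renamed flag captures: `(n & (n - 1)) != 0` is the standard bit trick for testing that a positive integer is not a power of two, so GQA ratios (query heads per KV head) that are not powers of two are routed to the prefill/decode path. A minimal sketch of the test itself; the standalone helper and the demo loop are illustrative assumptions, not code from this PR:

    def use_prefill_decode_attn(num_queries_per_kv: int) -> bool:
        # A power of two has exactly one bit set, so n & (n - 1) clears
        # that bit and yields 0; any other positive integer is nonzero.
        return (num_queries_per_kv & (num_queries_per_kv - 1)) != 0

    # GQA ratio = num_query_heads // num_kv_heads.
    for n in (1, 2, 3, 4, 6, 8):
        print(n, use_prefill_decode_attn(n))
    # -> 1 False, 2 False, 3 True, 4 False, 6 True, 8 False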