@@ -114,6 +114,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config
         self.scheduler_config = vllm_config.scheduler_config
         self.speculative_config = vllm_config.speculative_config
@@ -172,24 +173,6 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
             raise NotImplementedError(
                 "Non-Attention backend is not supported by V1 NPUModelRunner.")
 
-        self.attn_backend = get_attn_backend(
-            self.head_size,
-            self.dtype,
-            self.kv_cache_dtype,
-            self.block_size,
-            self.model_config.is_attention_free,
-            use_mla=self.model_config.use_mla,
-        )
-        if self.attn_backend is None:
-            error_msg = (
-                f"Error with get_att_backend: {self.head_size=}, "
-                f"{self.dtype=}, {self.kv_cache_dtype=}, {self.block_size=}, "
-                f"{self.model_config.is_attention_free=}, "
-                f"{self.model_config.use_mla=}")
-            logger.error(error_msg)
-            raise NotImplementedError(
-                "Non-Attention backend is not supported by V1 GPUModelRunner.")
-
         self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
             weakref.proxy(self))
 
@@ -237,16 +220,6 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
                 pin_memory=True,
                 vocab_size=self.model_config.get_vocab_size(),
             )
-        else:
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_blocks_per_req=self.max_num_blocks_per_req,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-            )
 
         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,
@@ -600,7 +573,10 @@ def _process_reqs(
 
         block_table_indices = (req_indices * self.max_num_blocks_per_req +
                                positions_np // self.block_size)
-        block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
+        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
+            block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
+        else:
+            block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
         block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
         block_offsets = positions_np % self.block_size
         np.add(block_numbers * self.block_size,
@@ -1206,6 +1182,16 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
         import torch_npu
        kv_caches: Dict[str, torch.Tensor] = {}
+        if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
+            self.input_batch = InputBatch(
+                max_num_reqs=self.max_num_reqs,
+                max_model_len=self.model_config.max_model_len,
+                max_num_batched_tokens=self.max_num_tokens,
+                device=self.device,
+                pin_memory=True,
+                vocab_size=self.model_config.get_vocab_size(),
+                block_size=self.cache_config.block_size,
+            )
 
         for kv_cache_group in kv_cache_config.kv_cache_groups:
             kv_cache_spec = kv_cache_group.kv_cache_spec
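
A minimal sketch of the version gate these hunks rely on, for readers following the 0.8.5 -> newer-vLLM transition. The branch logic is taken directly from the diff; the body of vllm_version_is shown here (an exact string comparison against vllm.__version__) is an assumption for illustration only, not necessarily how the real helper is implemented.

    import vllm

    def vllm_version_is(version: str) -> bool:
        # Assumed behavior: exact match against the installed vLLM version string.
        return vllm.__version__ == version

    def get_block_table_cpu(input_batch):
        # vLLM 0.8.5 / 0.8.5.post1 expose a single block table on the batch;
        # newer vLLM groups block tables per KV-cache group, so the first group
        # is selected before reading the CPU tensor (mirrors _process_reqs above).
        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
            return input_batch.block_table.get_cpu_tensor()
        return input_batch.block_table[0].get_cpu_tensor()

The same gate, negated, is what defers InputBatch construction to initialize_kv_cache on newer vLLM, where the constructor additionally takes block_size from the cache config.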