From d23a29af1d8b97a6066f5616fd93009777d3c8d4 Mon Sep 17 00:00:00 2001
From: Alexander Matveev
Date: Thu, 12 Sep 2024 18:44:43 +0000
Subject: [PATCH] [Bugfix] Ensure cuda graph compatible multi-step + flashinfer

---
 vllm/attention/backends/flashinfer.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 58d62e02e87..4054d337316 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -597,9 +597,19 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             # The shape of graph_block_tables is
             # [max batch size, max context len // block size].
             input_block_tables = self.runner.graph_block_tables[:batch_size]
+            max_blocks = input_block_tables.shape[1]
             for i, block_table in enumerate(self.block_tables):
                 if block_table:
-                    input_block_tables[i, :len(block_table)] = block_table
+                    num_blocks = len(block_table)
+                    if num_blocks <= max_blocks:
+                        input_block_tables[i, :num_blocks] = block_table
+                    else:
+                        # It may be possible to have more blocks allocated due
+                        # to lookahead slots of multi-step, however, they are
+                        # not used anyway, so can be safely ignored.
+                        input_block_tables[
+                            i, :max_blocks] = block_table[:max_blocks]
+
             block_tables = torch.from_numpy(input_block_tables).to(
                 device, non_blocking=True)
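
Note: the following is a minimal standalone sketch (not vLLM code) of the clamping behavior introduced by this patch. The array sizes and block ids are illustrative assumptions, not values taken from vLLM; only the truncation logic mirrors the diff above.

# Sketch: clamp per-sequence block tables to the pre-allocated CUDA-graph
# block table width, dropping any extra lookahead blocks from multi-step
# scheduling that the captured graph never reads.
import numpy as np

max_batch_size = 4   # hypothetical graph capture batch size
max_blocks = 8       # hypothetical: max context len // block size
graph_block_tables = np.zeros((max_batch_size, max_blocks), dtype=np.int32)

# One sequence fits; the other has more blocks than the captured width.
block_tables = [
    [3, 7, 9],                          # fits within max_blocks
    [1, 2, 3, 4, 5, 6, 7, 8, 11, 12],   # 10 blocks > max_blocks (8)
]

input_block_tables = graph_block_tables[:len(block_tables)]
for i, block_table in enumerate(block_tables):
    if block_table:
        num_blocks = len(block_table)
        if num_blocks <= max_blocks:
            input_block_tables[i, :num_blocks] = block_table
        else:
            # Extra blocks come from lookahead slots and are not used inside
            # the captured graph, so truncating them is safe.
            input_block_tables[i, :max_blocks] = block_table[:max_blocks]

print(input_block_tables)
# The second row contains only the first 8 block ids of the longer table,
# so the write never overruns the fixed-size graph block table.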