Commit 3772037

fix batching inference
1 parent 12a7c6e commit 3772037

2 files changed: +22 −6 lines changed

vllm/model_executor/models/qwen2_vl.py

Lines changed: 2 additions & 2 deletions

@@ -283,9 +283,9 @@ def forward(
                                       "(b s) ... -> b s ...",
                                       b=batch_size)
         elif is_cpu():
-            bs, seq_length, _, _ = q.shape
+            seq_length = q.size(1)
             q, k, v = [rearrange(x, "b s h d -> b h s d") for x in [q, k, v]]
-            attention_mask = torch.zeros([bs, 1, seq_length, seq_length],
+            attention_mask = torch.zeros([1, seq_length, seq_length],
                                          device=q.device,
                                          dtype=torch.bool)
             for i in range(1, len(cu_seqlens)):
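
Why the batch dimension can be dropped (a minimal sketch, not code from this repo; the shapes and cu_seqlens values below are illustrative assumptions): a boolean mask of shape [1, seq, seq] broadcasts across both the batch and head dimensions of a [batch, heads, seq, seq] score tensor, so a single block-diagonal mask built from cu_seqlens serves every batch entry, while the old [bs, 1, seq, seq] allocation tied the mask to one particular batch size.

import torch

# Illustrative sizes: batch 2, 4 heads, 6 tokens packed as two
# windows of 3 via cu_seqlens-style boundaries.
b, h, s = 2, 4, 6
scores = torch.randn(b, h, s, s)

# Build the mask once, with a leading broadcast dimension.
cu_seqlens = [0, 3, 6]
attention_mask = torch.zeros([1, s, s], dtype=torch.bool)
for i in range(1, len(cu_seqlens)):
    lo, hi = cu_seqlens[i - 1], cu_seqlens[i]
    attention_mask[..., lo:hi, lo:hi] = True

# The [1, s, s] mask broadcasts over the [b, h, s, s] scores.
masked = scores.masked_fill(~attention_mask, float("-inf"))
print(masked.shape)  # torch.Size([2, 4, 6, 6])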

vllm/worker/cpu_model_runner.py

Lines changed: 20 additions & 4 deletions

@@ -187,6 +187,7 @@ def _prepare_prompt(
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[int] = []
         input_positions: List[int] = []
+        input_mrope_positions: List[List[int]] = [[] for _ in range(3)]

         slot_mapping: List[int] = []
         seq_lens: List[int] = []
@@ -216,7 +217,8 @@
             # NOTE(woosuk): Here we assume that the first token in the prompt
             # is always the first token in the sequence.
             if mrope_positions:
-                input_positions.extend(mrope_positions)
+                for idx in range(3):
+                    input_mrope_positions[idx].extend(mrope_positions[idx])
             else:
                 input_positions.extend(list(range(computed_len, seq_len)))

@@ -242,12 +244,18 @@
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)

+        if any(input_mrope_positions):
+            input_positions = None  # type: ignore
+        else:
+            input_mrope_positions = None  # type: ignore
+
         num_prompt_tokens = len(input_tokens)

         input_tokens = torch.tensor(input_tokens,
                                     dtype=torch.long,
                                     device=self.device)  # type: ignore
-        input_positions = torch.tensor(input_positions,
+        input_positions = torch.tensor(input_positions
+                                       or input_mrope_positions,
                                        dtype=torch.long,
                                        device=self.device)  # type: ignore
         slot_mapping = torch.tensor(slot_mapping,
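
What the prompt-path change fixes (sketched below with made-up position values; seq_a and seq_b are hypothetical): M-RoPE yields three position streams per token (temporal, height, width). Extending the flat input_positions list with a 3-row mrope_positions block appends three new rows per sequence, so batching two or more prompts no longer produces a (3, num_tokens) layout. Keeping one accumulator per stream and extending row-wise does.

from typing import List

# Hypothetical (temporal, height, width) positions for two prompts.
seq_a = [[0, 1, 2], [0, 0, 1], [0, 1, 0]]
seq_b = [[0, 1], [0, 1], [0, 0]]

# Old behaviour: a flat extend stacks 3 rows per sequence,
# giving 6 rows for 2 sequences instead of 3.
flat: List[List[int]] = []
flat.extend(seq_a)
flat.extend(seq_b)
print(len(flat))  # 6

# Fixed behaviour: one list per stream, extended row-wise.
input_mrope_positions: List[List[int]] = [[] for _ in range(3)]
for seq in (seq_a, seq_b):
    for idx in range(3):
        input_mrope_positions[idx].extend(seq[idx])
print(input_mrope_positions)
# [[0, 1, 2, 0, 1], [0, 0, 1, 0, 1], [0, 1, 0, 0, 0]]
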
@@ -278,6 +286,7 @@ def _prepare_decode(
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[int] = []
         input_positions: List[int] = []
+        input_mrope_positions: List[List[int]] = [[] for _ in range(3)]
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
         block_tables: List[List[int]] = []
@@ -302,7 +311,8 @@
                     context_len,
                     seq_len,
                 )
-                input_positions.extend(next_pos)
+                for idx in range(3):
+                    input_mrope_positions[idx].extend(next_pos[idx])
             else:
                 input_positions.append(position)

@@ -322,12 +332,18 @@
                     block_table = block_table[-sliding_window_blocks:]
                 block_tables.append(block_table)

+        if any(input_mrope_positions):
+            input_positions = None  # type: ignore
+        else:
+            input_mrope_positions = None  # type: ignore
+
         max_decode_seq_len = max(seq_lens)

         input_tokens = torch.tensor(input_tokens,
                                     dtype=torch.long,
                                     device=self.device)
-        input_positions = torch.tensor(input_positions,
+        input_positions = torch.tensor(input_positions
+                                       or input_mrope_positions,
                                        dtype=torch.long,
                                        device=self.device)
         slot_mapping = torch.tensor(slot_mapping,
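
The decode path mirrors the prompt path, and both tensor conversions lean on the same hand-off: the if any(...) block leaves exactly one of input_positions / input_mrope_positions non-None, so input_positions or input_mrope_positions selects whichever representation the batch actually uses. A minimal sketch with assumed values:

import torch

# After the selection above, exactly one of the two is non-None.
input_positions = None
input_mrope_positions = [[0, 1], [0, 1], [0, 0]]  # hypothetical values

# `x or y` on a None/list pair picks the populated representation.
positions = torch.tensor(input_positions or input_mrope_positions,
                         dtype=torch.long)
print(positions.shape)  # torch.Size([3, 2]): (streams, tokens)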
