Commit 9be34de

Merge remote-tracking branch 'upstream/main' into luka/flash-attn-cmake
2 parents: 3a29a41 + 855c8ae

File tree

6 files changed: +102 additions, -111 deletions

benchmarks/benchmark_serving.py

Lines changed: 5 additions & 5 deletions
@@ -626,9 +626,9 @@ def main(args: argparse.Namespace):
                 prefix_len=args.sonnet_prefix_len,
                 tokenizer=tokenizer,
             )
-            input_requests = [(prompt, prompt_len, output_len)
+            input_requests = [(prompt, prompt_len, output_len, None)
                               for prompt, prompt_formatted, prompt_len,
-                              output_len in input_requests]
+                              output_len, _ in input_requests]
         else:
             assert (
                 tokenizer.chat_template or tokenizer.default_chat_template
@@ -641,9 +641,9 @@ def main(args: argparse.Namespace):
                 prefix_len=args.sonnet_prefix_len,
                 tokenizer=tokenizer,
             )
-            input_requests = [(prompt_formatted, prompt_len, output_len)
+            input_requests = [(prompt_formatted, prompt_len, output_len, None)
                               for prompt, prompt_formatted, prompt_len,
-                              output_len in input_requests]
+                              output_len, _ in input_requests]
 
     elif args.dataset_name == "hf":
         input_requests = sample_hf_requests(
@@ -963,4 +963,4 @@ def main(args: argparse.Namespace):
     )
 
     args = parser.parse_args()
-    main(args)
+    main(args)
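
Note: the benchmark_serving.py change above widens each request tuple from three to four elements and unpacks a fifth field from the sampled sonnet entries. A minimal sketch of the same re-mapping, using made-up sample data (the meaning of the trailing None placeholder is not spelled out in this diff):

# Hypothetical sampled entries: (prompt, prompt_formatted, prompt_len, output_len, extra).
sampled = [
    ("raw prompt", "formatted prompt", 12, 64, None),
]

# Completion-style backends keep the raw prompt ...
input_requests = [(prompt, prompt_len, output_len, None)
                  for prompt, prompt_formatted, prompt_len,
                  output_len, _ in sampled]

# ... chat-style backends keep the formatted prompt instead.
chat_requests = [(prompt_formatted, prompt_len, output_len, None)
                 for prompt, prompt_formatted, prompt_len,
                 output_len, _ in sampled]

print(input_requests)  # [('raw prompt', 12, 64, None)]
print(chat_requests)   # [('formatted prompt', 12, 64, None)]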

benchmarks/benchmark_throughput.py

Lines changed: 0 additions & 1 deletion
@@ -191,7 +191,6 @@ async def run_vllm_async(
         use_v2_block_manager=use_v2_block_manager,
         disable_async_output_proc=disable_async_output_proc,
         worker_use_ray=False,
-        engine_use_ray=False,
         disable_log_requests=True,
     )

csrc/moe/marlin_moe_ops.cu

Lines changed: 28 additions & 51 deletions
@@ -1342,9 +1342,6 @@ __device__ inline void MarlinMoESingle(
 
 template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
           const int threads,          // number of threads in a threadblock
-          const int thread_m_blocks,  // number of 16x16 blocks in the m
-                                      // dimension (batchsize) of the
-                                      // threadblock
           const int thread_n_blocks,  // same for n dimension (output)
           const int thread_k_blocks,  // same for k dimension (reduction)
           const int stages,  // number of stages for the async global->shared
@@ -1459,9 +1456,6 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
 
 template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
           const int threads,          // number of threads in a threadblock
-          const int thread_m_blocks,  // number of 16x16 blocks in the m
-                                      // dimension (batchsize) of the
-                                      // threadblock
           const int thread_n_blocks,  // same for n dimension (output)
           const int thread_k_blocks,  // same for k dimension (reduction)
           const int stages,  // number of stages for the async global->shared
@@ -1515,26 +1509,24 @@ const int STAGES = 4;  // 4 pipeline stages fit into shared memory
 static constexpr int min_thread_n = 64;
 static constexpr int min_thread_k = 64;
 
-#define __CALL_IF_MOE(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS,               \
-                      THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS,           \
-                      NUM_THREADS)                                            \
-  else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&          \
-           thread_n_blocks == THREAD_N_BLOCKS &&                              \
-           thread_k_blocks == THREAD_K_BLOCKS &&                              \
-           has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&  \
-           num_threads == NUM_THREADS) {                                      \
-    cudaFuncSetAttribute(                                                     \
-        MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
-                  THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>,      \
-        cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);         \
-    MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,     \
-              THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>           \
-        <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                    \
-            A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr,     \
-            g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx,            \
-            num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks,          \
-            replicate_input, apply_weights, m_block, max_par,                 \
-            exec_cfg.max_m_blocks);                                           \
+#define __CALL_IF_MOE(W_TYPE, THREAD_N_BLOCKS, THREAD_K_BLOCKS, HAS_ACT_ORDER, \
+                      GROUP_BLOCKS, NUM_THREADS)                               \
+  else if (q_type == W_TYPE && thread_n_blocks == THREAD_N_BLOCKS &&           \
+           thread_k_blocks == THREAD_K_BLOCKS &&                               \
+           has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&   \
+           num_threads == NUM_THREADS) {                                       \
+    cudaFuncSetAttribute(                                                      \
+        MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,  \
+                  STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>,                        \
+        cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);          \
+    MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,      \
+              STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>                             \
+        <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                     \
+            A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr,      \
+            g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx,             \
+            num_experts, topk, prob_m, prob_n, prob_k, tot_m, locks,           \
+            replicate_input, apply_weights, m_block, max_par,                  \
+            exec_cfg.max_m_blocks);                                            \
 }
 
 typedef struct {
@@ -1711,31 +1703,16 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
   return exec_config_t{0, {-1, -1, -1}};
 }
 
-#define CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)            \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)    \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)    \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)    \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)    \
-                                                                        \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)   \
-                                                                        \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)   \
-                                                                        \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)   \
-                                                                        \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+#define CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)         \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)    \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)    \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)    \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)    \
+                                                                     \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)   \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)   \
+  __CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
 
 void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
                          const void* sorted_ids, const void* topk_weights,
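
Note: the csrc/moe/marlin_moe_ops.cu change drops thread_m_blocks from the kernel's template parameter list, so the m (batch) dimension no longer multiplies the number of MarlinMoE instantiations that CALL_IF_MOE requests; the four repeated (true, 0) invocations in the new macro expand to identical branches and resolve to a single instantiation. A rough, illustrative count (an assumption-based sketch, not taken from the commit) of distinct instantiations per (N_BLOCKS, K_BLOCKS, NUM_THREADS) combination:

# Illustrative count of distinct MarlinMoE template instantiations requested
# per (N_BLOCKS, K_BLOCKS, NUM_THREADS) combination, before and after the
# thread_m_blocks template parameter is removed (sketch only).
act_order_group_combos = [(True, 0), (False, -1), (False, 2), (False, 4), (False, 8)]
m_blocks = [1, 2, 3, 4]  # previously enumerated as compile-time THREAD_M_BLOCKS

before = {(m, act, grp) for m in m_blocks for (act, grp) in act_order_group_combos}
after = {(act, grp) for (act, grp) in act_order_group_combos}

print(len(before), len(after))  # 20 vs. 5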

tests/worker/test_encoder_decoder_model_runner.py

Lines changed: 63 additions & 25 deletions
@@ -273,7 +273,8 @@ def test_prepare_prompt(batch_size):
                     "unsupported for encoder/ "
                     "decoder models")
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
-def test_prepare_decode(batch_size):
+@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
+def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
     '''
     Test the ability of the encoder/decoder model runner subclass to
     produce decode-phase model inputs & attention metadata.
@@ -288,6 +289,7 @@ def test_prepare_decode(batch_size):
     Arguments:
 
     * batch_size
+    * multiple_seqs_per_seq_group
     * backend_name: The attention backend under test
     * enforce_eager: Enforce eager mode if True (i.e. no CUDAGraph)
     '''
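
For reference, stacking pytest.mark.parametrize decorators as in the updated test runs the test body once per combination of the two parameters. A small self-contained sketch with illustrative values (the real test uses BATCH_SIZES from the test module):

import pytest

# Stacked parametrize decorators produce the cross-product of their values,
# so this sketch generates 3 x 2 = 6 test cases.
@pytest.mark.parametrize("batch_size", [1, 4, 16])
@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
def test_example(batch_size, multiple_seqs_per_seq_group):
    assert batch_size > 0 and isinstance(multiple_seqs_per_seq_group, bool)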
@@ -305,29 +307,40 @@ def test_prepare_decode(batch_size):
     seq_lens: List[int] = []
     encoder_seq_lens: List[int] = []
     seq_group_metadata_list: List[SequenceGroupMetadata] = []
-    block_tables = {0: [1]}
+    block_tables = {
+        0: [1],
+        1: [3]
+    } if multiple_seqs_per_seq_group else {
+        0: [1]
+    }
     cross_block_table = [2]
     for i in range(batch_size):
         # make sure all tokens fit into one block
         seq_len = i % (model_runner.block_size - 1) + 1
-        seq_lens.append(seq_len)
         seq_data = SequenceData(
             array(VLLM_TOKEN_ID_ARRAY_TYPE, (range(seq_len))))
         encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
-        encoder_seq_lens.append(encoder_seq_len)
         encoder_seq_data = SequenceData(
             array(VLLM_TOKEN_ID_ARRAY_TYPE, (range(encoder_seq_len))))
+
         seq_group_metadata = SequenceGroupMetadata(
             request_id=f"test_{i}",
             is_prompt=False,
-            seq_data={0: seq_data},
+            seq_data={
+                0: seq_data,
+                1: seq_data
+            } if multiple_seqs_per_seq_group else {0: seq_data},
             sampling_params=SamplingParams(temperature=0),
             block_tables=block_tables,
             encoder_seq_data=encoder_seq_data,
             cross_block_table=cross_block_table,
         )
         assert seq_group_metadata.token_chunk_size == 1
         seq_group_metadata_list.append(seq_group_metadata)
+        seq_lens.extend(
+            [seq_len for _ in range(len(seq_group_metadata.seq_data))])
+        encoder_seq_lens.extend(
+            [encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))])
 
     # Build
     # * Decoder model inputs
@@ -398,19 +411,24 @@ def test_prepare_decode(batch_size):
 
     # Verify block tables are correct for prompts
     # - Decoder self-attention
-    expected = torch.tensor(
-        [block_tables[0] for _ in range(len(seq_group_metadata_list))],
-        dtype=torch.int32,
-        device=model_runner.device)
+    flattened_block_tables = [
+        block_table for block_table in block_tables.values()
+    ]
+    expected = torch.tensor(flattened_block_tables *
+                            len(seq_group_metadata_list),
+                            dtype=torch.int32,
+                            device=model_runner.device)
     assert torch.equal(
         attn_metadata.block_tables,
         expected,
     )
     # - Encoder/decoder cross-attention
-    expected = torch.tensor(
-        [cross_block_table for _ in range(len(seq_group_metadata_list))],
-        dtype=torch.int32,
-        device=model_runner.device)
+    expected = torch.tensor([
+        cross_block_table for seq_group_metadata in seq_group_metadata_list
+        for _ in range(len(seq_group_metadata.seq_data))
+    ],
+                            dtype=torch.int32,
+                            device=model_runner.device)
     assert torch.equal(
         attn_metadata.cross_block_tables,
         expected,
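
A worked example of the expected block-table expansion above, using illustrative values rather than the test's real tensors: when multiple_seqs_per_seq_group is True, every sequence in a group reuses that group's block tables, so the expected rows repeat once per sequence rather than once per group.

# Illustrative expansion of the expected block tables (values made up).
block_tables = {0: [1], 1: [3]}
cross_block_table = [2]
num_seq_groups = 2   # pretend seq_group_metadata_list has two entries
seqs_per_group = 2   # two decoder sequences per group

flattened_block_tables = [bt for bt in block_tables.values()]
decoder_expected = flattened_block_tables * num_seq_groups
cross_expected = [cross_block_table
                  for _ in range(num_seq_groups)
                  for _ in range(seqs_per_group)]

print(decoder_expected)  # [[1], [3], [1], [3]]
print(cross_expected)    # [[2], [2], [2], [2]]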
@@ -474,7 +492,8 @@ def test_prepare_decode(batch_size):
 
 
 @pytest.mark.parametrize("batch_size", list(range(1, 257)))
-def test_prepare_decode_cuda_graph(batch_size):
+@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
+def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
     """
     Tests that for encoder-decoder models with CUDA Graph capture and replay
     enabled, the tensors used during the decode phase are correctly padded
@@ -489,32 +508,45 @@ def test_prepare_decode_cuda_graph(batch_size):
         enable_chunked_prefill=False,
         enforce_eager=False,
     )
-
+    block_tables = {
+        0: [1],
+        1: [3]
+    } if multiple_seqs_per_seq_group else {
+        0: [1]
+    }
     seq_lens: List[int] = []
     encoder_seq_lens: List[int] = []
     seq_group_metadata_list: List[SequenceGroupMetadata] = []
-    block_tables = {0: [1]}
+
     cross_block_table = [2]
+    expanded_batch_size = 0
     for i in range(batch_size):
         # make sure all tokens fit into one block
         seq_len = i % (model_runner.block_size - 1) + 1
-        seq_lens.append(seq_len)
         seq_data = SequenceData(
             array(VLLM_TOKEN_ID_ARRAY_TYPE, (range(seq_len))))
         encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1
-        encoder_seq_lens.append(encoder_seq_len)
         encoder_seq_data = SequenceData(
             array(VLLM_TOKEN_ID_ARRAY_TYPE, (range(encoder_seq_len))))
         seq_group_metadata = SequenceGroupMetadata(
             request_id=f"test_{i}",
             is_prompt=False,
-            seq_data={0: seq_data},
+            seq_data={
+                0: seq_data,
+                1: seq_data
+            } if multiple_seqs_per_seq_group else {0: seq_data},
             sampling_params=SamplingParams(temperature=0),
             block_tables=block_tables,
             encoder_seq_data=encoder_seq_data,
             cross_block_table=cross_block_table,
         )
         assert seq_group_metadata.token_chunk_size == 1
+        seq_lens.extend(
+            [seq_len for _ in range(len(seq_group_metadata.seq_data))])
+        encoder_seq_lens.extend(
+            [encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))])
+        expanded_batch_size = expanded_batch_size + len(
+            seq_group_metadata.seq_data)
         seq_group_metadata_list.append(seq_group_metadata)
 
     model_input = model_runner.prepare_model_input(seq_group_metadata_list)
@@ -530,8 +562,8 @@ def test_prepare_decode_cuda_graph(batch_size):
     # With CUDA Graph capture and replay enabled, the decoder and encoder
     # input sequences will be padded. Create the expected padded tensors
     # accordingly.
-    graph_batch_size = _get_graph_batch_size(batch_size)
-    cuda_graph_pad_size = graph_batch_size - batch_size
+    graph_batch_size = _get_graph_batch_size(expanded_batch_size)
+    cuda_graph_pad_size = graph_batch_size - expanded_batch_size
     padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size))
     padded_encoder_seq_lens = encoder_seq_lens + list(
         itertools.repeat(1, cuda_graph_pad_size))
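
The CUDA-graph padding above now keys off expanded_batch_size (the number of decode sequences) rather than batch_size (the number of sequence groups). A small sketch of the arithmetic, with a hypothetical round-up helper standing in for _get_graph_batch_size (whose exact rounding rule is not shown in this diff):

import itertools

def round_up_to_capture_size(n: int, step: int = 8) -> int:
    # Hypothetical stand-in for _get_graph_batch_size: round up to the
    # next multiple of `step`.
    return ((n + step - 1) // step) * step

seqs_per_group = 2
batch_size = 3                                        # sequence groups
expanded_batch_size = batch_size * seqs_per_group     # 6 decode sequences
seq_lens = [1, 1, 2, 2, 3, 3]                         # one entry per sequence

graph_batch_size = round_up_to_capture_size(expanded_batch_size)  # 8
cuda_graph_pad_size = graph_batch_size - expanded_batch_size      # 2
padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size))
print(padded_seq_lens)  # [1, 1, 2, 2, 3, 3, 1, 1]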
@@ -560,10 +592,13 @@ def test_prepare_decode_cuda_graph(batch_size):
 
     # Verify block tables are correct for prompts
     # - Decoder self-attention. Pad the block tables as expected.
-    expected = [block_tables[0] for _ in range(batch_size)]
-    expected.extend([[] for _ in range(cuda_graph_pad_size)])
+    flattened_block_tables = [
+        block_table for _ in range(len(seq_group_metadata_list))
+        for block_table in block_tables.values()
+    ]
+    flattened_block_tables.extend([[] for _ in range(cuda_graph_pad_size)])
     expected = make_tensor_with_pad(
-        expected,
+        flattened_block_tables,
         max_len=64,
         pad=0,
         dtype=torch.int32,
@@ -575,7 +610,10 @@ def test_prepare_decode_cuda_graph(batch_size):
     )
     # - Encoder/decoder cross-attention. Pad the cross-attention block tables
     # as expected.
-    expected = [cross_block_table for _ in range(len(seq_group_metadata_list))]
+    expected = [
+        cross_block_table for seq_group_metadata in seq_group_metadata_list
+        for _ in range(len(seq_group_metadata.seq_data))
+    ]
     expected.extend([[] for _ in range(cuda_graph_pad_size)])
     expected = make_tensor_with_pad(
         expected,
