@@ -186,110 +186,109 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
["disable_logprobs" ])
-# TODO: Open it when vllm-ascend support graph mode and
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "enforce_eager": False,
-
-#         # Print spec metrics.
-#         "disable_log_stats": False,
-
-#         # Precision
-#         "dtype": PRECISION,
-
-#         # Main model
-#         "model_name": MAIN_MODEL,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "model": SPEC_MODEL,
-#             "num_speculative_tokens": MAX_SPEC_TOKENS,
-#         },
-#     },
-# ])
-# @pytest.mark.parametrize("output_len", [
-#     128,
-# ])
-# @pytest.mark.parametrize("batch_size", [1, 32])
-# @pytest.mark.parametrize("seed", [1])
-# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
-# def test_medusa_e2e_greedy_correctness_cuda_graph(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         seed: int, prefill_chunk_size: int):
-#     """Verify greedy equality with cuda graph enabled and different
-#     batch sizes."""
-#     maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   seed=seed,
-#                                   temperature=0.0)
-
-# TODO: There is a problem with the preemptive scheduling in the current
-# version, which makes this case fail. Please release this case after the
-# preemptive scheduling problem is solved.
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "block_size": 8,
-#         # 2 for small prompt, 256//8 for generated.
-#         "num_gpu_blocks_override": 2 + 256 // 8,
-#         "max_model_len": (2 + 256 // 8) * 8,
-
-#         # Skip cuda graph recording for fast test.
-#         "enforce_eager": True,
-
-#         # Precision
-#         "dtype": PRECISION,
-
-#         # Main model
-#         "model_name": MAIN_MODEL,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "model": SPEC_MODEL,
-#             "num_speculative_tokens": MAX_SPEC_TOKENS,
-#         },
-#     },
-# ])
-# @pytest.mark.parametrize(
-#     "output_len",
-#     [
-#         # Use small output len for fast test.
-#         128,
-#     ])
-# @pytest.mark.parametrize("batch_size", [4])
-# @pytest.mark.parametrize("seed", [1])
-# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
-# def test_medusa_e2e_greedy_correctness_with_preemption(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         seed: int, prefill_chunk_size: int):
-#     """Verify greedy equality, even when some sequences are preempted mid-
-#     generation.
-#     """
-#     maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   seed=seed,
-#                                   temperature=0.0)
+@pytest.mark.skipif(True, reason="Open it when graph mode ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "enforce_eager": False,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "model": SPEC_MODEL,
+            "num_speculative_tokens": MAX_SPEC_TOKENS,
+        },
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
+def test_medusa_e2e_greedy_correctness_cuda_graph(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int, prefill_chunk_size: int):
+    """Verify greedy equality with cuda graph enabled and different
+    batch sizes."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
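Review note: `prefill_chunk_size` only takes effect through `maybe_enable_chunked_prefill`, a helper defined earlier in this test suite. A minimal sketch of what such a helper is assumed to do (the body below is an assumption for illustration, not this repo's exact implementation):

```python
def maybe_enable_chunked_prefill(prefill_chunk_size: int,
                                 llm_kwargs: dict) -> None:
    """Assumed behavior: a positive chunk size opts the engine into
    chunked prefill and caps the per-step token budget; a non-positive
    value leaves chunked prefill explicitly disabled."""
    if prefill_chunk_size > 0:
        llm_kwargs.update({
            "enable_chunked_prefill": True,
            "max_num_batched_tokens": prefill_chunk_size,
        })
    else:
        llm_kwargs["enable_chunked_prefill"] = False
```

Separately, `@pytest.mark.skipif(True, ...)` always skips; `@pytest.mark.skip(reason=...)` expresses the same intent more directly, though `skipif` leaves room to swap in a real capability check once graph mode lands.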
+
+
+@pytest.mark.skipif(True, reason="Open it when preempt ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 16,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "model": SPEC_MODEL,
+            "num_speculative_tokens": MAX_SPEC_TOKENS,
+        },
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
+def test_medusa_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int, prefill_chunk_size: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
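Review note on the preemption setup: these kwargs deliberately starve the KV cache so the scheduler must preempt some sequences. A worked sketch of that arithmetic, with values copied from the kwargs above (note the `256 // 8` comments still assume the old `block_size` of 8, while the new code sets 16):

```python
# Block-budget arithmetic behind the preemption test, assuming the
# original block_size of 8 that the inline comments were written for.
block_size = 8
num_gpu_blocks = 2 + 256 // block_size        # 34 blocks made available
max_model_len = num_gpu_blocks * block_size   # at most 272 tokens fit

batch_size, output_len = 4, 128
# Blocks the batch needs resident at once (ignoring small prompts):
demand = batch_size * (output_len // block_size)  # 4 * 16 = 64 blocks
assert demand > num_gpu_blocks  # 64 > 34, so some sequences get preempted
```

If the move to `block_size: 16` is intentional, the `num_gpu_blocks_override` and `max_model_len` expressions and their comments should be updated to match; otherwise the budget no longer encodes the "2 for small prompt, 256//8 for generated" intent.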
@pytest.mark.parametrize(