Commit 19c8e13

[CI/UT] fix spec ut in vllm-ascend main and vllm main (vllm-project#759)
### What this PR does / why we need it?

#### 1. Fix spec decode UT on vllm-ascend main with vllm main

As vllm-project#694 and vllm-project#749 verify, the speculative decode UTs pass with vllm-ascend main against vLLM 0.8.5, but CI fails with vllm-ascend main against vLLM main. The cause is a Triton bug, triton-lang/triton#2266. I have not yet figured out why the bug does not affect vllm-ascend main with vLLM 0.8.5; perhaps the way Triton is used changed between vLLM 0.8.5 and the latest main. As the bug report describes, I changed the minimum block_size in the UTs from 8 to 16, and the change has been verified locally to be effective.

#### 2. Change some skipped cases to skipif form

I changed some commented-out cases to the skipif form, which is more standardized.

### Does this PR introduce _any_ user-facing change?

None.

### How was this patch tested?

CI

Signed-off-by: mengwei805 <mengwei25@huawei.com>
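To make the two changes concrete, here is a minimal sketch of the pattern described above; the test name and kwargs are illustrative only and do not come from this PR (the real cases live in tests/singlecard/spec_decode/e2e/): a case marked with pytest.mark.skipif instead of being commented out, and a common_llm_kwargs block whose block_size is at least 16, the workaround for triton-lang/triton#2266.

```python
# Hypothetical example only; real cases launch the model and call
# run_equality_correctness_test() instead of this placeholder body.
import pytest


@pytest.mark.skipif(True, reason="Enable once graph mode is supported.")
@pytest.mark.parametrize("common_llm_kwargs", [{
    "block_size": 16,  # minimum raised from 8 to 16 to avoid the Triton issue
}])
def test_spec_decode_example(common_llm_kwargs):
    # Placeholder assertion standing in for a real correctness check.
    assert common_llm_kwargs["block_size"] >= 16
```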
1 parent 58d2f85 commit 19c8e13

File tree

5 files changed (+281, -289 lines)


.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 2 deletions
@@ -153,8 +153,7 @@ jobs:
             - "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py"

       - name: Run vllm-project/vllm-ascend Speculative Decode test
-        # speculative decode seems will cause oom issue, only disable it now on ci test with vLLM main
-        if: matrix.vllm_verison == 'v0.8.5.post1' && steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
+        if: steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
         run: |
           if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
             pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process

tests/singlecard/spec_decode/e2e/test_medusa_correctness.py

Lines changed: 103 additions & 104 deletions
@@ -186,110 +186,109 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
                                     ["disable_logprobs"])


-# TODO: Open it when vllm-ascend support graph mode and
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "enforce_eager": False,
-
-#         # Print spec metrics.
-#         "disable_log_stats": False,
-
-#         # Precision
-#         "dtype": PRECISION,
-
-#         # Main model
-#         "model_name": MAIN_MODEL,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "model": SPEC_MODEL,
-#             "num_speculative_tokens": MAX_SPEC_TOKENS,
-#         },
-#     },
-# ])
-# @pytest.mark.parametrize("output_len", [
-#     128,
-# ])
-# @pytest.mark.parametrize("batch_size", [1, 32])
-# @pytest.mark.parametrize("seed", [1])
-# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
-# def test_medusa_e2e_greedy_correctness_cuda_graph(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         seed: int, prefill_chunk_size: int):
-#     """Verify greedy equality with cuda graph enabled and different
-#     batch sizes."""
-#     maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   seed=seed,
-#                                   temperature=0.0)
-
-# TODO: There is a problem with the preemptive scheduling in the current
-# version, which makes this case fail. Please release this case after the
-# preemptive scheduling problem is solved.
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         "block_size": 8,
-#         # 2 for small prompt, 256//8 for generated.
-#         "num_gpu_blocks_override": 2 + 256 // 8,
-#         "max_model_len": (2 + 256 // 8) * 8,
-
-#         # Skip cuda graph recording for fast test.
-#         "enforce_eager": True,
-
-#         # Precision
-#         "dtype": PRECISION,
-
-#         # Main model
-#         "model_name": MAIN_MODEL,
-#     }])
-# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_config": {
-#             "model": SPEC_MODEL,
-#             "num_speculative_tokens": MAX_SPEC_TOKENS,
-#         },
-#     },
-# ])
-# @pytest.mark.parametrize(
-#     "output_len",
-#     [
-#         # Use small output len for fast test.
-#         128,
-#     ])
-# @pytest.mark.parametrize("batch_size", [4])
-# @pytest.mark.parametrize("seed", [1])
-# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
-# def test_medusa_e2e_greedy_correctness_with_preemption(
-#         vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-#         baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-#         seed: int, prefill_chunk_size: int):
-#     """Verify greedy equality, even when some sequences are preempted mid-
-#     generation.
-#     """
-#     maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-#     run_equality_correctness_test(vllm_runner,
-#                                   common_llm_kwargs,
-#                                   per_test_common_llm_kwargs,
-#                                   baseline_llm_kwargs,
-#                                   test_llm_kwargs,
-#                                   batch_size,
-#                                   max_output_len=output_len,
-#                                   seed=seed,
-#                                   temperature=0.0)
+@pytest.mark.skipif(True, reason="Open it when graph mode ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "enforce_eager": False,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "model": SPEC_MODEL,
+            "num_speculative_tokens": MAX_SPEC_TOKENS,
+        },
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
+def test_medusa_e2e_greedy_correctness_cuda_graph(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int, prefill_chunk_size: int):
+    """Verify greedy equality with cuda graph enabled and different
+    batch sizes."""
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)
+
+
+@pytest.mark.skipif(True, reason="Open it when preempt ready.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 16,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_config": {
+            "model": SPEC_MODEL,
+            "num_speculative_tokens": MAX_SPEC_TOKENS,
+        },
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
+def test_medusa_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int, prefill_chunk_size: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  seed=seed,
+                                  temperature=0.0)


 @pytest.mark.parametrize(
