
Commit e1ea867

add support

1 parent ce1907f commit e1ea867

6 files changed: 98 additions & 89 deletions

.buildkite/test-pipeline.yaml

Lines changed: 4 additions & 2 deletions
@@ -118,7 +118,9 @@ steps:
   - vllm/core/
   - tests/distributed
   - tests/spec_decode/e2e/test_integration_dist_tp4
+  - tests/compile
   commands:
+  - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

@@ -224,7 +226,7 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph_smoke.py
+  - pytest -v -s compile/test_basic_correctness.py

 - label: "PyTorch Fullgraph Test" # 18min
   source_file_dependencies:

@@ -388,7 +390,7 @@ steps:
   - tests/distributed/
   - vllm/compilation
   commands:
-  - pytest -v -s ./compile/test_full_graph_multi_gpu.py
+  - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus

tests/compile/test_full_graph.py

Lines changed: 8 additions & 5 deletions
@@ -1,13 +1,16 @@
 import pytest

-from vllm.compilation.backends import vllm_backend
-
+from ..utils import fork_new_process_for_each_test
 from .utils import TEST_MODELS, check_full_graph_support


 @pytest.mark.parametrize("model_info", TEST_MODELS)
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-def test_full_graph(model_info, backend):
+@pytest.mark.parametrize("optimization_level", [1, 2])
+@fork_new_process_for_each_test
+def test_full_graph(model_info, optimization_level):
     model = model_info[0]
     model_kwargs = model_info[1]
-    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
+    check_full_graph_support(model,
+                             model_kwargs,
+                             optimization_level,
+                             tp_size=1)

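Note: the fork_new_process_for_each_test decorator imported above from tests/utils.py is not shown in the hunks of this commit. Assuming its job is simply to run each parametrized case in a freshly forked child process, so that torch.compile and CUDA state set for one optimization level cannot leak into the next, a minimal illustrative sketch (not the actual vLLM helper) could look like this:

# Minimal sketch of a fork-per-test decorator. Illustrative only; the real
# helper in tests/utils.py may also handle skips, signals, and process groups.
import functools
import os


def fork_new_process_for_each_test(f):

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child process: run the test body and report the outcome
            # through the exit code so no state survives the test.
            try:
                f(*args, **kwargs)
            except BaseException:
                os._exit(1)
            else:
                os._exit(0)
        # Parent process: wait for the child and fail if it did not exit 0.
        _, status = os.waitpid(pid, 0)
        assert status == 0, f"forked test process failed with status {status}"

    return wrapper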
tests/compile/test_full_graph_multi_gpu.py

Lines changed: 0 additions & 22 deletions
This file was deleted.

tests/compile/test_full_graph_smoke.py

Lines changed: 0 additions & 13 deletions
This file was deleted.

tests/compile/utils.py

Lines changed: 10 additions & 12 deletions
@@ -4,14 +4,12 @@

 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.plugins import set_torch_compile_backend
 from vllm.utils import is_hip

 TEST_MODELS_SMOKE = [
-    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
-        "quantization": "compressed-tensors"
-    }),
-    ("meta-llama/Meta-Llama-3-8B", {}),
+    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
+     ["--quantization", "compressed-tensors"]),
+    ("meta-llama/Meta-Llama-3-8B", []),
 ]

 TEST_MODELS = [

@@ -68,20 +66,20 @@
     }))


-def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
+def check_full_graph_support(model,
+                             model_kwargs,
+                             optimization_level,
+                             tp_size=1):
     # make sure these models can be captured in full graph mode
-    if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
-        os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
-        os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
+    os.environ["VLLM_TEST_TORCH_COMPILE_LEVEL"] = str(optimization_level)
+    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

     # Inductor doesn't support fp8/gptq_marlin_24 yet.
     quantization = model_kwargs.get("quantization")
     if (quantization == "fp8" or quantization == "gptq_marlin"
-            or quantization == "gptq_marlin_24") and backend != "eager":
+            or quantization == "gptq_marlin_24") and optimization_level > 1:
         return

-    set_torch_compile_backend(backend)
-
     prompts = [
         "Hello, my name is",
         "The president of the United States is",

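The TEST_MODELS_SMOKE entries now pair each model with a list of CLI-style arguments instead of LLM constructor kwargs, which suggests they are meant to be fed to an API-server based test such as the compare_all_settings helper added below in tests/utils.py. A hypothetical consumer, not part of this commit, might look like the following (the environment override reuses the VLLM_TEST_TORCH_COMPILE_LEVEL variable from the hunk above purely for illustration):

# Hypothetical consumer of the new (model, cli_args) smoke-test entries.
# Not part of this commit; module paths follow the repo's tests/ layout.
from tests.compile.utils import TEST_MODELS_SMOKE
from tests.utils import compare_all_settings


def run_smoke_comparison():
    for model, cli_args in TEST_MODELS_SMOKE:
        # Launch the API server with the same CLI arguments under two
        # assumed torch.compile levels and compare the responses.
        compare_all_settings(
            model,
            all_args=[cli_args, cli_args],
            all_envs=[
                {"VLLM_TEST_TORCH_COMPILE_LEVEL": "1"},
                {"VLLM_TEST_TORCH_COMPILE_LEVEL": "2"},
            ],
        )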
tests/utils.py

Lines changed: 76 additions & 35 deletions
@@ -180,18 +180,34 @@ def compare_two_settings(model: str,
         env1: The first set of environment variables to pass to the API server.
         env2: The second set of environment variables to pass to the API server.
     """
+    compare_all_settings(model, [arg1, arg2], [env1, env2], max_wait_seconds)
+
+
+def compare_all_settings(model: str,
+                         all_args: List[List[str]],
+                         all_envs: List[Optional[Dict[str, str]]],
+                         max_wait_seconds: Optional[float] = None) -> None:
+    """
+    Launch API server with several different sets of arguments/environments
+    and compare the results of the API calls with the first set of arguments.
+    Args:
+        model: The model to test.
+        all_args: A list of argument lists to pass to the API server.
+        all_envs: A list of environment dictionaries to pass to the API server.
+    """

     trust_remote_code = "--trust-remote-code"
-    if trust_remote_code in arg1 or trust_remote_code in arg2:
+    if any(trust_remote_code in args for args in all_args):
         tokenizer = AutoTokenizer.from_pretrained(model,
                                                   trust_remote_code=True)
     else:
         tokenizer = AutoTokenizer.from_pretrained(model)

     prompt = "Hello, my name is"
     token_ids = tokenizer(prompt)["input_ids"]
-    results = []
-    for args, env in ((arg1, env1), (arg2, env2)):
+    ref_results: List = []
+    for i, (args, env) in enumerate(zip(all_args, all_envs)):
+        compare_results: List = []
         with RemoteOpenAIServer(model,
                                 args,
                                 env_dict=env,

@@ -202,10 +218,13 @@ def compare_two_settings(model: str,
             models = client.models.list()
             models = models.data
             served_model = models[0]
-            results.append({
-                "test": "models_list",
-                "id": served_model.id,
-                "root": served_model.root,
+            (ref_results if i == 0 else compare_results).append({
+                "test":
+                "models_list",
+                "id":
+                served_model.id,
+                "root":
+                served_model.root,
             })

             # test with text prompt

@@ -214,11 +233,15 @@ def compare_two_settings(model: str,
                                                    max_tokens=5,
                                                    temperature=0.0)

-            results.append({
-                "test": "single_completion",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
+            (ref_results if i == 0 else compare_results).append({
+                "test":
+                "single_completion",
+                "text":
+                completion.choices[0].text,
+                "finish_reason":
+                completion.choices[0].finish_reason,
+                "usage":
+                completion.usage,
             })

             # test using token IDs

@@ -229,11 +252,15 @@ def compare_two_settings(model: str,
                 temperature=0.0,
             )

-            results.append({
-                "test": "token_ids",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
+            (ref_results if i == 0 else compare_results).append({
+                "test":
+                "token_ids",
+                "text":
+                completion.choices[0].text,
+                "finish_reason":
+                completion.choices[0].finish_reason,
+                "usage":
+                completion.usage,
             })

             # test seeded random sampling

@@ -243,11 +270,15 @@ def compare_two_settings(model: str,
                                                    seed=33,
                                                    temperature=1.0)

-            results.append({
-                "test": "seeded_sampling",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
+            (ref_results if i == 0 else compare_results).append({
+                "test":
+                "seeded_sampling",
+                "text":
+                completion.choices[0].text,
+                "finish_reason":
+                completion.choices[0].finish_reason,
+                "usage":
+                completion.usage,
             })

             # test seeded random sampling with multiple prompts

@@ -257,7 +288,7 @@ def compare_two_settings(model: str,
                                                    seed=33,
                                                    temperature=1.0)

-            results.append({
+            (ref_results if i == 0 else compare_results).append({
                 "test":
                 "seeded_sampling",
                 "text": [choice.text for choice in completion.choices],

@@ -275,10 +306,13 @@ def compare_two_settings(model: str,
                 temperature=0.0,
             )

-            results.append({
-                "test": "simple_list",
-                "text0": batch.choices[0].text,
-                "text1": batch.choices[1].text,
+            (ref_results if i == 0 else compare_results).append({
+                "test":
+                "simple_list",
+                "text0":
+                batch.choices[0].text,
+                "text1":
+                batch.choices[1].text,
             })

             # test streaming

@@ -294,18 +328,25 @@ def compare_two_settings(model: str,
                 assert len(chunk.choices) == 1
                 choice = chunk.choices[0]
                 texts[choice.index] += choice.text
-            results.append({
+            (ref_results if i == 0 else compare_results).append({
                 "test": "streaming",
                 "texts": texts,
             })

-    n = len(results) // 2
-    arg1_results = results[:n]
-    arg2_results = results[n:]
-    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
-        assert arg1_result == arg2_result, (
-            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
-            f"{arg1_result=} != {arg2_result=}")
+        if i > 0:
+            # if any setting fails, raise an error early
+            ref_args = all_args[0]
+            ref_envs = all_envs[0]
+            compare_args = all_args[i]
+            compare_envs = all_envs[i]
+            for ref_result, compare_result in zip(ref_results,
+                                                  compare_results):
+                assert ref_result == compare_result, (
+                    f"Results for {model=} are not the same.\n"
+                    f"{ref_args=} {ref_envs=}\n"
+                    f"{compare_args=} {compare_envs=}\n"
+                    f"{ref_result=}\n"
+                    f"{compare_result=}\n")


 def init_test_distributed_environment(

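For reference, an illustrative call of the new compare_all_settings helper with three configurations follows; compare_two_settings is now just the two-element special case, and each non-reference setting is checked against the first one as soon as its server run completes. The environment overrides below are example assumptions, not values taken from this commit:

# Illustrative usage of compare_all_settings (signature as added above).
# The env var values here are example assumptions, not from this commit.
from tests.utils import compare_all_settings

compare_all_settings(
    "meta-llama/Meta-Llama-3-8B",
    all_args=[[], [], []],  # same server CLI arguments for every launch
    all_envs=[
        None,                                    # reference environment
        {"VLLM_TEST_TORCH_COMPILE_LEVEL": "1"},  # assumed override
        {"VLLM_TEST_TORCH_COMPILE_LEVEL": "2"},  # assumed override
    ],
)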