# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
+ # optional(bool): never run this test by default (i.e. you need to unblock it manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amd]
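#
# A hypothetical step using these keys might look like the following sketch
# (illustrative only; not an actual entry in this pipeline):
#
# - label: Example Test # 5min
#   fast_check: true
#   mirror_hardwares: [amd]
#   optional: false
#   source_file_dependencies:
#   - vllm/example
#   - tests/example
#   commands:
#   - pytest -v -s example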
@@ -39,17 +40,19 @@ steps:
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/dev/sampling_params.html

- - label: Async Engine, Inputs, Utils, Worker Test # 15min
+ - label: Async Engine, Inputs, Utils, Worker Test # 24min
  fast_check: true
  source_file_dependencies:
  - vllm/
+ - tests/mq_llm_engine
  - tests/async_engine
  - tests/test_inputs
  - tests/multimodal
  - tests/test_utils
  - tests/worker
  commands:
- - pytest -v -s async_engine # Async Engine
+ - pytest -v -s mq_llm_engine # MQLLMEngine
+ - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
@@ -61,14 +64,22 @@ steps:
  fast_check: true
  source_file_dependencies:
  - vllm/
- - tests/basic_correctness
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_preemption
  commands:
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
+ - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+ - label: Chunked Prefill Test
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_chunked_prefill
+ commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
+

- label: Core Test # 10min
  mirror_hardwares: [amd]
  fast_check: true
@@ -79,18 +90,22 @@ steps:
  commands:
  - pytest -v -s core

- - label: Entrypoints Test # 20min
+ - label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
- # mirror_hardwares: [amd]
+ mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- - pytest -v -s entrypoints/openai
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+ - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s entrypoints/test_chat_utils.py
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -145,7 +160,7 @@ steps:
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- - label: Examples Test # 12min
+ - label: Examples Test # 15min
  working_dir: "/vllm-workspace/examples"
  # mirror_hardwares: [amd]
  source_file_dependencies:
@@ -163,22 +178,15 @@ steps:
  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference_encoder_decoder.py

- - label: torch compile integration test
- source_file_dependencies:
- - vllm/
- commands:
- - pytest -v -s ./compile/test_full_graph.py
- - pytest -v -s ./compile/test_wrapper.py
-
- - label: Prefix Caching Test # 7min
+ - label: Prefix Caching Test # 9min
  # mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
  commands:
  - pytest -v -s prefix_caching

- - label: Samplers Test # 18min
+ - label: Samplers Test # 36min
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
@@ -194,25 +202,38 @@ steps:
  - tests/test_logits_processor
  command: pytest -v -s test_logits_processor.py
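  # note: this step uses the singular command key (one string) rather than a
  # commands list; per the schema notes at the top of this file, the two are
  # mutually exclusive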

- - label: Speculative decoding tests # 22min
+ - label: Speculative decoding tests # 30min
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
  commands:
- # See https://github.com/vllm-project/vllm/issues/5152
- - export VLLM_ATTENTION_BACKEND=XFORMERS
  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
  - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

- - label: LoRA Test %N # 30min each
+ - label: LoRA Test %N # 15min each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4
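  # How this shards (a sketch, assuming stock Buildkite parallelism and the
  # pytest-shard plugin): parallelism: 4 launches four copies of this step, and
  # each copy sees its own BUILDKITE_PARALLEL_JOB (0-3) and
  # BUILDKITE_PARALLEL_JOB_COUNT (4), which --shard-id/--num-shards use to run a
  # disjoint quarter of the suite. The $$ escapes defer variable expansion from
  # pipeline-upload time to runtime on the agent.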

- - label: Kernels Test %N # 30min each
+ - label: "PyTorch Fullgraph Smoke Test" # 9min
+ fast_check: true
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_full_graph_smoke.py
+
+ - label: "PyTorch Fullgraph Test" # 18min
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_full_graph.py
+
+ - label: Kernels Test %N # 1h each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
@@ -242,7 +263,7 @@ steps:
  - pip install aiohttp
  - bash run-benchmarks.sh

- - label: Quantization Test # 15min
+ - label: Quantization Test # 33min
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
@@ -259,6 +280,13 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

+ - label: Encoder Decoder tests # 5min
+ source_file_dependencies:
+ - vllm/
+ - tests/encoder_decoder
+ commands:
+ - pytest -v -s encoder_decoder
+
- label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
  mirror_hardwares: [amd]
@@ -279,15 +307,15 @@ steps:
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

- - label: Decoder-only Language Models Test # 1h3min
+ - label: Decoder-only Language Models Test # 1h36min
  # mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
  commands:
  - pytest -v -s models/decoder_only/language

- - label: Decoder-only Multi-Modal Models Test # 56min
+ - label: Decoder-only Multi-Modal Models Test # 1h31min
  # mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
@@ -297,15 +325,25 @@ steps:
  - pytest -v -s models/decoder_only/audio_language
  - pytest -v -s models/decoder_only/vision_language

- - label: Other Models Test # 5min
+ - label: Other Models Test # 6min
  # mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
+ - tests/models/encoder_decoder/vision_language
  commands:
  - pytest -v -s models/embedding/language
  - pytest -v -s models/encoder_decoder/language
+ - pytest -v -s models/encoder_decoder/vision_language
+
+ - label: Custom Models Test
+ # mirror_hardwares: [amd]
+ optional: true
+ commands:
+ # PR authors can temporarily add commands below to test individual models
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

##### 1 GPU test #####
##### multi gpus test #####
@@ -338,7 +376,7 @@ steps:
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

- - label: Distributed Tests (2 GPUs) # 28min
+ - label: Distributed Tests (2 GPUs) # 40min
  # mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@@ -348,18 +386,23 @@ steps:
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
+ - vllm/compilation
  commands:
+ - pytest -v -s ./compile/test_full_graph_multi_gpu.py
+ - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
  # Avoid importing model tests that cause CUDA reinitialization error
- - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+ - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+ - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+ - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- - label: Multi-step Tests (4 GPUs) # 21min
+ - label: Multi-step Tests (4 GPUs) # 36min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -377,7 +420,7 @@ steps:
  - pytest -v -s multi_step/test_correctness_async_llm.py
  - pytest -v -s multi_step/test_correctness_llm.py

- - label: Pipeline Parallelism Test # 23min
+ - label: Pipeline Parallelism Test # 45min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -403,7 +446,7 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s -x lora/test_long_context.py

- - label: Weight Loading Multiple GPU Test
+ - label: Weight Loading Multiple GPU Test # 33min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@@ -436,7 +479,7 @@ steps:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
- - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
  - pytest -v -s -x lora/test_mixtral.py

- label: LM Eval Large Models # optional
0 commit comments