9
9
# label(str): the name of the test. emoji allowed.
10
10
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
11
11
# fast_check_only(bool): run this test on fastcheck pipeline only
12
+ # optional(bool): never run this test by default (i.e. need to unblock manually)
12
13
# command(str): the single command to run for tests. incompatible with commands.
13
14
# commands(list): the list of commands to run for test. incompatible with command.
14
15
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
39
40
# Check API reference (if it fails, you may have missing mock imports)
40
41
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
41
42
42
- - label : Async Engine, Inputs, Utils, Worker Test # 15min
43
+ - label : Async Engine, Inputs, Utils, Worker Test # 24min
43
44
fast_check : true
44
45
source_file_dependencies :
45
46
- vllm/
@@ -63,13 +64,21 @@ steps:
63
64
fast_check : true
64
65
source_file_dependencies :
65
66
- vllm/
66
- - tests/basic_correctness
67
+ - tests/basic_correctness/test_basic_correctness
68
+ - tests/basic_correctness/test_cpu_offload
69
+ - tests/basic_correctness/test_preemption
67
70
commands :
68
71
- pytest -v -s basic_correctness/test_basic_correctness.py
69
72
- pytest -v -s basic_correctness/test_cpu_offload.py
73
+ - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
74
+
75
+ - label : Chunked Prefill Test
76
+ source_file_dependencies :
77
+ - vllm/
78
+ - tests/basic_correctness/test_chunked_prefill
79
+ commands :
70
80
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
71
81
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
72
- - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
73
82
74
83
- label : Core Test # 10min
75
84
mirror_hardwares : [amd]
81
90
commands :
82
91
- pytest -v -s core
83
92
84
- - label : Entrypoints Test # 20min
93
+ - label : Entrypoints Test # 40min
85
94
working_dir : " /vllm-workspace/tests"
86
95
fast_check : true
87
96
mirror_hardwares : [amd]
@@ -151,7 +160,7 @@ steps:
151
160
# OOM in the CI unless we run this separately
152
161
- pytest -v -s tokenization
153
162
154
- - label : Examples Test # 12min
163
+ - label : Examples Test # 15min
155
164
working_dir : " /vllm-workspace/examples"
156
165
# mirror_hardwares: [amd]
157
166
source_file_dependencies :
@@ -169,15 +178,15 @@ steps:
169
178
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
170
179
- python3 offline_inference_encoder_decoder.py
171
180
172
- - label : Prefix Caching Test # 7min
181
+ - label : Prefix Caching Test # 9min
173
182
# mirror_hardwares: [amd]
174
183
source_file_dependencies :
175
184
- vllm/
176
185
- tests/prefix_caching
177
186
commands :
178
187
- pytest -v -s prefix_caching
179
188
180
- - label : Samplers Test # 18min
189
+ - label : Samplers Test # 36min
181
190
source_file_dependencies :
182
191
- vllm/model_executor/layers
183
192
- vllm/sampling_metadata.py
@@ -193,40 +202,38 @@ steps:
193
202
- tests/test_logits_processor
194
203
command : pytest -v -s test_logits_processor.py
195
204
196
- - label : Speculative decoding tests # 22min
205
+ - label : Speculative decoding tests # 30min
197
206
source_file_dependencies :
198
207
- vllm/spec_decode
199
208
- tests/spec_decode
200
209
commands :
201
- # See https://github.com/vllm-project/vllm/issues/5152
202
- - export VLLM_ATTENTION_BACKEND=XFORMERS
203
210
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
204
- - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
211
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
205
212
206
- - label : LoRA Test %N # 30min each
213
+ - label : LoRA Test %N # 15min each
207
214
mirror_hardwares : [amd]
208
215
source_file_dependencies :
209
216
- vllm/lora
210
217
- tests/lora
211
218
command : pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
212
219
parallelism : 4
213
220
214
- - label : " PyTorch Fullgraph Smoke Test"
221
+ - label : " PyTorch Fullgraph Smoke Test" # 9min
215
222
fast_check : true
216
223
source_file_dependencies :
217
224
- vllm/
218
225
- tests/compile
219
226
commands :
220
227
- pytest -v -s compile/test_full_graph_smoke.py
221
228
222
- - label : " PyTorch Fullgraph Test"
229
+ - label : " PyTorch Fullgraph Test" # 18min
223
230
source_file_dependencies :
224
231
- vllm/
225
232
- tests/compile
226
233
commands :
227
234
- pytest -v -s compile/test_full_graph.py
228
235
229
- - label : Kernels Test %N # 30min each
236
+ - label : Kernels Test %N # 1h each
230
237
mirror_hardwares : [amd]
231
238
source_file_dependencies :
232
239
- csrc/
@@ -256,7 +263,7 @@ steps:
256
263
- pip install aiohttp
257
264
- bash run-benchmarks.sh
258
265
259
- - label : Quantization Test # 15min
266
+ - label : Quantization Test # 33min
260
267
source_file_dependencies :
261
268
- csrc/
262
269
- vllm/model_executor/layers/quantization
@@ -300,15 +307,15 @@ steps:
300
307
- pytest -v -s models/test_oot_registration.py # it needs a clean process
301
308
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
302
309
303
- - label : Decoder-only Language Models Test # 1h3min
310
+ - label : Decoder-only Language Models Test # 1h36min
304
311
# mirror_hardwares: [amd]
305
312
source_file_dependencies :
306
313
- vllm/
307
314
- tests/models/decoder_only/language
308
315
commands :
309
316
- pytest -v -s models/decoder_only/language
310
317
311
- - label : Decoder-only Multi-Modal Models Test # 56min
318
+ - label : Decoder-only Multi-Modal Models Test # 1h31min
312
319
# mirror_hardwares: [amd]
313
320
source_file_dependencies :
314
321
- vllm/
@@ -318,15 +325,25 @@ steps:
318
325
- pytest -v -s models/decoder_only/audio_language
319
326
- pytest -v -s models/decoder_only/vision_language
320
327
321
- - label : Other Models Test # 5min
328
+ - label : Other Models Test # 6min
322
329
# mirror_hardwares: [amd]
323
330
source_file_dependencies :
324
331
- vllm/
325
332
- tests/models/embedding/language
326
333
- tests/models/encoder_decoder/language
334
+ - tests/models/encoder_decoder/vision_language
327
335
commands :
328
336
- pytest -v -s models/embedding/language
329
337
- pytest -v -s models/encoder_decoder/language
338
+ - pytest -v -s models/encoder_decoder/vision_language
339
+
340
+ - label : Custom Models Test
341
+ # mirror_hardwares: [amd]
342
+ optional : true
343
+ commands :
344
+ # PR authors can temporarily add commands below to test individual models
345
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
346
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
330
347
331
348
# #### 1 GPU test #####
332
349
# #### multi gpus test #####
@@ -359,7 +376,7 @@ steps:
359
376
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
360
377
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
361
378
362
- - label : Distributed Tests (2 GPUs) # 28min
379
+ - label : Distributed Tests (2 GPUs) # 40min
363
380
# mirror_hardwares: [amd]
364
381
working_dir : " /vllm-workspace/tests"
365
382
num_gpus : 2
@@ -376,14 +393,16 @@ steps:
376
393
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
377
394
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
378
395
# Avoid importing model tests that cause CUDA reinitialization error
379
- - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
396
+ - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
397
+ - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
398
+ - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
380
399
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
381
400
- pip install -e ./plugins/vllm_add_dummy_model
382
401
- pytest -v -s distributed/test_distributed_oot.py
383
402
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
384
403
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
385
404
386
- - label : Multi-step Tests (4 GPUs) # 21min
405
+ - label : Multi-step Tests (4 GPUs) # 36min
387
406
working_dir : " /vllm-workspace/tests"
388
407
num_gpus : 4
389
408
source_file_dependencies :
@@ -401,7 +420,7 @@ steps:
401
420
- pytest -v -s multi_step/test_correctness_async_llm.py
402
421
- pytest -v -s multi_step/test_correctness_llm.py
403
422
404
- - label : Pipeline Parallelism Test # 23min
423
+ - label : Pipeline Parallelism Test # 45min
405
424
working_dir : " /vllm-workspace/tests"
406
425
num_gpus : 4
407
426
source_file_dependencies :
@@ -427,7 +446,7 @@ steps:
427
446
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
428
447
- pytest -v -s -x lora/test_long_context.py
429
448
430
- - label : Weight Loading Multiple GPU Test
449
+ - label : Weight Loading Multiple GPU Test # 33min
431
450
working_dir : " /vllm-workspace/tests"
432
451
num_gpus : 2
433
452
source_file_dependencies :
0 commit comments