Commit a0bba9b

Merge branch 'vllm-project:main' into main
2 parents 58f3116 + 22f5851

458 files changed: 23557 additions & 10612 deletions
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.764
+  - name: "exact_match,flexible-extract"
+    value: 0.764
+limit: 250
+num_fewshot: 5
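The harness consumes this YAML as a plain dict. For reference, a minimal sketch of how such a config parses (assuming PyYAML is installed; the path is the file added above):

import yaml

CONFIG = (".buildkite/lm-eval-harness/configs/"
          "Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml")

with open(CONFIG) as f:
    eval_config = yaml.safe_load(f)

# Each task carries a list of metric name/value pairs used as ground truth.
print(eval_config["model_name"])
for task in eval_config["tasks"]:
    for metric in task["metrics"]:
        print(task["name"], metric["name"], metric["value"])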

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 6 additions & 1 deletion
@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
     results = launch_lm_eval(eval_config)

     # Confirm scores match ground truth.
+    success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
+
+    # Assert at the end, print all scores even on failure for debugging.
+    assert success
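The change above swaps a per-metric assert for a single final assert so that every score is printed before the test fails. A self-contained sketch of the same pattern (hypothetical metric names and values; RTOL stands in for the tolerance defined in the real test module):

import numpy

RTOL = 0.02  # hypothetical tolerance; the real value lives in the test module

ground_truths = {"gsm8k/exact_match,strict-match": 0.764}
measured = {"gsm8k/exact_match,strict-match": 0.7601}

success = True
for key, expected in ground_truths.items():
    got = measured[key]
    print(f"{key}: ground_truth={expected} | measured={got}")
    # Accumulate instead of asserting inline, so every score prints first.
    success = success and numpy.isclose(expected, got, rtol=RTOL)

assert success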

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 1 addition & 2 deletions
@@ -8,8 +8,7 @@ steps:
       containers:
       - image: badouralix/curl-jq
         command:
-        - sh
-        - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+        - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
   - wait
   - label: "A100"
     agents:

.buildkite/nightly-benchmarks/scripts/wait-for-image.sh

Lines changed: 3 additions & 1 deletion
@@ -2,9 +2,11 @@
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
 URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

+TIMEOUT_SECONDS=10
+
 retries=0
 while [ $retries -lt 1000 ]; do
-    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+    if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
         exit 0
     fi
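The added --max-time bounds each curl attempt so a hung request fails that iteration instead of stalling the whole wait loop. An equivalent sketch in Python (requests assumed installed; the URL digest, token, and sleep interval are hypothetical — the script's own sleep is outside this hunk):

import time
import requests

URL = "https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/abc123"  # hypothetical commit digest
TOKEN = "..."  # elided; fetched from the ECR token endpoint as in the script
TIMEOUT_SECONDS = 10

for attempt in range(1000):
    try:
        # Per-request timeout mirrors curl's --max-time.
        resp = requests.get(URL,
                            headers={"Authorization": f"Bearer {TOKEN}"},
                            timeout=TIMEOUT_SECONDS)
        if resp.status_code == 200:
            break
    except requests.RequestException:
        pass  # timed-out or failed attempt; retry
    time.sleep(5)  # hypothetical backoff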

.buildkite/release-pipeline.yaml

Lines changed: 3 additions & 2 deletions
@@ -8,8 +8,9 @@ steps:
   - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
   # rename the files to change linux -> manylinux1
   - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-  - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-  - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+  - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+  - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+  - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
 env:
   DOCKER_BUILDKIT: "1"
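The release step now uploads a single wheel under a fixed name rather than syncing the whole dist directory. For reference, a Python sketch equivalent to the bash rename loop above (${f/linux/manylinux1} replaces the first occurrence, hence the count argument; the directory name is taken from the step):

from pathlib import Path

# Rename *.whl files so the platform tag reads manylinux1 instead of linux,
# mirroring the bash substitution ${f/linux/manylinux1}.
for wheel in Path("artifacts/dist").glob("*.whl"):
    wheel.rename(wheel.with_name(wheel.name.replace("linux", "manylinux1", 1)))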

.buildkite/run-amd-test.sh

Lines changed: 11 additions & 0 deletions
@@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_encoder_decoder_attn.py \
   --ignore=kernels/test_flash_attn.py \
   --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_gguf.py \
   --ignore=kernels/test_int8_quant.py \
   --ignore=kernels/test_machete_gemm.py \
   --ignore=kernels/test_mamba_ssm.py \
@@ -93,6 +94,16 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_sampler.py"
 fi

+#ignore certain Entrypoints tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_accuracy.py \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
+fi
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
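The new block rewrites the test command string in place: bash's ${commands//pattern/replacement} replaces every occurrence of the bare suite path with the path plus --ignore flags. The same idea in Python, with a hypothetical input string standing in for what the CI wrapper supplies:

commands = "pytest -v -s entrypoints/openai "  # hypothetical; supplied by the CI wrapper

ignores = (
    " entrypoints/openai "
    "--ignore=entrypoints/openai/test_accuracy.py "
    "--ignore=entrypoints/openai/test_audio.py "
    "--ignore=entrypoints/openai/test_encoder_decoder.py "
    "--ignore=entrypoints/openai/test_embedding.py "
    "--ignore=entrypoints/openai/test_oot_registration.py "
)

if " entrypoints/openai " in commands:
    # Equivalent of bash ${commands//pattern/replacement}:
    # str.replace substitutes every occurrence.
    commands = commands.replace(" entrypoints/openai ", ignores)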

.buildkite/run-xpu-test.sh

Lines changed: 1 addition & 1 deletion
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py

.buildkite/test-pipeline.yaml

Lines changed: 77 additions & 34 deletions
@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -39,17 +40,19 @@ steps:
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/dev/sampling_params.html

-- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
   source_file_dependencies:
   - vllm/
+  - tests/mq_llm_engine
   - tests/async_engine
   - tests/test_inputs
   - tests/multimodal
   - tests/test_utils
   - tests/worker
   commands:
-  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
@@ -61,14 +64,22 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/
-  - tests/basic_correctness
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
   commands:
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+- label: Chunked Prefill Test
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_chunked_prefill
+  commands:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
+
 - label: Core Test # 10min
   mirror_hardwares: [amd]
   fast_check: true
@@ -79,18 +90,22 @@ steps:
   commands:
   - pytest -v -s core

-- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
   - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
-  - pytest -v -s entrypoints/openai
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -145,7 +160,7 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization

-- label: Examples Test # 12min
+- label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -163,22 +178,15 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py

-- label: torch compile integration test
-  source_file_dependencies:
-  - vllm/
-  commands:
-  - pytest -v -s ./compile/test_full_graph.py
-  - pytest -v -s ./compile/test_wrapper.py
-
-- label: Prefix Caching Test # 7min
+- label: Prefix Caching Test # 9min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
   - pytest -v -s prefix_caching

-- label: Samplers Test # 18min
+- label: Samplers Test # 36min
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -194,25 +202,38 @@ steps:
   - tests/test_logits_processor
   command: pytest -v -s test_logits_processor.py

-- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
   commands:
-  # See https://github.com/vllm-project/vllm/issues/5152
-  - export VLLM_ATTENTION_BACKEND=XFORMERS
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
   - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

-- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4

-- label: Kernels Test %N # 30min each
+- label: "PyTorch Fullgraph Smoke Test" # 9min
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph_smoke.py
+
+- label: "PyTorch Fullgraph Test" # 18min
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph.py
+
+- label: Kernels Test %N # 1h each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
@@ -242,7 +263,7 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh

-- label: Quantization Test # 15min
+- label: Quantization Test # 33min
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
@@ -259,6 +280,13 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1

+- label: Encoder Decoder tests # 5min
+  source_file_dependencies:
+  - vllm/
+  - tests/encoder_decoder
+  commands:
+  - pytest -v -s encoder_decoder
+
 - label: OpenAI-Compatible Tool Use # 20 min
   fast_check: false
   mirror_hardwares: [ amd ]
@@ -279,15 +307,15 @@ steps:
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

-- label: Decoder-only Language Models Test # 1h3min
+- label: Decoder-only Language Models Test # 1h36min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   commands:
   - pytest -v -s models/decoder_only/language

-- label: Decoder-only Multi-Modal Models Test # 56min
+- label: Decoder-only Multi-Modal Models Test # 1h31min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -297,15 +325,25 @@ steps:
   - pytest -v -s models/decoder_only/audio_language
   - pytest -v -s models/decoder_only/vision_language

-- label: Other Models Test # 5min
+- label: Other Models Test # 6min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
+  - tests/models/encoder_decoder/vision_language
   commands:
   - pytest -v -s models/embedding/language
   - pytest -v -s models/encoder_decoder/language
+  - pytest -v -s models/encoder_decoder/vision_language
+
+- label: Custom Models Test
+  #mirror_hardwares: [amd]
+  optional: true
+  commands:
+  # PR authors can temporarily add commands below to test individual models
+  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -338,7 +376,7 @@ steps:
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

-- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -348,18 +386,23 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
+  - vllm/compilation
   commands:
+  - pytest -v -s ./compile/test_full_graph_multi_gpu.py
+  - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

-- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -377,7 +420,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py

-- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -403,7 +446,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py

-- label: Weight Loading Multiple GPU Test
+- label: Weight Loading Multiple GPU Test # 33min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -436,7 +479,7 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
-  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
   - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional
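The file's header comments (including the new optional(bool) field) describe the step schema informally, notably that command(str) and commands(list) are mutually exclusive. Not part of the commit, but a minimal validator sketch for that documented constraint (assumes PyYAML and that the file keeps its steps: top-level list):

import yaml

def validate_step(step: dict) -> None:
    """Check a step against the constraints documented in the file header."""
    # command(str) is incompatible with commands(list).
    if "command" in step and "commands" in step:
        raise ValueError(
            f"step {step.get('label')!r} sets both command and commands")

with open(".buildkite/test-pipeline.yaml") as f:
    pipeline = yaml.safe_load(f)

for step in pipeline["steps"]:
    validate_step(step)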
