Commit 5e80285

Merge branch 'main' into separate-total-output-tokens-throughput
2 parents b47ff92 + 2838d6b commit 5e80285

File tree: 178 files changed (+6135 / -3674 lines)
.buildkite/release-pipeline.yaml

Lines changed: 3 additions & 2 deletions
@@ -8,8 +8,9 @@ steps:
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     # rename the files to change linux -> manylinux1
     - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-    - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-    - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+    - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+    - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+    - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
   env:
     DOCKER_BUILDKIT: "1"
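
Note on the wheel-handling steps: the $$ in the Buildkite YAML is escaping, so the agent's shell sees a single $ and the rename is plain bash parameter substitution. A rough local illustration of what these steps execute (paths relative to the checkout; not an exact transcript of the CI job):

# Retag the wheel platform: linux -> manylinux1 ($$ in the YAML becomes $ at run time).
for f in artifacts/dist/*.whl; do
  mv -- "$f" "${f/linux/manylinux1}"
done
# New in this commit: pin a stable filename so the nightly S3 key never changes.
mv artifacts/dist/$(ls artifacts/dist) \
   artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl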

.buildkite/run-xpu-test.sh

Lines changed: 1 addition & 1 deletion
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
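
This change pairs with the Dockerfile.xpu update further down, which gives the image an ENTRYPOINT that launches the OpenAI-compatible server; clearing the entrypoint lets the CI script run the offline-inference example instead. A minimal sketch of the difference, assuming a locally built xpu-test image:

# With the image's default ENTRYPOINT, the container starts the API server.
docker run --rm xpu-test
# Clearing the entrypoint makes the positional args the command to run.
docker run --rm --entrypoint="" xpu-test python3 examples/offline_inference.py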

.buildkite/test-pipeline.yaml

Lines changed: 43 additions & 24 deletions
@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -39,7 +40,7 @@ steps:
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/dev/sampling_params.html

-- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -63,13 +64,21 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/
-  - tests/basic_correctness
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
   commands:
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+- label: Chunked Prefill Test
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_chunked_prefill
+  commands:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Core Test # 10min
   mirror_hardwares: [amd]
@@ -81,7 +90,7 @@ steps:
   commands:
   - pytest -v -s core

-- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   mirror_hardwares: [amd]
@@ -151,7 +160,7 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization

-- label: Examples Test # 12min
+- label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -169,15 +178,15 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py

-- label: Prefix Caching Test # 7min
+- label: Prefix Caching Test # 9min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
   - pytest -v -s prefix_caching

-- label: Samplers Test # 18min
+- label: Samplers Test # 36min
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -193,40 +202,38 @@ steps:
   - tests/test_logits_processor
   command: pytest -v -s test_logits_processor.py

-- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
   commands:
-  # See https://github.com/vllm-project/vllm/issues/5152
-  - export VLLM_ATTENTION_BACKEND=XFORMERS
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-  - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

-- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4

-- label: "PyTorch Fullgraph Smoke Test"
+- label: "PyTorch Fullgraph Smoke Test" # 9min
   fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph_smoke.py

-- label: "PyTorch Fullgraph Test"
+- label: "PyTorch Fullgraph Test" # 18min
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 30min each
+- label: Kernels Test %N # 1h each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
@@ -256,7 +263,7 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh

-- label: Quantization Test # 15min
+- label: Quantization Test # 33min
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
@@ -300,15 +307,15 @@ steps:
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

-- label: Decoder-only Language Models Test # 1h3min
+- label: Decoder-only Language Models Test # 1h36min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   commands:
   - pytest -v -s models/decoder_only/language

-- label: Decoder-only Multi-Modal Models Test # 56min
+- label: Decoder-only Multi-Modal Models Test # 1h31min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -318,15 +325,25 @@ steps:
   - pytest -v -s models/decoder_only/audio_language
   - pytest -v -s models/decoder_only/vision_language

-- label: Other Models Test # 5min
+- label: Other Models Test # 6min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
+  - tests/models/encoder_decoder/vision_language
   commands:
   - pytest -v -s models/embedding/language
   - pytest -v -s models/encoder_decoder/language
+  - pytest -v -s models/encoder_decoder/vision_language
+
+- label: Custom Models Test
+  #mirror_hardwares: [amd]
+  optional: true
+  commands:
+  # PR authors can temporarily add commands below to test individual models
+  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -359,7 +376,7 @@ steps:
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

-- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -376,14 +393,16 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

-- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -401,7 +420,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py

-- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -427,7 +446,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py

-- label: Weight Loading Multiple GPU Test
+- label: Weight Loading Multiple GPU Test # 33min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
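
One detail worth calling out from the LoRA step above: parallelism: 4 makes Buildkite launch four shards, and the doubled $$ keeps the variables from being interpolated when the pipeline is uploaded, so they resolve on the agent. A hedged sketch of the equivalent single-shard invocation (assuming a dev checkout with test dependencies installed; the exported values are placeholders that Buildkite normally provides):

cd tests
export BUILDKITE_PARALLEL_JOB=0        # shard index, 0..3 when parallelism is 4
export BUILDKITE_PARALLEL_JOB_COUNT=4  # total number of shards
pytest -v -s lora --shard-id=$BUILDKITE_PARALLEL_JOB \
  --num-shards=$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py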

.dockerignore

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,6 @@
-vllm/*.so
+/.github/
 /.venv
 /build
 dist
+Dockerfile*
+vllm/*.so

.github/CODEOWNERS

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+# See https://help.github.com/articles/about-codeowners/
+# for more info about CODEOWNERS file
+
+/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/test_inputs.py @DarkLight1337 @ywang96
+/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/models @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96
+/tests/prefix_caching @comaniac @KuntaiDu
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
+/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/quantization @mgoin @robertgshaw2-neuralmagic
+/.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/distributed/test_multi_node_assignment.py @youkaichao
+/tests/distributed/test_pipeline_parallel.py @youkaichao
+/tests/distributed/test_same_node.py @youkaichao
+/tests/multi_step @alexm-neuralmagic @SolitaryThinker @comaniac
+/tests/weight_loading @mgoin @youkaichao
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+/.deps/

 # PyInstaller
 # Usually these files are written by a python script from a template

CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -166,7 +166,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+
+#
+# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
+# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
+#
 include(FetchContent)
+get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
+set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
+message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

 #
 # Define other extension targets
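
The practical effect, sketched below under the assumption of a normal from-source build: FetchContent downloads and builds now land in a .deps/ directory at the repo root (hence the new /.deps/ entry in .gitignore), giving sccache stable paths to reuse across rebuilds.

# Hypothetical check after a source build; the exact subdirectories depend on
# which dependencies the build fetches.
pip install -e .
ls .deps/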

Dockerfile.xpu

Lines changed: 38 additions & 9 deletions
@@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
@@ -7,20 +7,49 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
     chmod 644 /usr/share/keyrings/intel-graphics.gpg

-RUN apt-get update -y && \
-    apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
-
-COPY ./ /workspace/vllm
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+    curl \
+    ffmpeg \
+    git \
+    libsndfile1 \
+    libsm6 \
+    libxext6 \
+    libgl1 \
+    lsb-release \
+    numactl \
+    python3 \
+    python3-dev \
+    python3-pip \
+    # vim \
+    wget

 WORKDIR /workspace/vllm
+COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
+COPY requirements-common.txt /workspace/vllm/requirements-common.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-    cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-    -r requirements-xpu.txt
+    pip install --no-cache-dir \
+    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
+    -r requirements-xpu.txt
+
+COPY ./ /workspace/vllm
+
+ENV VLLM_TARGET_DEVICE=xpu

 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=xpu python3 setup.py install
+    python3 setup.py install

 CMD ["/bin/bash"]
+
+FROM vllm-base AS vllm-openai
+
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+
+ENV VLLM_USAGE_SOURCE production-docker-image \
+    TRITON_XPU_PROFILE 1
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
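
A minimal sketch of how the two new stages might be used, assuming an XPU-capable host; the image tag and model name are placeholders, and only the build target and entrypoint behaviour come from the Dockerfile above:

# Build the serving image (vllm-openai is the final stage, so --target is optional).
docker build -f Dockerfile.xpu --target vllm-openai -t vllm-xpu-openai .
# The ENTRYPOINT starts the OpenAI-compatible server; extra args are passed to it.
docker run --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path \
  -p 8000:8000 vllm-xpu-openai --model facebook/opt-125m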
