Commit 0b4b062

Merge remote-tracking branch 'upstream/main' into fix-enc-dec-seed
2 parents 19a6439 + 3d49776

149 files changed: +3341 additions, -1228 deletions

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.764
+  - name: "exact_match,flexible-extract"
+    value: 0.764
+limit: 250
+num_fewshot: 5
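
For orientation, here is a minimal sketch of how a config of this shape is consumed by the lm-eval correctness check touched later in this commit: the YAML is parsed, and each task's metric values serve as ground truth to compare against measured scores within a relative tolerance. The RTOL value and the stubbed results dict below are assumptions for illustration, not code from this commit.

    # Illustrative sketch only: RTOL and the stubbed `results` layout are assumed,
    # not taken from this commit.
    import numpy

    RTOL = 0.02  # assumed relative tolerance

    # Shape of the config above, after yaml.safe_load:
    eval_config = {
        "model_name": "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test",
        "tasks": [{"name": "gsm8k",
                   "metrics": [{"name": "exact_match,strict-match", "value": 0.764},
                               {"name": "exact_match,flexible-extract", "value": 0.764}]}],
        "limit": 250,
        "num_fewshot": 5,
    }

    # In the real test these numbers come from running lm-eval against vLLM;
    # here they are stubbed.
    results = {"results": {"gsm8k": {"exact_match,strict-match": 0.76,
                                     "exact_match,flexible-extract": 0.77}}}

    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            assert numpy.isclose(ground_truth, measured, rtol=RTOL)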

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 6 additions & 1 deletion
@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
     results = launch_lm_eval(eval_config)

     # Confirm scores match ground truth.
+    success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
+
+    # Assert at the end, print all scores even on failure for debugging.
+    assert success
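
With this change a metric that misses the tolerance no longer aborts the loop: success goes (and stays) false while every remaining score is still printed, and the single assert fires only at the end. A further hedged variant, not part of this commit, would record which comparisons failed so the assertion message names them:

    # Hypothetical variant (not in this commit): collect the offending metrics so
    # the final assert reports them; reuses eval_config, results, numpy and RTOL
    # from the surrounding test.
    failures = []
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            if not numpy.isclose(ground_truth, measured_value, rtol=RTOL):
                failures.append(f'{task["name"]}/{metric["name"]}')

    assert not failures, f"metrics outside tolerance: {failures}"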

.buildkite/release-pipeline.yaml

Lines changed: 3 additions & 2 deletions
@@ -8,8 +8,9 @@ steps:
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       # rename the files to change linux -> manylinux1
       - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+      - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
     env:
       DOCKER_BUILDKIT: "1"
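
The change above gives the wheel a fixed, commit-independent file name before upload, so the per-commit and nightly S3 object paths stay stable. As an illustration only (the pipeline itself runs the Buildkite shell commands shown), the Python sketch below mirrors what the two renaming steps do: the Bash substitution $${f/linux/manylinux1} rewrites the platform tag of each wheel, and the single resulting wheel is then moved to the fixed name.

    # Illustration only, not pipeline code: what the two renaming commands do.
    from pathlib import Path

    dist = Path("artifacts/dist")

    # for f in artifacts/dist/*.whl; do mv "$f" "${f/linux/manylinux1}"; done
    for wheel in dist.glob("*.whl"):
        wheel.rename(wheel.with_name(wheel.name.replace("linux", "manylinux1", 1)))

    # mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
    # (the build produces exactly one wheel, so it can be given a constant name)
    (only_wheel,) = tuple(dist.glob("*.whl"))
    only_wheel.rename(dist / "vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl")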

.buildkite/run-xpu-test.sh

Lines changed: 1 addition & 1 deletion
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py

.buildkite/test-pipeline.yaml

Lines changed: 35 additions & 21 deletions
@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -39,7 +40,7 @@ steps:
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/dev/sampling_params.html

-- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -81,7 +82,7 @@ steps:
   commands:
   - pytest -v -s core

-- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   mirror_hardwares: [amd]
@@ -95,7 +96,8 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -150,7 +152,7 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization

-- label: Examples Test # 12min
+- label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -168,15 +170,15 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py

-- label: Prefix Caching Test # 7min
+- label: Prefix Caching Test # 9min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
   - pytest -v -s prefix_caching

-- label: Samplers Test # 18min
+- label: Samplers Test # 36min
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -192,7 +194,7 @@ steps:
   - tests/test_logits_processor
   command: pytest -v -s test_logits_processor.py

-- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
@@ -202,30 +204,30 @@ steps:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
   - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

-- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4

-- label: "PyTorch Fullgraph Smoke Test"
+- label: "PyTorch Fullgraph Smoke Test" # 9min
   fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph_smoke.py

-- label: "PyTorch Fullgraph Test"
+- label: "PyTorch Fullgraph Test" # 18min
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 30min each
+- label: Kernels Test %N # 1h each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
@@ -255,7 +257,7 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh

-- label: Quantization Test # 15min
+- label: Quantization Test # 33min
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
@@ -299,15 +301,15 @@ steps:
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

-- label: Decoder-only Language Models Test # 1h3min
+- label: Decoder-only Language Models Test # 1h36min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   commands:
   - pytest -v -s models/decoder_only/language

-- label: Decoder-only Multi-Modal Models Test # 56min
+- label: Decoder-only Multi-Modal Models Test # 1h31min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -317,15 +319,25 @@ steps:
   - pytest -v -s models/decoder_only/audio_language
   - pytest -v -s models/decoder_only/vision_language

-- label: Other Models Test # 5min
+- label: Other Models Test # 6min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
+  - tests/models/encoder_decoder/vision_language
   commands:
   - pytest -v -s models/embedding/language
   - pytest -v -s models/encoder_decoder/language
+  - pytest -v -s models/encoder_decoder/vision_language
+
+- label: Custom Models Test
+  #mirror_hardwares: [amd]
+  optional: true
+  commands:
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -358,7 +370,7 @@ steps:
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

-- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -375,14 +387,16 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

-- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -400,7 +414,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py

-- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -426,7 +440,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py

-- label: Weight Loading Multiple GPU Test
+- label: Weight Loading Multiple GPU Test # 33min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -459,7 +473,7 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
-  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
   - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional
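
The step fields documented at the top of this file (and extended here with the optional flag) imply a few invariants that are easy to check when editing the pipeline. The sketch below is hypothetical tooling, not part of vLLM or this commit; it only assumes the file has a top-level steps list as shown in the hunks above.

    # Hypothetical helper, not vLLM code: sanity-check step definitions against the
    # documented schema (command xor commands; optional steps never run by default).
    import yaml

    with open(".buildkite/test-pipeline.yaml") as f:
        pipeline = yaml.safe_load(f)

    for step in pipeline["steps"]:
        label = step.get("label", "<unnamed>")
        # command(str) and commands(list) are documented as incompatible.
        assert not ("command" in step and "commands" in step), \
            f"{label}: use either command or commands, not both"
        if step.get("optional"):
            # optional(bool): the step must be unblocked manually to run.
            print(f"{label}: optional, skipped unless unblocked")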

.dockerignore

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,6 @@
-vllm/*.so
+/.github/
 /.venv
 /build
 dist
+Dockerfile*
+vllm/*.so

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+/.deps/

 # PyInstaller
 # Usually these files are written by a python script from a template

CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -166,7 +166,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+
+#
+# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
+# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
+#
 include(FetchContent)
+get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
+set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
+message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

 #
 # Define other extension targets

Dockerfile.xpu

Lines changed: 38 additions & 9 deletions
@@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
@@ -7,20 +7,49 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
     chmod 644 /usr/share/keyrings/intel-graphics.gpg

-RUN apt-get update -y && \
-    apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
-
-COPY ./ /workspace/vllm
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+        curl \
+        ffmpeg \
+        git \
+        libsndfile1 \
+        libsm6 \
+        libxext6 \
+        libgl1 \
+        lsb-release \
+        numactl \
+        python3 \
+        python3-dev \
+        python3-pip \
+        # vim \
+        wget

 WORKDIR /workspace/vllm
+COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
+COPY requirements-common.txt /workspace/vllm/requirements-common.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-    cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-    -r requirements-xpu.txt
+    pip install --no-cache-dir \
+    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
+    -r requirements-xpu.txt
+
+COPY ./ /workspace/vllm
+
+ENV VLLM_TARGET_DEVICE=xpu

 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=xpu python3 setup.py install
+    python3 setup.py install

 CMD ["/bin/bash"]
+
+FROM vllm-base AS vllm-openai
+
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+
+ENV VLLM_USAGE_SOURCE production-docker-image \
+    TRITON_XPU_PROFILE 1
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
