Skip to content

Commit 004b0c0

Browse files
committed
Merge remote-tracking branch 'upstream/main' into fuyu
2 parents bf89017 + eeceada commit 004b0c0

File tree

208 files changed

+9478
-2362
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

208 files changed

+9478
-2362
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
2+
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
3+
tasks:
4+
- name: "gsm8k"
5+
metrics:
6+
- name: "exact_match,strict-match"
7+
value: 0.593
8+
- name: "exact_match,flexible-extract"
9+
value: 0.588
10+
limit: 1000
11+
num_fewshot: 5
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
2+
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
3+
tasks:
4+
- name: "gsm8k"
5+
metrics:
6+
- name: "exact_match,strict-match"
7+
value: 0.595
8+
- name: "exact_match,flexible-extract"
9+
value: 0.582
10+
limit: 1000
11+
num_fewshot: 5

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ Meta-Llama-3-8B-Instruct.yaml
22
Meta-Llama-3-8B-Instruct-FP8.yaml
33
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
44
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
5+
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml

.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# We use this for fp8, which HF does not support.
44
#
55
# Make sure you have lm-eval-harness installed:
6-
# pip install lm-eval==0.4.2
6+
# pip install lm-eval==0.4.3
77

88
usage() {
99
echo``
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
4646
done
4747

4848
lm_eval --model vllm \
49-
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true \
49+
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray" \
5050
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
5151
--batch_size $BATCH_SIZE

.buildkite/nightly-benchmarks/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# vLLM benchmark suite
22

3+
34
## Introduction
45

56
This directory contains the performance benchmarking CI for vllm.

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ steps:
1111
- sh
1212
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
1313
- wait
14-
- label: "A100 Benchmark"
14+
- label: "A100"
1515
agents:
1616
queue: A100
1717
plugins:
@@ -42,21 +42,20 @@ steps:
4242
- name: devshm
4343
emptyDir:
4444
medium: Memory
45-
# - label: "H100: NVIDIA SMI"
46-
# agents:
47-
# queue: H100
48-
# plugins:
49-
# - docker#v5.11.0:
50-
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
51-
# command:
52-
# - bash
53-
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
54-
# mount-buildkite-agent: true
55-
# propagate-environment: true
56-
# propagate-uid-gid: false
57-
# ipc: host
58-
# gpus: all
59-
# environment:
60-
# - VLLM_USAGE_SOURCE
61-
# - HF_TOKEN
45+
- label: "H100"
46+
agents:
47+
queue: H100
48+
plugins:
49+
- docker#v5.11.0:
50+
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
51+
command:
52+
- bash
53+
- .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
54+
mount-buildkite-agent: true
55+
propagate-environment: true
56+
ipc: host
57+
gpus: all
58+
environment:
59+
- VLLM_USAGE_SOURCE
60+
- HF_TOKEN
6261

.buildkite/nightly-benchmarks/kickoff-pipeline.sh

Lines changed: 0 additions & 27 deletions
This file was deleted.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
2+
# Nightly benchmark
3+
4+
The main goal of this benchmarking is two-fold:
5+
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance under which workload.
6+
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions in [reproduce.md]().
7+
8+
9+
## Docker images
10+
11+
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
12+
- vllm/vllm-openai:v0.5.0.post1
13+
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
14+
- openmmlab/lmdeploy:v0.5.0
15+
- ghcr.io/huggingface/text-generation-inference:2.1
16+
17+
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
18+
19+
20+
## Hardware
21+
22+
One AWS node with 8x NVIDIA A100 GPUs.
23+
24+
25+
## Workload description
26+
27+
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
28+
29+
- Input length: randomly sample 500 prompts from the ShareGPT dataset (with fixed random seed).
30+
- Output length: the corresponding output length of these 500 prompts.
31+
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
32+
- Average QPS (queries per second): 4 for the small model (llama-3 8B) and 2 for the other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
33+
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
34+
35+
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
36+
37+
## Plots
38+
39+
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
40+
41+
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
42+
43+
## Results
44+
45+
{nightly_results_benchmarking_table}
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
common_pod_spec: &common_pod_spec
2+
priorityClassName: perf-benchmark
3+
nodeSelector:
4+
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
5+
volumes:
6+
- name: devshm
7+
emptyDir:
8+
medium: Memory
9+
- name: hf-cache
10+
hostPath:
11+
path: /root/.cache/huggingface
12+
type: Directory
13+
14+
common_container_settings: &common_container_settings
15+
command:
16+
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
17+
resources:
18+
limits:
19+
nvidia.com/gpu: 8
20+
volumeMounts:
21+
- name: devshm
22+
mountPath: /dev/shm
23+
- name: hf-cache
24+
mountPath: /root/.cache/huggingface
25+
env:
26+
- name: VLLM_USAGE_SOURCE
27+
value: ci-test
28+
- name: HF_HOME
29+
value: /root/.cache/huggingface
30+
- name: VLLM_SOURCE_CODE_LOC
31+
value: /workspace/build/buildkite/vllm/performance-benchmark
32+
- name: HF_TOKEN
33+
valueFrom:
34+
secretKeyRef:
35+
name: hf-token-secret
36+
key: token
37+
38+
steps:
39+
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
40+
- label: "A100 trt benchmark"
41+
priority: 100
42+
agents:
43+
queue: A100
44+
plugins:
45+
- kubernetes:
46+
podSpec:
47+
<<: *common_pod_spec
48+
containers:
49+
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
50+
<<: *common_container_settings
51+
52+
- label: "A100 lmdeploy benchmark"
53+
priority: 100
54+
agents:
55+
queue: A100
56+
plugins:
57+
- kubernetes:
58+
podSpec:
59+
<<: *common_pod_spec
60+
containers:
61+
- image: openmmlab/lmdeploy:v0.5.0
62+
<<: *common_container_settings
63+
64+
65+
- label: "A100 vllm benchmark"
66+
priority: 100
67+
agents:
68+
queue: A100
69+
plugins:
70+
- kubernetes:
71+
podSpec:
72+
<<: *common_pod_spec
73+
containers:
74+
- image: vllm/vllm-openai:latest
75+
<<: *common_container_settings
76+
77+
- label: "A100 tgi benchmark"
78+
priority: 100
79+
agents:
80+
queue: A100
81+
plugins:
82+
- kubernetes:
83+
podSpec:
84+
<<: *common_pod_spec
85+
containers:
86+
- image: ghcr.io/huggingface/text-generation-inference:2.1
87+
<<: *common_container_settings
88+
89+
- wait
90+
91+
- label: "Plot"
92+
priority: 100
93+
agents:
94+
queue: A100
95+
plugins:
96+
- kubernetes:
97+
podSpec:
98+
<<: *common_pod_spec
99+
containers:
100+
- image: vllm/vllm-openai:v0.5.0.post1
101+
command:
102+
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
103+
resources:
104+
limits:
105+
nvidia.com/gpu: 8
106+
volumeMounts:
107+
- name: devshm
108+
mountPath: /dev/shm
109+
env:
110+
- name: VLLM_USAGE_SOURCE
111+
value: ci-test
112+
- name: VLLM_SOURCE_CODE_LOC
113+
value: /workspace/build/buildkite/vllm/performance-benchmark
114+
- name: HF_TOKEN
115+
valueFrom:
116+
secretKeyRef:
117+
name: hf-token-secret
118+
key: token
119+
120+
- wait

.buildkite/nightly-benchmarks/run-benchmarks-suite.sh

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ wait_for_server() {
5454
# wait for vllm server to start
5555
# return 1 if vllm server crashes
5656
timeout 1200 bash -c '
57-
until curl localhost:8000/v1/completions; do
57+
until curl -X POST localhost:8000/v1/completions; do
5858
sleep 1
5959
done' && return 0 || return 1
6060
}
@@ -73,8 +73,17 @@ kill_gpu_processes() {
7373
echo "All GPU processes have been killed."
7474
fi
7575

76+
# Sometimes killing by pid doesn't work properly; since we are in a container anyway,
# we can also kill all processes running python or python3.
77+
# since we are in container anyway
78+
pkill -9 -f python
79+
pkill -9 -f python3
80+
7681
# waiting for GPU processes to be fully killed
77-
sleep 10
82+
# loop while nvidia-smi returns any processes
83+
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
84+
sleep 1
85+
echo "Waiting for GPU processes to be killed"
86+
done
7887

7988
# remove vllm config file
8089
rm -rf ~/.config/vllm
@@ -90,12 +99,19 @@ upload_to_buildkite() {
9099
# upload the benchmarking results to buildkite
91100

92101
# if the agent binary is not found, skip uploading the results, exit 0
93-
if [ ! -f /workspace/buildkite-agent ]; then
102+
# Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
103+
if command -v buildkite-agent >/dev/null 2>&1; then
104+
BUILDKITE_AGENT_COMMAND="buildkite-agent"
105+
elif [ -f /workspace/buildkite-agent ]; then
106+
BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
107+
else
94108
echo "buildkite-agent binary not found. Skip uploading the results."
95109
return 0
96110
fi
97-
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
98-
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
111+
112+
# Use the determined command to annotate and upload artifacts
113+
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
114+
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
99115
}
100116

101117
run_latency_tests() {
@@ -269,6 +285,7 @@ run_serving_tests() {
269285
echo "Running test case $test_name"
270286
echo "Server command: $server_command"
271287
eval "$server_command" &
288+
server_pid=$!
272289

273290
# wait until the server is alive
274291
wait_for_server
@@ -318,6 +335,7 @@ run_serving_tests() {
318335
done
319336

320337
# clean up
338+
kill -9 $server_pid
321339
kill_gpu_processes
322340
done
323341
}

0 commit comments

Comments
 (0)