Commit 188df15

[Misc] Add v0.7.3 benchmark (#678)
Cherry-pick benchmark script from main

Signed-off-by: wangli <wangli858794774@gmail.com>
1 parent abf1faa commit 188df15

File tree: 6 files changed, +401 -0 lines changed


benchmarks/README.md

Lines changed: 54 additions & 0 deletions

# Introduction

This document outlines the benchmarking process for vllm-ascend, designed to evaluate its performance under various workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance. To maintain consistency with the vllm community, we have reused the vllm community [benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks) scripts.

# Overview

**Benchmarking Coverage**: We measure latency, throughput, and fixed-QPS serving on the Atlas 800I A2 (see the [quick_start](../docs/source/quick_start.md) guide for the list of supported devices), with different models (coming soon).

- Latency tests (an example invocation is sketched after this list)
  - Input length: 32 tokens.
  - Output length: 128 tokens.
  - Batch size: fixed (8).
  - Models: Llama-3.1 8B.
  - Evaluation metrics: end-to-end latency (mean, median, p99).

- Throughput tests
  - Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
  - Output length: the corresponding output length of these 200 prompts.
  - Batch size: dynamically determined by vllm to achieve maximum throughput.
  - Models: Llama-3.1 8B.
  - Evaluation metrics: throughput.

- Serving tests
  - Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
  - Output length: the corresponding output length of these 200 prompts.
  - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
  - **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests arrive at once. For other QPS values, the arrival time of each query is determined by a Poisson process (with a fixed random seed).
  - Models: Llama-3.1 8B.
  - Evaluation metrics: throughput, TTFT (time to first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
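
For orientation, the latency configuration above corresponds roughly to the following invocation of the upstream vllm `benchmark_latency.py` script, which `run-performance-benchmarks.sh` drives from `benchmarks/tests/latency-tests.json`. This is a minimal sketch: the input/output/batch-size flags are spelled out for illustration (the checked-in JSON relies on the script defaults), and flag names may differ across vllm versions.

```
python3 vllm_benchmarks/benchmark_latency.py \
    --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
    --tensor-parallel-size 1 \
    --load-format dummy \
    --input-len 32 \
    --output-len 128 \
    --batch-size 8 \
    --num-iters-warmup 5 \
    --num-iters 15 \
    --output-json results/latency_llama8B_tp1.json
```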

**Benchmarking Duration**: about 800 seconds for a single model.

# Quick Use

## Prerequisites

Before running the benchmarks, ensure the following:

- vllm and vllm-ascend are installed and properly set up in an NPU environment, as these scripts are specifically designed for NPU devices.
- Install the necessary dependencies for the benchmarks:

```
pip install -r benchmarks/requirements-bench.txt
```

- Models and datasets are cached locally to accelerate execution. Modify the paths in the JSON files located in benchmarks/tests accordingly, and feel free to add your own models and parameters to those JSON files to run customized benchmarks. The ShareGPT dataset can also be pre-downloaded, as shown below.
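
If the ShareGPT dataset is not already present, the benchmark script downloads it automatically (see `ensure_sharegpt_downloaded` in `benchmarks/scripts/run-performance-benchmarks.sh`). To cache it ahead of time, it can be fetched manually into the benchmarks directory using the same mirror URL the script uses:

```
cd benchmarks
wget https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```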

## Run benchmarks

The provided scripts automatically execute performance tests for serving, throughput, and latency. To start the benchmarking process, run the following command in the vllm-ascend root directory:

```
bash benchmarks/scripts/run-performance-benchmarks.sh
```
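
The script also honors the `TEST_SELECTOR` environment variable: when it is set, only test cases whose names match the given regular expression are run (see the selector checks in `run-performance-benchmarks.sh`). For example, to run only the serving cases:

```
TEST_SELECTOR="^serving_" bash benchmarks/scripts/run-performance-benchmarks.sh
```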

Once the script completes, you can find the results in the benchmarks/results folder. The output files may resemble the following:

```
|-- latency_llama8B_tp1.json
|-- serving_llama8B_tp1_sharegpt_qps_1.json
|-- serving_llama8B_tp1_sharegpt_qps_16.json
|-- serving_llama8B_tp1_sharegpt_qps_4.json
|-- serving_llama8B_tp1_sharegpt_qps_inf.json
|-- throughput_llama8B_tp1.json
```
These files contain detailed benchmarking results for further analysis.
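
As a quick first pass, the result files can be inspected with jq (installed by the benchmark script if missing). The exact metric fields depend on the upstream vllm benchmark scripts, so listing the keys is a safe starting point:

```
jq 'keys' benchmarks/results/latency_llama8B_tp1.json
```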

benchmarks/requirements-bench.txt

Lines changed: 2 additions & 0 deletions

pandas
datasets

benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 293 additions & 0 deletions

#!/bin/bash

check_npus() {
  # shellcheck disable=SC2155
  declare -g npu_count=$(npu-smi info -l | grep "Total Count" | awk -F ':' '{print $2}' | tr -d ' ')

  if [[ -z "$npu_count" || "$npu_count" -eq 0 ]]; then
    echo "Need at least 1 NPU to run benchmarking."
    exit 1
  else
    echo "Found NPU count: $npu_count"
  fi

  npu_type=$(npu-smi info | grep -E "^\| [0-9]+" | awk -F '|' '{print $2}' | awk '{$1=$1;print}' | awk '{print $2}')

  echo "NPU type is: $npu_type"
}

ensure_sharegpt_downloaded() {
  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
  if [ ! -f "$FILE" ]; then
    echo "$FILE not found, downloading from hf-mirror ..."
    wget https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
  else
    echo "$FILE already exists."
  fi
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced with '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args
  args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

wait_for_server() {
  # wait for the vllm server to start
  # return 1 if the vllm server crashes
  timeout 1200 bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done' && return 0 || return 1
}

get_cur_npu_id() {
  npu-smi info -l | awk -F ':' '/NPU ID/ {print $2+0; exit}'
}

kill_npu_processes() {
  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9

  sleep 4
  rm -rf ~/.config/vllm
}

run_latency_tests() {
  # run latency tests using `benchmark_latency.py`
  # $1: a json file specifying latency test cases

  local latency_test_file
  latency_test_file=$1

  # Iterate over latency tests
  jq -c '.[]' "$latency_test_file" | while read -r params; do
    # get the test name, and append the NPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^latency_ ]]; then
      echo "In latency-test.json, test_name must start with \"latency_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")

    latency_command="python3 vllm_benchmarks/benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"

    echo "Running test case $test_name"
    echo "Latency command: $latency_command"

    # run the benchmark
    eval "$latency_command"

    kill_npu_processes

  done
}

run_throughput_tests() {
  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases

  local throughput_test_file
  throughput_test_file=$1

  # Iterate over throughput tests
  jq -c '.[]' "$throughput_test_file" | while read -r params; do
    # get the test name, and append the NPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^throughput_ ]]; then
      echo "In throughput-test.json, test_name must start with \"throughput_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")

    throughput_command="python3 vllm_benchmarks/benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

    echo "Running test case $test_name"
    echo "Throughput command: $throughput_command"

    # run the benchmark
    eval "$throughput_command"

    kill_npu_processes

  done
}

run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the NPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
      echo "In serving-test.json, test_name must start with \"serving_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if server model and client model are aligned
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    bash -c "$server_command" &
    server_pid=$!

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
      echo ""
      echo "vllm failed to start within the timeout period."
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 vllm_benchmarks/benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      bash -c "$client_command"
    done

    # clean up
    kill -9 $server_pid
    kill_npu_processes
  done
}

cleanup() {
  rm -rf ./vllm_benchmarks
}

get_benchmarks_scripts() {
  git clone -b main --depth=1 git@github.com:vllm-project/vllm.git && \
    mv vllm/benchmarks vllm_benchmarks
  rm -rf ./vllm
}

main() {

  START_TIME=$(date +%s)
  check_npus

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
  (which lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by benchmark_serving.py
  # shellcheck disable=SC2155
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOG_LEVEL="WARNING"

  # set env
  export VLLM_USE_MODELSCOPE="True"
  export HF_ENDPOINT="https://hf-mirror.com"

  # prepare for benchmarking
  cd benchmarks || exit 1
  get_benchmarks_scripts
  trap cleanup EXIT

  QUICK_BENCHMARK_ROOT=./

  declare -g RESULTS_FOLDER=results
  mkdir -p $RESULTS_FOLDER

  ensure_sharegpt_downloaded
  # benchmarks
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

  END_TIME=$(date +%s)
  ELAPSED_TIME=$((END_TIME - START_TIME))
  echo "Total execution time: $ELAPSED_TIME seconds"

}

main "$@"
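
For contributors adding new test parameters, the jq program inside `json2args` above can be exercised directly in a shell to preview how a JSON entry expands into CLI flags. A minimal sketch, assuming only that jq is installed:

```
echo '{"model": "LLM-Research/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy"}' | \
  jq -r 'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")'
# prints: --model LLM-Research/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 1 --load-format dummy
```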

benchmarks/tests/latency-tests.json

Lines changed: 12 additions & 0 deletions

[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]

benchmarks/tests/serving-tests.json

Lines changed: 26 additions & 0 deletions

[
  {
    "test_name": "serving_llama8B_tp1",
    "qps_list": [
      1,
      4,
      16,
      "inf"
    ],
    "server_parameters": {
      "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  }
]
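
For reference, `run_serving_tests` in run-performance-benchmarks.sh expands the entry above into roughly the following server and client commands (a sketch; the actual strings are assembled via `json2args` at run time, and the boolean flags carry empty values in the generated command line):

```
# server (started in the background and health-checked on port 8000)
python3 -m vllm.entrypoints.openai.api_server \
    --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
    --tensor-parallel-size 1 \
    --swap-space 16 \
    --disable-log-stats \
    --disable-log-requests \
    --load-format dummy

# client (repeated for each QPS in qps_list: 1, 4, 16, inf)
python3 vllm_benchmarks/benchmark_serving.py \
    --save-result \
    --result-dir results \
    --result-filename serving_llama8B_tp1_qps_1.json \
    --request-rate 1 \
    --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 200
```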
