@@ -41,86 +41,31 @@ wait_for_server() {
41
41
}
42
42
43
43
44
- main () {
45
-
46
- (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
47
- (which jq) || (apt-get -y install jq)
48
- (which socat) || (apt-get -y install socat)
44
+ benchmark () {
49
45
50
- cd " $( dirname " $0 " ) "
46
+ # compare chunked prefill with disaggregated prefill
51
47
52
- cd ..
53
- # create sonnet-4x.txt
54
- echo " " > sonnet_4x.txt
55
- for _ in {1..4}
56
- do
57
- cat sonnet.txt >> sonnet_4x.txt
58
- done
59
- cd disagg_benchmarks
60
-
61
-
62
- mkdir -p results
63
48
results_folder=" ./results"
64
49
model=" neuralmagic/Meta-Llama-3-70B-Instruct-FP8"
65
50
dataset_name=" sonnet"
66
51
dataset_path=" ../sonnet_4x.txt"
67
52
num_prompts=500
68
- qps=4
53
+ qps=$1
69
54
prefix_len=64
70
55
input_len=2048
71
- output_len=11
72
-
73
-
74
- # chunked prefill with tp=8
75
- python3 -m vllm.entrypoints.openai.api_server \
76
- --model $model \
77
- --port 8000 \
78
- -tp 8 \
79
- --disable-log-stats \
80
- --disable-log-requests \
81
- --enable-chunked-prefill &
82
- wait_for_server 8000
83
-
84
- python3 ../benchmark_serving.py \
85
- --backend vllm \
86
- --model $model \
87
- --dataset-name $dataset_name \
88
- --dataset-path $dataset_path \
89
- --sonnet-input-len $input_len \
90
- --sonnet-output-len $output_len \
91
- --sonnet-prefix-len $prefix_len \
92
- --num-prompts $num_prompts \
93
- --port 8000 \
94
- --save-result \
95
- --result-dir $results_folder \
96
- --result-filename chunked_prefill_tp8.json \
97
- --request-rate $qps
98
- kill_gpu_processes
56
+ output_len=$2
99
57
100
58
101
59
# chunked prefill with tp=4
102
60
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \
103
61
-m vllm.entrypoints.openai.api_server \
104
62
--model $model \
105
- --port 8100 \
63
+ --port 8000 \
106
64
-tp 4 \
107
65
--disable-log-stats \
108
66
--disable-log-requests \
109
67
--enable-chunked-prefill &
110
-
111
- # CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \
112
- # -m vllm.entrypoints.openai.api_server \
113
- # --model $model \
114
- # --port 8200 \
115
- # -tp 4 \
116
- # --disable-log-stats \
117
- # --disable-log-requests \
118
- # --enable-chunked-prefill &
119
-
120
- wait_for_server 8100
121
- # wait_for_server 8200
122
- # # launch round robin proxy
123
- # bash ./round_robin_proxy.sh &
68
+ wait_for_server 8000
124
69
125
70
python3 ../benchmark_serving.py \
126
71
--backend vllm \
@@ -131,17 +76,15 @@ main() {
131
76
--sonnet-output-len $output_len \
132
77
--sonnet-prefix-len $prefix_len \
133
78
--num-prompts $(( num_prompts / 2 )) \
134
- --port 8100 \
79
+ --port 8000 \
135
80
--save-result \
136
81
--result-dir $results_folder \
137
82
--result-filename chunked_prefill_tp4.json \
138
83
--request-rate $(( qps / 2 ))
139
84
kill_gpu_processes
140
- # pkill -f round_robin_proxy.sh
141
85
142
86
143
87
# disaggregated prefill
144
-
145
88
# prefill with tp=4
146
89
python3 -m vllm.entrypoints.openai.api_server \
147
90
--model $model \
@@ -150,7 +93,6 @@ main() {
150
93
--disable-log-stats \
151
94
--disable-log-requests &
152
95
wait_for_server 8000
153
-
154
96
# set output-len to 1 so that it only do prefilling
155
97
python3 ../benchmark_serving.py \
156
98
--backend vllm \
@@ -177,7 +119,6 @@ main() {
177
119
--disable-log-stats \
178
120
--disable-log-requests &
179
121
wait_for_server 8000
180
-
181
122
# skip prefilling
182
123
# by enabling APC and force the input tokens be the same
183
124
python3 ../benchmark_serving.py \
@@ -187,7 +128,7 @@ main() {
187
128
--dataset-path $dataset_path \
188
129
--sonnet-input-len $input_len \
189
130
--sonnet-output-len $output_len \
190
- --sonnet-prefix-len $(( input_len - 1 )) \
131
+ --sonnet-prefix-len $input_len \
191
132
--num-prompts $num_prompts \
192
133
--port 8000 \
193
134
--save-result \
@@ -196,7 +137,48 @@ main() {
196
137
--request-rate $qps
197
138
kill_gpu_processes
198
139
140
+ python3 analyze_results.py \
141
+ --results-folder $results_folder \
142
+ --output-len $output_len \
143
+ --qps $qps
144
+
145
+ }
146
+
147
+
148
+ main () {
149
+
150
+ (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
151
+ (which jq) || (apt-get -y install jq)
152
+ (which socat) || (apt-get -y install socat)
153
+
154
+ cd " $( dirname " $0 " ) "
155
+
156
+ cd ..
157
+ # create sonnet-4x.txt
158
+ echo " " > sonnet_4x.txt
159
+ for _ in {1..4}
160
+ do
161
+ cat sonnet.txt >> sonnet_4x.txt
162
+ done
163
+ cd disagg_benchmarks
164
+
165
+ rm -rf results
166
+ mkdir results
167
+
168
+ default_qps=4
169
+ default_output_len=12
170
+
171
+ for target_qps in 1 2 4 8 16
172
+ do
173
+ benchmark $target_qps $default_output_len
174
+ done
175
+
176
+ for output_len in 5 10 20 40 80
177
+ do
178
+ benchmark $default_qps $output_len
179
+ done
180
+
199
181
}
200
182
201
183
202
- main " $@ "
184
+ main " $@ "
0 commit comments