24
24
push :
25
25
branches :
26
26
- master
27
- paths : ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
27
+ paths : ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
28
28
pull_request_target :
29
29
types : [opened, synchronize, reopened]
30
- paths : ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
30
+ paths : ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
31
31
schedule :
32
32
- cron : '04 2 * * *'
33
33
34
34
concurrency :
35
- group : ${{ github.workflow }}-${{ github.ref }}
35
+ group : ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
36
36
cancel-in-progress : true
37
37
38
38
jobs :
@@ -42,11 +42,33 @@ jobs:
42
42
RUNNER_LABEL : Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
43
43
N_USERS : 8
44
44
DURATION : 10m
45
- if : ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
45
+
46
+ strategy :
47
+ matrix :
48
+ model : [phi-2]
49
+ ftype : [q4_0, q8_0, f16]
50
+ include :
51
+ - model : phi-2
52
+ ftype : q4_0
53
+ pr_comment_enabled : "true"
54
+
55
+ if : |
56
+ inputs.gpu-series == 'Standard_NC4as_T4_v3'
57
+ || (
58
+ github.event_name == 'schedule'
59
+ && github.ref_name == 'master'
60
+ && github.repository_owner == 'ggerganov'
61
+ )
62
+ || github.event_name == 'pull_request_target'
63
+ || (
64
+ github.event_name == 'push'
65
+ && github.event.ref == 'refs/heads/master'
66
+ && github.repository_owner == 'ggerganov'
67
+ )
46
68
steps :
47
69
- name : Clone
48
70
id : checkout
49
- uses : actions/checkout@v3
71
+ uses : actions/checkout@v4
50
72
with :
51
73
fetch-depth : 0
52
74
ref : ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
@@ -69,20 +91,24 @@ jobs:
69
91
sleep 0.1
70
92
done
71
93
72
- - name : Install k6
94
+ - name : Set up Go
95
+ uses : actions/setup-go@v5
96
+ with :
97
+ go-version : '1.21'
98
+
99
+ - name : Install k6 and xk6-sse
73
100
id : k6_installation
74
101
run : |
75
102
cd examples/server/bench
76
- wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
77
- tar xzf k6*.tar.gz --strip-components=1
103
+ go install go.k6.io/xk6/cmd/xk6@latest
104
+ xk6 build master \
105
+ --with github.com/phymbert/xk6-sse
78
106
79
107
- name : Build
80
108
id : cmake_build
81
109
run : |
82
110
set -eux
83
- mkdir build
84
- cd build
85
- cmake .. \
111
+ cmake -B build \
86
112
-DLLAMA_NATIVE=OFF \
87
113
-DLLAMA_BUILD_SERVER=ON \
88
114
-DLLAMA_CURL=ON \
93
119
-DLLAMA_FATAL_WARNINGS=OFF \
94
120
-DLLAMA_ALL_WARNINGS=OFF \
95
121
-DCMAKE_BUILD_TYPE=Release;
96
- cmake --build . --config Release -j $(nproc) --target server
122
+ cmake --build build --config Release -j $(nproc) --target server
97
123
98
124
- name : Download the dataset
99
125
id : download_dataset
@@ -108,15 +134,15 @@ jobs:
108
134
109
135
cd examples/server/bench
110
136
source venv/bin/activate
111
- BENCH_K6_BIN_PATH=./k6 python bench.py \
137
+ python bench.py \
112
138
--runner-label ${{ env.RUNNER_LABEL }} \
113
139
--name ${{ github.job }} \
114
140
--branch ${{ github.head_ref || github.ref_name }} \
115
141
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
116
142
--scenario script.js \
117
143
--duration ${{ github.event.inputs.duration || env.DURATION }} \
118
144
--hf-repo ggml-org/models \
119
- --hf-file phi-2/ggml-model-q4_0.gguf \
145
+ --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
120
146
--model-path-prefix /models \
121
147
--parallel ${{ env.N_USERS }} \
122
148
-ngl 33 \
@@ -134,7 +160,7 @@ jobs:
134
160
135
161
- uses : actions/upload-artifact@v4
136
162
with :
137
- name : benchmark-results
163
+ name : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
138
164
compression-level : 9
139
165
path : |
140
166
examples/server/bench/*.jpg
@@ -146,7 +172,7 @@ jobs:
146
172
with :
147
173
authToken : ${{secrets.GITHUB_TOKEN}}
148
174
sha : ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
149
- context : bench-server-baseline
175
+ context : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
150
176
description : |
151
177
${{ env.BENCH_RESULTS }}
152
178
state : 'success'
@@ -203,21 +229,26 @@ jobs:
203
229
- name : Comment PR
204
230
uses : mshick/add-pr-comment@v2
205
231
id : comment_pr
206
- if : ${{ github.event.pull_request != '' }}
232
+ if : ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
207
233
with :
208
- message-id : bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
234
+ message-id : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
209
235
message : |
210
- 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
236
+ <p align="center">
211
237
212
- - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
213
- - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
214
- - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
215
- - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
216
- - ${{ env.BENCH_GRAPH_XLABEL }}
238
+ 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
239
+
240
+ </p>
217
241
218
242
<details>
219
243
220
- <summary>Time series</summary>
244
+ <summary>Expand details for performance related PR only</summary>
245
+
246
+ - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
247
+ - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
248
+ - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
249
+ - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
250
+ - ${{ env.BENCH_GRAPH_XLABEL }}
251
+
221
252
222
253
<p align="center">
223
254
0 commit comments