Skip to content

Commit 68e7c25

Browse files
authored
Merge branch 'master' into smooth-pr
2 parents 91f3db8 + 8c570c9 commit 68e7c25

File tree

276 files changed

+76245
-55840
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

276 files changed

+76245
-55840
lines changed

.devops/full-cuda.Dockerfile

+3-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
1212
ARG CUDA_DOCKER_ARCH=all
1313

1414
RUN apt-get update && \
15-
apt-get install -y build-essential python3 python3-pip git
15+
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
1616

1717
COPY requirements.txt requirements.txt
1818
COPY requirements requirements
@@ -28,6 +28,8 @@ COPY . .
2828
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
2929
# Enable CUDA
3030
ENV LLAMA_CUDA=1
31+
# Enable cURL
32+
ENV LLAMA_CURL=1
3133

3234
RUN make
3335

.devops/full-rocm.Dockerfile

+5
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
4040
ENV CC=/opt/rocm/llvm/bin/clang
4141
ENV CXX=/opt/rocm/llvm/bin/clang++
4242

43+
# Enable cURL
44+
ENV LLAMA_CURL=1
45+
RUN apt-get update && \
46+
apt-get install -y libcurl4-openssl-dev
47+
4348
RUN make
4449

4550
ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/full.Dockerfile

+4-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
33
FROM ubuntu:$UBUNTU_VERSION as build
44

55
RUN apt-get update && \
6-
apt-get install -y build-essential python3 python3-pip git
6+
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
77

88
COPY requirements.txt requirements.txt
99
COPY requirements requirements
@@ -15,6 +15,9 @@ WORKDIR /app
1515

1616
COPY . .
1717

18+
ENV LLAMA_CURL=1
19+
20+
1821
RUN make
1922

2023
ENV LC_ALL=C.utf8

.devops/main-intel.Dockerfile

+3-5
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,12 @@ WORKDIR /app
1010

1111
COPY . .
1212

13-
RUN mkdir build && \
14-
cd build && \
15-
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
13+
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
1614
echo "LLAMA_SYCL_F16 is set" && \
1715
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
1816
fi && \
19-
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
20-
cmake --build . --config Release --target main
17+
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
18+
cmake --build build --config Release --target main
2119

2220
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
2321

.devops/main-vulkan.Dockerfile

+2-4
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,8 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
1414
# Build it
1515
WORKDIR /app
1616
COPY . .
17-
RUN mkdir build && \
18-
cd build && \
19-
cmake .. -DLLAMA_VULKAN=1 && \
20-
cmake --build . --config Release --target main
17+
RUN cmake -B build -DLLAMA_VULKAN=1 && \
18+
cmake --build build --config Release --target main
2119

2220
# Clean up
2321
WORKDIR /

.devops/server-cuda.Dockerfile

+6-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
1212
ARG CUDA_DOCKER_ARCH=all
1313

1414
RUN apt-get update && \
15-
apt-get install -y build-essential git
15+
apt-get install -y build-essential git libcurl4-openssl-dev
1616

1717
WORKDIR /app
1818

@@ -22,11 +22,16 @@ COPY . .
2222
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
2323
# Enable CUDA
2424
ENV LLAMA_CUDA=1
25+
# Enable cURL
26+
ENV LLAMA_CURL=1
2527

2628
RUN make
2729

2830
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
2931

32+
RUN apt-get update && \
33+
apt-get install -y libcurl4-openssl-dev
34+
3035
COPY --from=build /app/server /server
3136

3237
ENTRYPOINT [ "/server" ]

.devops/server-intel.Dockerfile

+7-6
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,24 @@ FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
44

55
ARG LLAMA_SYCL_F16=OFF
66
RUN apt-get update && \
7-
apt-get install -y git
7+
apt-get install -y git libcurl4-openssl-dev
88

99
WORKDIR /app
1010

1111
COPY . .
1212

13-
RUN mkdir build && \
14-
cd build && \
15-
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
13+
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
1614
echo "LLAMA_SYCL_F16 is set" && \
1715
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
1816
fi && \
19-
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
20-
cmake --build . --config Release --target server
17+
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
18+
cmake --build build --config Release --target server
2119

2220
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
2321

22+
RUN apt-get update && \
23+
apt-get install -y libcurl4-openssl-dev
24+
2425
COPY --from=build /app/build/bin/server /server
2526

2627
ENV LC_ALL=C.utf8

.devops/server-rocm.Dockerfile

+5
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
4040
ENV CC=/opt/rocm/llvm/bin/clang
4141
ENV CXX=/opt/rocm/llvm/bin/clang++
4242

43+
# Enable cURL
44+
ENV LLAMA_CURL=1
45+
RUN apt-get update && \
46+
apt-get install -y libcurl4-openssl-dev
47+
4348
RUN make
4449

4550
ENTRYPOINT [ "/app/server" ]

.devops/server-vulkan.Dockerfile

+6-4
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,15 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
1111
apt update -y && \
1212
apt-get install -y vulkan-sdk
1313

14+
# Install cURL
15+
RUN apt-get update && \
16+
apt-get install -y libcurl4-openssl-dev
17+
1418
# Build it
1519
WORKDIR /app
1620
COPY . .
17-
RUN mkdir build && \
18-
cd build && \
19-
cmake .. -DLLAMA_VULKAN=1 && \
20-
cmake --build . --config Release --target server
21+
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
22+
cmake --build build --config Release --target server
2123

2224
# Clean up
2325
WORKDIR /

.devops/server.Dockerfile

+6-1
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,21 @@ ARG UBUNTU_VERSION=22.04
33
FROM ubuntu:$UBUNTU_VERSION as build
44

55
RUN apt-get update && \
6-
apt-get install -y build-essential git
6+
apt-get install -y build-essential git libcurl4-openssl-dev
77

88
WORKDIR /app
99

1010
COPY . .
1111

12+
ENV LLAMA_CURL=1
13+
1214
RUN make
1315

1416
FROM ubuntu:$UBUNTU_VERSION as runtime
1517

18+
RUN apt-get update && \
19+
apt-get install -y libcurl4-openssl-dev
20+
1621
COPY --from=build /app/server /server
1722

1823
ENV LC_ALL=C.utf8

.flake8

+15-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
11
[flake8]
22
max-line-length = 125
3-
ignore = W503
3+
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
4+
exclude =
5+
# Do not traverse examples
6+
examples,
7+
# Do not include package initializers
8+
__init__.py,
9+
# No need to traverse our git directory
10+
.git,
11+
# There's no value in checking cache directories
12+
__pycache__,
13+
# No need to include the build path
14+
build,
15+
# This contains builds that we don't want to check
16+
dist # This is generated with `python build .` for package releases
17+
# max-complexity = 10

.github/workflows/bench.yml

+56-25
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ on:
2424
push:
2525
branches:
2626
- master
27-
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
27+
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
2828
pull_request_target:
2929
types: [opened, synchronize, reopened]
30-
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
30+
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
3131
schedule:
3232
- cron: '04 2 * * *'
3333

3434
concurrency:
35-
group: ${{ github.workflow }}-${{ github.ref }}
35+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
3636
cancel-in-progress: true
3737

3838
jobs:
@@ -42,11 +42,33 @@ jobs:
4242
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
4343
N_USERS: 8
4444
DURATION: 10m
45-
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
45+
46+
strategy:
47+
matrix:
48+
model: [phi-2]
49+
ftype: [q4_0, q8_0, f16]
50+
include:
51+
- model: phi-2
52+
ftype: q4_0
53+
pr_comment_enabled: "true"
54+
55+
if: |
56+
inputs.gpu-series == 'Standard_NC4as_T4_v3'
57+
|| (
58+
github.event_name == 'schedule'
59+
&& github.ref_name == 'master'
60+
&& github.repository_owner == 'ggerganov'
61+
)
62+
|| github.event_name == 'pull_request_target'
63+
|| (
64+
github.event_name == 'push'
65+
&& github.event.ref == 'refs/heads/master'
66+
&& github.repository_owner == 'ggerganov'
67+
)
4668
steps:
4769
- name: Clone
4870
id: checkout
49-
uses: actions/checkout@v3
71+
uses: actions/checkout@v4
5072
with:
5173
fetch-depth: 0
5274
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
@@ -69,20 +91,24 @@ jobs:
6991
sleep 0.1
7092
done
7193
72-
- name: Install k6
94+
- name: Set up Go
95+
uses: actions/setup-go@v5
96+
with:
97+
go-version: '1.21'
98+
99+
- name: Install k6 and xk6-sse
73100
id: k6_installation
74101
run: |
75102
cd examples/server/bench
76-
wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
77-
tar xzf k6*.tar.gz --strip-components=1
103+
go install go.k6.io/xk6/cmd/xk6@latest
104+
xk6 build master \
105+
--with github.com/phymbert/xk6-sse
78106
79107
- name: Build
80108
id: cmake_build
81109
run: |
82110
set -eux
83-
mkdir build
84-
cd build
85-
cmake .. \
111+
cmake -B build \
86112
-DLLAMA_NATIVE=OFF \
87113
-DLLAMA_BUILD_SERVER=ON \
88114
-DLLAMA_CURL=ON \
@@ -93,7 +119,7 @@ jobs:
93119
-DLLAMA_FATAL_WARNINGS=OFF \
94120
-DLLAMA_ALL_WARNINGS=OFF \
95121
-DCMAKE_BUILD_TYPE=Release;
96-
cmake --build . --config Release -j $(nproc) --target server
122+
cmake --build build --config Release -j $(nproc) --target server
97123
98124
- name: Download the dataset
99125
id: download_dataset
@@ -108,15 +134,15 @@ jobs:
108134
109135
cd examples/server/bench
110136
source venv/bin/activate
111-
BENCH_K6_BIN_PATH=./k6 python bench.py \
137+
python bench.py \
112138
--runner-label ${{ env.RUNNER_LABEL }} \
113139
--name ${{ github.job }} \
114140
--branch ${{ github.head_ref || github.ref_name }} \
115141
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
116142
--scenario script.js \
117143
--duration ${{ github.event.inputs.duration || env.DURATION }} \
118144
--hf-repo ggml-org/models \
119-
--hf-file phi-2/ggml-model-q4_0.gguf \
145+
--hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
120146
--model-path-prefix /models \
121147
--parallel ${{ env.N_USERS }} \
122148
-ngl 33 \
@@ -134,7 +160,7 @@ jobs:
134160
135161
- uses: actions/upload-artifact@v4
136162
with:
137-
name: benchmark-results
163+
name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
138164
compression-level: 9
139165
path: |
140166
examples/server/bench/*.jpg
@@ -146,7 +172,7 @@ jobs:
146172
with:
147173
authToken: ${{secrets.GITHUB_TOKEN}}
148174
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
149-
context: bench-server-baseline
175+
context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
150176
description: |
151177
${{ env.BENCH_RESULTS }}
152178
state: 'success'
@@ -203,21 +229,26 @@ jobs:
203229
- name: Comment PR
204230
uses: mshick/add-pr-comment@v2
205231
id: comment_pr
206-
if: ${{ github.event.pull_request != '' }}
232+
if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
207233
with:
208-
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
234+
message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
209235
message: |
210-
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
236+
<p align="center">
211237
212-
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
213-
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
214-
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
215-
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
216-
- ${{ env.BENCH_GRAPH_XLABEL }}
238+
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
239+
240+
</p>
217241
218242
<details>
219243
220-
<summary>Time series</summary>
244+
<summary>Expand details for performance related PR only</summary>
245+
246+
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
247+
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
248+
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
249+
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
250+
- ${{ env.BENCH_GRAPH_XLABEL }}
251+
221252
222253
<p align="center">
223254

0 commit comments

Comments
 (0)