24
24
push :
25
25
branches :
26
26
- master
27
- paths : ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
27
+ paths : ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
28
28
pull_request_target :
29
29
types : [opened, synchronize, reopened]
30
- paths : ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
30
+ paths : ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
31
31
schedule :
32
32
- cron : '04 2 * * *'
33
33
34
34
concurrency :
35
- group : ${{ github.workflow }}-${{ github.ref }}
35
+ group : ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
36
36
cancel-in-progress : true
37
37
38
38
jobs :
@@ -42,11 +42,33 @@ jobs:
42
42
RUNNER_LABEL : Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
43
43
N_USERS : 8
44
44
DURATION : 10m
45
- if : ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
45
+
46
+ strategy :
47
+ matrix :
48
+ model : [phi-2]
49
+ ftype : [q4_0, q8_0, f16]
50
+ include :
51
+ - model : phi-2
52
+ ftype : q4_0
53
+ pr_comment_enabled : "true"
54
+
55
+ if : |
56
+ inputs.gpu-series == 'Standard_NC4as_T4_v3'
57
+ || (
58
+ github.event_name == 'schedule'
59
+ && github.ref_name == 'master'
60
+ && github.repository_owner == 'ggerganov'
61
+ )
62
+ || github.event_name == 'pull_request_target'
63
+ || (
64
+ github.event_name == 'push'
65
+ && github.event.ref == 'refs/heads/master'
66
+ && github.repository_owner == 'ggerganov'
67
+ )
46
68
steps :
47
69
- name : Clone
48
70
id : checkout
49
- uses : actions/checkout@v3
71
+ uses : actions/checkout@v4
50
72
with :
51
73
fetch-depth : 0
52
74
ref : ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
@@ -69,20 +91,24 @@ jobs:
69
91
sleep 0.1
70
92
done
71
93
72
- - name : Install k6
94
+ - name : Set up Go
95
+ uses : actions/setup-go@v5
96
+ with :
97
+ go-version : '1.21'
98
+
99
+ - name : Install k6 and xk6-sse
73
100
id : k6_installation
74
101
run : |
75
102
cd examples/server/bench
76
- wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
77
- tar xzf k6*.tar.gz --strip-components=1
103
+ go install go.k6.io/xk6/cmd/xk6@latest
104
+ xk6 build master \
105
+ --with github.com/phymbert/xk6-sse
78
106
79
107
- name : Build
80
108
id : cmake_build
81
109
run : |
82
110
set -eux
83
- mkdir build
84
- cd build
85
- cmake .. \
111
+ cmake -B build \
86
112
-DLLAMA_NATIVE=OFF \
87
113
-DLLAMA_BUILD_SERVER=ON \
88
114
-DLLAMA_CURL=ON \
93
119
-DLLAMA_FATAL_WARNINGS=OFF \
94
120
-DLLAMA_ALL_WARNINGS=OFF \
95
121
-DCMAKE_BUILD_TYPE=Release;
96
- cmake --build . --config Release -j $(nproc) --target server
122
+ cmake --build build --config Release -j $(nproc) --target server
97
123
98
124
- name : Download the dataset
99
125
id : download_dataset
@@ -108,15 +134,15 @@ jobs:
108
134
109
135
cd examples/server/bench
110
136
source venv/bin/activate
111
- BENCH_K6_BIN_PATH=./k6 python bench.py \
137
+ python bench.py \
112
138
--runner-label ${{ env.RUNNER_LABEL }} \
113
139
--name ${{ github.job }} \
114
140
--branch ${{ github.head_ref || github.ref_name }} \
115
141
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
116
142
--scenario script.js \
117
143
--duration ${{ github.event.inputs.duration || env.DURATION }} \
118
144
--hf-repo ggml-org/models \
119
- --hf-file phi-2/ggml-model-q4_0.gguf \
145
+ --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
120
146
--model-path-prefix /models \
121
147
--parallel ${{ env.N_USERS }} \
122
148
-ngl 33 \
@@ -134,7 +160,7 @@ jobs:
134
160
135
161
- uses : actions/upload-artifact@v4
136
162
with :
137
- name : benchmark-results
163
+ name : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
138
164
compression-level : 9
139
165
path : |
140
166
examples/server/bench/*.jpg
@@ -146,7 +172,7 @@ jobs:
146
172
with :
147
173
authToken : ${{secrets.GITHUB_TOKEN}}
148
174
sha : ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
149
- context : bench-server-baseline
175
+ context : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
150
176
description : |
151
177
${{ env.BENCH_RESULTS }}
152
178
state : 'success'
@@ -203,21 +229,26 @@ jobs:
203
229
- name : Comment PR
204
230
uses : mshick/add-pr-comment@v2
205
231
id : comment_pr
206
- if : ${{ github.event.pull_request != '' }}
232
+ if : ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
207
233
with :
208
- message-id : bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
234
+ message-id : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
209
235
message : |
210
- 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
236
+ <p align="center">
211
237
212
- - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
213
- - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
214
- - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
215
- - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
216
- - ${{ env.BENCH_GRAPH_XLABEL }}
238
+ 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
239
+
240
+ </p>
217
241
218
242
<details>
219
243
220
- <summary>Time series</summary>
244
+ <summary>Expand details for performance related PR only</summary>
245
+
246
+ - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
247
+ - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
248
+ - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
249
+ - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
250
+ - ${{ env.BENCH_GRAPH_XLABEL }}
251
+
221
252
222
253
<p align="center">
223
254
0 commit comments