Skip to content

Commit 004b0c0

Browse files
committed
Merge remote-tracking branch 'upstream/main' into fuyu
2 parents bf89017 + eeceada commit 004b0c0

File tree

208 files changed

+9478
-2362
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

208 files changed

+9478
-2362
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
2+
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
3+
tasks:
4+
- name: "gsm8k"
5+
metrics:
6+
- name: "exact_match,strict-match"
7+
value: 0.593
8+
- name: "exact_match,flexible-extract"
9+
value: 0.588
10+
limit: 1000
11+
num_fewshot: 5
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
2+
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
3+
tasks:
4+
- name: "gsm8k"
5+
metrics:
6+
- name: "exact_match,strict-match"
7+
value: 0.595
8+
- name: "exact_match,flexible-extract"
9+
value: 0.582
10+
limit: 1000
11+
num_fewshot: 5

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ Meta-Llama-3-8B-Instruct.yaml
22
Meta-Llama-3-8B-Instruct-FP8.yaml
33
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
44
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
5+
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml

.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# We use this for fp8, which HF does not support.
44
#
55
# Make sure you have lm-eval-harness installed:
6-
# pip install lm-eval==0.4.2
6+
# pip install lm-eval==0.4.3
77

88
usage() {
99
echo``
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
4646
done
4747

4848
lm_eval --model vllm \
49-
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true \
49+
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray" \
5050
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
5151
--batch_size $BATCH_SIZE

.buildkite/nightly-benchmarks/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# vLLM benchmark suite
22

3+
34
## Introduction
45

56
This directory contains the performance benchmarking CI for vllm.

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ steps:
1111
- sh
1212
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
1313
- wait
14-
- label: "A100 Benchmark"
14+
- label: "A100"
1515
agents:
1616
queue: A100
1717
plugins:
@@ -42,21 +42,20 @@ steps:
4242
- name: devshm
4343
emptyDir:
4444
medium: Memory
45-
# - label: "H100: NVIDIA SMI"
46-
# agents:
47-
# queue: H100
48-
# plugins:
49-
# - docker#v5.11.0:
50-
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
51-
# command:
52-
# - bash
53-
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
54-
# mount-buildkite-agent: true
55-
# propagate-environment: true
56-
# propagate-uid-gid: false
57-
# ipc: host
58-
# gpus: all
59-
# environment:
60-
# - VLLM_USAGE_SOURCE
61-
# - HF_TOKEN
45+
- label: "H100"
46+
agents:
47+
queue: H100
48+
plugins:
49+
- docker#v5.11.0:
50+
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
51+
command:
52+
- bash
53+
- .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
54+
mount-buildkite-agent: true
55+
propagate-environment: true
56+
ipc: host
57+
gpus: all
58+
environment:
59+
- VLLM_USAGE_SOURCE
60+
- HF_TOKEN
6261

.buildkite/nightly-benchmarks/kickoff-pipeline.sh

Lines changed: 0 additions & 27 deletions
This file was deleted.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
2+
# Nightly benchmark
3+
4+
The main goal of this benchmarking is two-fold:
5+
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance under which workload.
6+
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions in [reproduce.md]().
7+
8+
9+
## Docker images
10+
11+
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
12+
- vllm/vllm-openai:v0.5.0.post1
13+
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
14+
- openmmlab/lmdeploy:v0.5.0
15+
- ghcr.io/huggingface/text-generation-inference:2.1
16+
17+
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
18+
19+
20+
## Hardware
21+
22+
One AWS node with 8x NVIDIA A100 GPUs.
23+
24+
25+
## Workload description
26+
27+
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
28+
29+
- Input length: randomly sample 500 prompts from the ShareGPT dataset (with fixed random seed).
30+
- Output length: the corresponding output length of these 500 prompts.
31+
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
32+
- Average QPS (queries per second): 4 for the small model (llama-3 8B) and 2 for the other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
33+
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
34+
35+
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
36+
37+
## Plots
38+
39+
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
40+
41+
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
42+
43+
## Results
44+
45+
{nightly_results_benchmarking_table}
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
common_pod_spec: &common_pod_spec
2+
priorityClassName: perf-benchmark
3+
nodeSelector:
4+
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
5+
volumes:
6+
- name: devshm
7+
emptyDir:
8+
medium: Memory
9+
- name: hf-cache
10+
hostPath:
11+
path: /root/.cache/huggingface
12+
type: Directory
13+
14+
common_container_settings: &common_container_settings
15+
command:
16+
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
17+
resources:
18+
limits:
19+
nvidia.com/gpu: 8
20+
volumeMounts:
21+
- name: devshm
22+
mountPath: /dev/shm
23+
- name: hf-cache
24+
mountPath: /root/.cache/huggingface
25+
env:
26+
- name: VLLM_USAGE_SOURCE
27+
value: ci-test
28+
- name: HF_HOME
29+
value: /root/.cache/huggingface
30+
- name: VLLM_SOURCE_CODE_LOC
31+
value: /workspace/build/buildkite/vllm/performance-benchmark
32+
- name: HF_TOKEN
33+
valueFrom:
34+
secretKeyRef:
35+
name: hf-token-secret
36+
key: token
37+
38+
steps:
39+
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
40+
- label: "A100 trt benchmark"
41+
priority: 100
42+
agents:
43+
queue: A100
44+
plugins:
45+
- kubernetes:
46+
podSpec:
47+
<<: *common_pod_spec
48+
containers:
49+
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
50+
<<: *common_container_settings
51+
52+
- label: "A100 lmdeploy benchmark"
53+
priority: 100
54+
agents:
55+
queue: A100
56+
plugins:
57+
- kubernetes:
58+
podSpec:
59+
<<: *common_pod_spec
60+
containers:
61+
- image: openmmlab/lmdeploy:v0.5.0
62+
<<: *common_container_settings
63+
64+
65+
- label: "A100 vllm benchmark"
66+
priority: 100
67+
agents:
68+
queue: A100
69+
plugins:
70+
- kubernetes:
71+
podSpec:
72+
<<: *common_pod_spec
73+
containers:
74+
- image: vllm/vllm-openai:latest
75+
<<: *common_container_settings
76+
77+
- label: "A100 tgi benchmark"
78+
priority: 100
79+
agents:
80+
queue: A100
81+
plugins:
82+
- kubernetes:
83+
podSpec:
84+
<<: *common_pod_spec
85+
containers:
86+
- image: ghcr.io/huggingface/text-generation-inference:2.1
87+
<<: *common_container_settings
88+
89+
- wait
90+
91+
- label: "Plot"
92+
priority: 100
93+
agents:
94+
queue: A100
95+
plugins:
96+
- kubernetes:
97+
podSpec:
98+
<<: *common_pod_spec
99+
containers:
100+
- image: vllm/vllm-openai:v0.5.0.post1
101+
command:
102+
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
103+
resources:
104+
limits:
105+
nvidia.com/gpu: 8
106+
volumeMounts:
107+
- name: devshm
108+
mountPath: /dev/shm
109+
env:
110+
- name: VLLM_USAGE_SOURCE
111+
value: ci-test
112+
- name: VLLM_SOURCE_CODE_LOC
113+
value: /workspace/build/buildkite/vllm/performance-benchmark
114+
- name: HF_TOKEN
115+
valueFrom:
116+
secretKeyRef:
117+
name: hf-token-secret
118+
key: token
119+
120+
- wait

.buildkite/nightly-benchmarks/run-benchmarks-suite.sh

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ wait_for_server() {
5454
# wait for vllm server to start
5555
# return 1 if vllm server crashes
5656
timeout 1200 bash -c '
57-
until curl localhost:8000/v1/completions; do
57+
until curl -X POST localhost:8000/v1/completions; do
5858
sleep 1
5959
done' && return 0 || return 1
6060
}
@@ -73,8 +73,17 @@ kill_gpu_processes() {
7373
echo "All GPU processes have been killed."
7474
fi
7575

76+
# Sometimes killing by pid doesn't work properly; since we are in a container anyway,
# we can also kill all processes running python or python3.
77+
# since we are in container anyway
78+
pkill -9 -f python
79+
pkill -9 -f python3
80+
7681
# waiting for GPU processes to be fully killed
77-
sleep 10
82+
# loop while nvidia-smi returns any processes
83+
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
84+
sleep 1
85+
echo "Waiting for GPU processes to be killed"
86+
done
7887

7988
# remove vllm config file
8089
rm -rf ~/.config/vllm
@@ -90,12 +99,19 @@ upload_to_buildkite() {
9099
# upload the benchmarking results to buildkite
91100

92101
# if the agent binary is not found, skip uploading the results, exit 0
93-
if [ ! -f /workspace/buildkite-agent ]; then
102+
# Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
103+
if command -v buildkite-agent >/dev/null 2>&1; then
104+
BUILDKITE_AGENT_COMMAND="buildkite-agent"
105+
elif [ -f /workspace/buildkite-agent ]; then
106+
BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
107+
else
94108
echo "buildkite-agent binary not found. Skip uploading the results."
95109
return 0
96110
fi
97-
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
98-
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
111+
112+
# Use the determined command to annotate and upload artifacts
113+
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
114+
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
99115
}
100116

101117
run_latency_tests() {
@@ -269,6 +285,7 @@ run_serving_tests() {
269285
echo "Running test case $test_name"
270286
echo "Server command: $server_command"
271287
eval "$server_command" &
288+
server_pid=$!
272289

273290
# wait until the server is alive
274291
wait_for_server
@@ -318,6 +335,7 @@ run_serving_tests() {
318335
done
319336

320337
# clean up
338+
kill -9 $server_pid
321339
kill_gpu_processes
322340
done
323341
}

0 commit comments

Comments
 (0)