
Commit 29d449b

vrantala, lvliang-intel, and ZePan110 authored
Added initial version of DocSum support for benchmarking scripts for OPEA (#1840)
Signed-off-by: Valtteri Rantala <valtteri.rantala@intel.com>
Co-authored-by: Liang Lv <liang1.lv@intel.com>
Co-authored-by: ZePan110 <ze.pan@intel.com>
1 parent 338f814 commit 29d449b


3 files changed, +120 −2 lines


DocSum/benchmark_docsum.yaml

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+deploy:
+  device: gaudi
+  version: 1.2.0
+  modelUseHostPath: /mnt/models
+  HUGGINGFACEHUB_API_TOKEN: "" # mandatory
+  node: [1]
+  namespace: ""
+  node_name: []
+  timeout: 1000 # timeout in seconds for services to be ready, default 30 minutes
+  interval: 5 # interval in seconds between service ready checks, default 5 seconds
+
+  services:
+    backend:
+      resources:
+        enabled: False
+        cores_per_instance: "16"
+        memory_capacity: "8000Mi"
+      replicaCount: [1]
+
+    teirerank:
+      enabled: False
+
+    llm:
+      engine: vllm # or tgi
+      model_id: "meta-llama/Llama-3.2-3B-Instruct" # mandatory
+      replicaCount:
+        without_teirerank: [1] # When teirerank.enabled is False
+      resources:
+        enabled: False
+        cards_per_instance: 1
+      model_params:
+        vllm: # VLLM specific parameters
+          batch_params:
+            enabled: True
+            max_num_seqs: "8" # Each value triggers an LLM service upgrade
+          token_params:
+            enabled: True
+            max_input_length: ""
+            max_total_tokens: ""
+            max_batch_total_tokens: ""
+            max_batch_prefill_tokens: ""
+        tgi: # TGI specific parameters
+          batch_params:
+            enabled: True
+            max_batch_size: [1] # Each value triggers an LLM service upgrade
+          token_params:
+            enabled: False
+            max_input_length: "1280"
+            max_total_tokens: "2048"
+            max_batch_total_tokens: "65536"
+            max_batch_prefill_tokens: "4096"
+
+    docsum-ui:
+      replicaCount: [1]
+
+    whisper:
+      replicaCount: [1]
+
+    llm-uservice:
+      model_id: "meta-llama/Llama-3.2-3B-Instruct" # mandatory
+      replicaCount: [1]
+
+    nginx:
+      replicaCount: [1]
+
+benchmark:
+  # http request behavior related fields
+  user_queries: [16]
+  concurrency: [4]
+  load_shape_type: "constant" # "constant" or "poisson"
+  poisson_arrival_rate: 1.0 # only used when load_shape_type is "poisson"
+  warmup_iterations: 10
+  seed: 1024
+  collect_service_metric: True
+
+  # workload, all of the test cases will run for benchmark
+  bench_target: ["docsumfixed"] # specify the bench_target for benchmark
+  dataset: "/home/sdp/upload.txt" # specify the absolute path to the dataset file
+  summary_type: "stuff"
+  stream: True
+
+  llm:
+    # specify the llm output token size
+    max_token_size: [1024]
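The comments above mark HUGGINGFACEHUB_API_TOKEN, the llm model_id, and (for DocSum) the dataset path as mandatory. Below is a minimal pre-flight sketch — a hypothetical helper, not part of this commit, assuming the nesting shown above and PyYAML — that flags an empty token or a missing dataset file before a run is scheduled:

# check_docsum_config.py -- hypothetical pre-flight check, not part of this commit
import os

import yaml  # PyYAML


def validate_docsum_config(path="DocSum/benchmark_docsum.yaml"):
    """Return a list of problems for the fields marked mandatory above."""
    with open(path) as f:
        cfg = yaml.safe_load(f)

    deploy = cfg.get("deploy", {})
    bench = cfg.get("benchmark", {})
    problems = []

    if not deploy.get("HUGGINGFACEHUB_API_TOKEN"):
        problems.append("deploy.HUGGINGFACEHUB_API_TOKEN is empty (marked mandatory)")
    if not deploy.get("services", {}).get("llm", {}).get("model_id"):
        problems.append("deploy.services.llm.model_id is empty (marked mandatory)")
    dataset = bench.get("dataset", "")
    if not dataset or not os.path.isfile(dataset):
        problems.append(f"benchmark.dataset {dataset!r} is not an existing file")

    return problems


if __name__ == "__main__":
    for problem in validate_docsum_config():
        print("WARNING:", problem)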

benchmark.py

Lines changed: 31 additions & 2 deletions
@@ -12,6 +12,7 @@
 # only support chatqna for now
 service_endpoints = {
     "chatqna": "/v1/chatqna",
+    "docsum": "/v1/docsum",
 }
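With this entry, a DocSum deployment is addressed at /v1/docsum. As a hedged illustration (the actual call site is outside this hunk), an endpoint map like this is typically joined to the service's base URL as follows:

# Illustration only: composing a request URL from the endpoint map above.
service_endpoints = {
    "chatqna": "/v1/chatqna",
    "docsum": "/v1/docsum",
}


def build_url(base_url: str, chart_name: str) -> str:
    # build_url("http://10.0.0.5:8888", "docsum") -> "http://10.0.0.5:8888/v1/docsum"
    return base_url.rstrip("/") + service_endpoints[chart_name]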

@@ -35,6 +36,9 @@ def construct_benchmark_config(test_suite_config):
         "dataset": test_suite_config.get("dataset", ""),
         "prompt": test_suite_config.get("prompt", [10]),
         "llm_max_token_size": test_suite_config.get("llm", {}).get("max_token_size", [128]),
+        "collect_service_metric": test_suite_config.get("collect_service_metric", False),
+        "summary_type": test_suite_config.get("summary_type", "auto"),
+        "stream": test_suite_config.get("stream", "auto"),
     }
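All three new keys rely on dict.get() defaults, so a config that omits them still yields a usable benchmark config. A standalone illustration (not code from this commit) of how they resolve for a YAML that only sets summary_type and stream:

# Illustration only: default resolution for the new construct_benchmark_config keys.
test_suite_config = {"summary_type": "stuff", "stream": True}  # e.g. from benchmark_docsum.yaml

summary_type = test_suite_config.get("summary_type", "auto")      # -> "stuff"
stream = test_suite_config.get("stream", "auto")                  # -> True
collect = test_suite_config.get("collect_service_metric", False)  # -> False (key absent)
print(summary_type, stream, collect)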

@@ -144,6 +148,8 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
             "llm-model": test_params["llm_model"],
             "deployment-type": test_params["deployment_type"],
             "load-shape": load_shape,
+            "summary_type": test_params.get("summary_type", "auto"),
+            "stream": test_params.get("stream", True),
         },
         "runs": [{"name": test_phase, "users": concurrency, "max-request": num_queries}],
     }
@@ -373,7 +379,9 @@ def run_benchmark(benchmark_config, chart_name, namespace, node_num=1, llm_model
         "user_queries": parsed_data["user_queries"],  # num of user queries
         "random_prompt": False,  # whether to use random prompt, set to False by default
         "run_time": "30m",  # The max total run time for the test suite, set to 60m by default
-        "collect_service_metric": False,  # whether to collect service metrics, set to False by default
+        "collect_service_metric": (
+            parsed_data["collect_service_metric"] if parsed_data["collect_service_metric"] else False
+        ),  # Metrics collection set to False by default
         "llm_model": llm_model,  # The LLM model used for the test
         "deployment_type": "k8s",  # Default is "k8s", can also be "docker"
         "service_ip": None,  # Leave as None for k8s, specify for Docker
@@ -398,9 +406,15 @@
         "dataset": parsed_data["dataset"],
         "prompt": parsed_data["prompt"],
         "llm_max_token_size": parsed_data["llm_max_token_size"],
+        "summary_type": parsed_data["summary_type"],
+        "stream": parsed_data["stream"],
     }
 
-    dataset = None
+    if parsed_data["dataset"]:  # This checks if user provided dataset/document for DocSum service
+        dataset = parsed_data["dataset"]
+    else:
+        dataset = None
+
     query_data = None
     os.environ["MODEL_NAME"] = test_suite_config.get("llm_model", "meta-llama/Meta-Llama-3-8B-Instruct")
     # Do benchmark in for-loop for different llm_max_token_size
@@ -428,6 +442,21 @@
             "max_output": llm_max_token,  # max number of output tokens
             "k": 1,  # number of retrieved documents
         }
+        if chart_name == "docsum":
+            case_data = {
+                "run_test": True,
+                "service_name": "docsum",
+                "service_list": [
+                    "docsum",
+                    "docsum-llm-uservice",
+                    "docsum-vllm",
+                ],
+                "stream": parsed_data["stream"],
+                "max_output": llm_max_token,  # max number of output tokens
+                "summary_type": parsed_data["summary_type"],  # Summary_type for DocSum
+                "dataset": dataset,  # Dataset used for document summary
+            }
+
         output_folder = _run_service_test(chart_name, case_data, test_suite_config, namespace)
 
         print(f"[OPEA BENCHMARK] 🚀 Test Finished. Output saved in {output_folder}.")

deploy.py

Lines changed: 2 additions & 0 deletions
@@ -177,6 +177,8 @@ def configure_models(values, deploy_config):
             values[service_name]["EMBEDDING_MODEL_ID"] = model_id
         elif service_name == "teirerank":
             values[service_name]["RERANK_MODEL_ID"] = model_id
+        elif service_name == "llm-uservice":
+            values[service_name]["LLM_MODEL_ID"] = model_id
 
     return values
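configure_models assigns each service's model_id to the Helm values key its chart reads; the new branch extends that mapping to llm-uservice so the DocSum LLM microservice picks up LLM_MODEL_ID. A condensed sketch of the mapping after this change (illustrative only; the embedding service name is assumed, since its condition line falls outside this hunk):

# Illustrative sketch, not the actual configure_models implementation.
MODEL_ID_KEYS = {
    "tei": "EMBEDDING_MODEL_ID",     # service name assumed; its condition is outside this hunk
    "teirerank": "RERANK_MODEL_ID",
    "llm-uservice": "LLM_MODEL_ID",  # added by this commit
}


def set_model_id(values: dict, service_name: str, model_id: str) -> dict:
    key = MODEL_ID_KEYS.get(service_name)
    if key:
        values.setdefault(service_name, {})[key] = model_id
    return values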
