
Commit 29d449b

vrantala, lvliang-intel, and ZePan110 authored
Added initial version of DocSum support for benchmarking scripts for OPEA (#1840)
Signed-off-by: Valtteri Rantala <valtteri.rantala@intel.com>
Co-authored-by: Liang Lv <liang1.lv@intel.com>
Co-authored-by: ZePan110 <ze.pan@intel.com>
1 parent 338f814 commit 29d449b


3 files changed, +120 −2 lines


DocSum/benchmark_docsum.yaml

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+deploy:
+  device: gaudi
+  version: 1.2.0
+  modelUseHostPath: /mnt/models
+  HUGGINGFACEHUB_API_TOKEN: "" # mandatory
+  node: [1]
+  namespace: ""
+  node_name: []
+  timeout: 1000 # timeout in seconds for services to be ready, default 30 minutes
+  interval: 5 # interval in seconds between service ready checks, default 5 seconds
+
+  services:
+    backend:
+      resources:
+        enabled: False
+        cores_per_instance: "16"
+        memory_capacity: "8000Mi"
+      replicaCount: [1]
+
+    teirerank:
+      enabled: False
+
+    llm:
+      engine: vllm # or tgi
+      model_id: "meta-llama/Llama-3.2-3B-Instruct" # mandatory
+      replicaCount:
+        without_teirerank: [1] # When teirerank.enabled is False
+      resources:
+        enabled: False
+        cards_per_instance: 1
+      model_params:
+        vllm: # VLLM specific parameters
+          batch_params:
+            enabled: True
+            max_num_seqs: "8" # Each value triggers an LLM service upgrade
+          token_params:
+            enabled: True
+            max_input_length: ""
+            max_total_tokens: ""
+            max_batch_total_tokens: ""
+            max_batch_prefill_tokens: ""
+        tgi: # TGI specific parameters
+          batch_params:
+            enabled: True
+            max_batch_size: [1] # Each value triggers an LLM service upgrade
+          token_params:
+            enabled: False
+            max_input_length: "1280"
+            max_total_tokens: "2048"
+            max_batch_total_tokens: "65536"
+            max_batch_prefill_tokens: "4096"
+
+    docsum-ui:
+      replicaCount: [1]
+
+    whisper:
+      replicaCount: [1]
+
+    llm-uservice:
+      model_id: "meta-llama/Llama-3.2-3B-Instruct" # mandatory
+      replicaCount: [1]
+
+    nginx:
+      replicaCount: [1]
+
+benchmark:
+  # http request behavior related fields
+  user_queries: [16]
+  concurrency: [4]
+  load_shape_type: "constant" # "constant" or "poisson"
+  poisson_arrival_rate: 1.0 # only used when load_shape_type is "poisson"
+  warmup_iterations: 10
+  seed: 1024
+  collect_service_metric: True
+
+  # workload, all of the test cases will run for benchmark
+  bench_target: ["docsumfixed"] # specify the bench_target for benchmark
+  dataset: "/home/sdp/upload.txt" # specify the absolute path to the dataset file
+  summary_type: "stuff"
+  stream: True
+
+  llm:
+    # specify the llm output token size
+    max_token_size: [1024]
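The comments above mark HUGGINGFACEHUB_API_TOKEN, the llm model_id, and (for DocSum) the dataset path as mandatory. Below is a minimal pre-flight sketch — a hypothetical helper, not part of this commit, assuming the nesting shown above and PyYAML — that flags an empty token or a missing dataset file before a run is scheduled:

# check_docsum_config.py -- hypothetical pre-flight check, not part of this commit
import os

import yaml  # PyYAML


def validate_docsum_config(path="DocSum/benchmark_docsum.yaml"):
    """Return a list of problems for the fields marked mandatory above."""
    with open(path) as f:
        cfg = yaml.safe_load(f)

    deploy = cfg.get("deploy", {})
    bench = cfg.get("benchmark", {})
    problems = []

    if not deploy.get("HUGGINGFACEHUB_API_TOKEN"):
        problems.append("deploy.HUGGINGFACEHUB_API_TOKEN is empty (marked mandatory)")
    if not deploy.get("services", {}).get("llm", {}).get("model_id"):
        problems.append("deploy.services.llm.model_id is empty (marked mandatory)")
    dataset = bench.get("dataset", "")
    if not dataset or not os.path.isfile(dataset):
        problems.append(f"benchmark.dataset {dataset!r} is not an existing file")

    return problems


if __name__ == "__main__":
    for problem in validate_docsum_config():
        print("WARNING:", problem)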

benchmark.py

Lines changed: 31 additions & 2 deletions
@@ -12,6 +12,7 @@
 # only support chatqna for now
 service_endpoints = {
     "chatqna": "/v1/chatqna",
+    "docsum": "/v1/docsum",
 }
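With this entry, a DocSum deployment is addressed at /v1/docsum. As a hedged illustration (the actual call site is outside this hunk), an endpoint map like this is typically joined to the service's base URL as follows:

# Illustration only: composing a request URL from the endpoint map above.
service_endpoints = {
    "chatqna": "/v1/chatqna",
    "docsum": "/v1/docsum",
}


def build_url(base_url: str, chart_name: str) -> str:
    # build_url("http://10.0.0.5:8888", "docsum") -> "http://10.0.0.5:8888/v1/docsum"
    return base_url.rstrip("/") + service_endpoints[chart_name]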

@@ -35,6 +36,9 @@ def construct_benchmark_config(test_suite_config):
         "dataset": test_suite_config.get("dataset", ""),
         "prompt": test_suite_config.get("prompt", [10]),
         "llm_max_token_size": test_suite_config.get("llm", {}).get("max_token_size", [128]),
+        "collect_service_metric": test_suite_config.get("collect_service_metric", False),
+        "summary_type": test_suite_config.get("summary_type", "auto"),
+        "stream": test_suite_config.get("stream", "auto"),
     }
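All three new keys rely on dict.get() defaults, so a config that omits them still yields a usable benchmark config. A standalone illustration (not code from this commit) of how they resolve for a YAML that only sets summary_type and stream:

# Illustration only: default resolution for the new construct_benchmark_config keys.
test_suite_config = {"summary_type": "stuff", "stream": True}  # e.g. from benchmark_docsum.yaml

summary_type = test_suite_config.get("summary_type", "auto")      # -> "stuff"
stream = test_suite_config.get("stream", "auto")                  # -> True
collect = test_suite_config.get("collect_service_metric", False)  # -> False (key absent)
print(summary_type, stream, collect)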

@@ -144,6 +148,8 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
             "llm-model": test_params["llm_model"],
             "deployment-type": test_params["deployment_type"],
             "load-shape": load_shape,
+            "summary_type": test_params.get("summary_type", "auto"),
+            "stream": test_params.get("stream", True),
         },
         "runs": [{"name": test_phase, "users": concurrency, "max-request": num_queries}],
     }
@@ -373,7 +379,9 @@ def run_benchmark(benchmark_config, chart_name, namespace, node_num=1, llm_model
         "user_queries": parsed_data["user_queries"],  # num of user queries
         "random_prompt": False,  # whether to use random prompt, set to False by default
         "run_time": "30m",  # The max total run time for the test suite, set to 60m by default
-        "collect_service_metric": False,  # whether to collect service metrics, set to False by default
+        "collect_service_metric": (
+            parsed_data["collect_service_metric"] if parsed_data["collect_service_metric"] else False
+        ),  # Metrics collection set to False by default
         "llm_model": llm_model,  # The LLM model used for the test
         "deployment_type": "k8s",  # Default is "k8s", can also be "docker"
         "service_ip": None,  # Leave as None for k8s, specify for Docker
@@ -398,9 +406,15 @@
         "dataset": parsed_data["dataset"],
         "prompt": parsed_data["prompt"],
         "llm_max_token_size": parsed_data["llm_max_token_size"],
+        "summary_type": parsed_data["summary_type"],
+        "stream": parsed_data["stream"],
     }
 
-    dataset = None
+    if parsed_data["dataset"]:  # This checks if user provided dataset/document for DocSum service
+        dataset = parsed_data["dataset"]
+    else:
+        dataset = None
+
     query_data = None
     os.environ["MODEL_NAME"] = test_suite_config.get("llm_model", "meta-llama/Meta-Llama-3-8B-Instruct")
     # Do benchmark in for-loop for different llm_max_token_size
@@ -428,6 +442,21 @@
             "max_output": llm_max_token,  # max number of output tokens
             "k": 1,  # number of retrieved documents
         }
+        if chart_name == "docsum":
+            case_data = {
+                "run_test": True,
+                "service_name": "docsum",
+                "service_list": [
+                    "docsum",
+                    "docsum-llm-uservice",
+                    "docsum-vllm",
+                ],
+                "stream": parsed_data["stream"],
+                "max_output": llm_max_token,  # max number of output tokens
+                "summary_type": parsed_data["summary_type"],  # Summary_type for DocSum
+                "dataset": dataset,  # Dataset used for document summary
+            }
+
         output_folder = _run_service_test(chart_name, case_data, test_suite_config, namespace)
 
         print(f"[OPEA BENCHMARK] 🚀 Test Finished. Output saved in {output_folder}.")

deploy.py

Lines changed: 2 additions & 0 deletions
@@ -177,6 +177,8 @@ def configure_models(values, deploy_config):
             values[service_name]["EMBEDDING_MODEL_ID"] = model_id
         elif service_name == "teirerank":
             values[service_name]["RERANK_MODEL_ID"] = model_id
+        elif service_name == "llm-uservice":
+            values[service_name]["LLM_MODEL_ID"] = model_id
 
     return values
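configure_models assigns each service's model_id to the Helm values key its chart reads; the new branch extends that mapping to llm-uservice so the DocSum LLM microservice picks up LLM_MODEL_ID. A condensed sketch of the mapping after this change (illustrative only; the embedding service name is assumed, since its condition line falls outside this hunk):

# Illustrative sketch, not the actual configure_models implementation.
MODEL_ID_KEYS = {
    "tei": "EMBEDDING_MODEL_ID",     # service name assumed; its condition is outside this hunk
    "teirerank": "RERANK_MODEL_ID",
    "llm-uservice": "LLM_MODEL_ID",  # added by this commit
}


def set_model_id(values: dict, service_name: str, model_id: str) -> dict:
    key = MODEL_ID_KEYS.get(service_name)
    if key:
        values.setdefault(service_name, {})[key] = model_id
    return values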
