
Commit 1a0c5f0

Code Enhancement for vLLM inference (#1729)

Signed-off-by: Yongbozzz <yongbo.zhu@intel.com>

1 parent bbd5344 · commit 1a0c5f0

File tree: 4 files changed, +127 −11 lines changed

EdgeCraftRAG/README.md

Lines changed: 21 additions & 1 deletion
````diff
@@ -17,7 +17,7 @@ quality and performance.
 
 ### (Optional) Build Docker Images for Mega Service, Server and UI by your own
 
-If you want to build the images by your own, please follow the steps:
+**All the Docker images can be pulled automatically.** If you want to build the images on your own, please follow these steps:
 
 ```bash
 cd GenAIExamples/EdgeCraftRAG
@@ -101,6 +101,26 @@ export HUGGINGFACEHUB_API_TOKEN=#your HF token
 docker compose -f compose_vllm.yaml up -d
 ```
 
+#### Launch services with vLLM for inference on multiple Intel Arc GPUs
+
+The Docker image is pulled automatically; you can also pull it manually:
+
+```bash
+docker pull intelanalytics/ipex-llm-serving-xpu:latest
+```
+
+Set up the additional environment variables and start the services with compose_vllm_multi-arc.yaml:
+
+```bash
+export LLM_MODEL=#your model id
+export VLLM_SERVICE_PORT=8008
+export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
+export LLM_MODEL_PATH=#your model path
+export TENSOR_PARALLEL_SIZE=#number of Intel Arc GPUs used for inference
+
+docker compose -f compose_vllm_multi-arc.yaml up -d
+```
+
 ### ChatQnA with LLM Example (Command Line)
 
 ```bash
````
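For illustration, a concrete multi-GPU launch might look like the following sketch. The model ID, model path, and GPU count below are hypothetical placeholders, not values from this commit:

```bash
# Hypothetical values -- substitute your own model ID, path, and GPU count.
export HOST_IP=$(hostname -I | awk '{print $1}')   # host address reachable by the other services
export LLM_MODEL="Qwen/Qwen2-7B-Instruct"          # model name vLLM will serve
export LLM_MODEL_PATH="${HOME}/models"             # host directory mounted into the container
export VLLM_SERVICE_PORT=8008
export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
export TENSOR_PARALLEL_SIZE=2                      # shard the model across 2 Intel Arc GPUs

docker compose -f compose_vllm_multi-arc.yaml up -d
```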
EdgeCraftRAG/compose_vllm_multi-arc.yaml (new file)

Lines changed: 93 additions & 0 deletions
```yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  server:
    image: ${REGISTRY:-opea}/edgecraftrag-server:${TAG:-latest}
    container_name: edgecraftrag-server
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_ENDPOINT: ${HF_ENDPOINT}
      vLLM_ENDPOINT: ${vLLM_ENDPOINT}
      LLM_MODEL: ${LLM_MODEL}
      ENABLE_BENCHMARK: ${ENABLE_BENCHMARK:-false}
    volumes:
      - ${MODEL_PATH:-${PWD}}:/home/user/models
      - ${DOC_PATH:-${PWD}}:/home/user/docs
      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
      - ${HF_CACHE:-${HOME}/.cache}:/home/user/.cache
      - ${PROMPT_PATH:-${PWD}}:/templates/custom
    ports:
      - ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010}
    devices:
      - /dev/dri:/dev/dri
    group_add:
      - ${VIDEOGROUPID:-44}
      - ${RENDERGROUPID:-109}
  ecrag:
    image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest}
    container_name: edgecraftrag
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
    ports:
      - ${MEGA_SERVICE_PORT:-16011}:${MEGA_SERVICE_PORT:-16011}
    depends_on:
      - server
  ui:
    image: ${REGISTRY:-opea}/edgecraftrag-ui:${TAG:-latest}
    container_name: edgecraftrag-ui
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
      UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082}
      UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0}
    volumes:
      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
    ports:
      - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
    restart: always
    depends_on:
      - server
      - ecrag
  llm-serving-xpu:
    container_name: ipex-llm-serving-xpu-container
    image: intelanalytics/ipex-llm-serving-xpu:latest
    privileged: true
    ports:
      - ${VLLM_SERVICE_PORT:-8008}:8000
    group_add:
      - video
      - ${VIDEOGROUPID:-44}
      - ${RENDERGROUPID:-109}
    volumes:
      - ${LLM_MODEL_PATH:-${PWD}}:/llm/models
    devices:
      - /dev/dri
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_ENDPOINT: ${HF_ENDPOINT}
      MODEL_PATH: "/llm/models"
      SERVED_MODEL_NAME: ${LLM_MODEL}
      TENSOR_PARALLEL_SIZE: ${TENSOR_PARALLEL_SIZE:-1}
    shm_size: '16g'
    entrypoint: /bin/bash -c "\
      cd /llm && \
      bash start-vllm-service.sh"
networks:
  default:
    driver: bridge
```
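Once the stack is up, one way to sanity-check the serving container is to hit its endpoint directly. This is a sketch, assuming vLLM's OpenAI-compatible API; the compose file maps host port ${VLLM_SERVICE_PORT:-8008} to container port 8000:

```bash
# List served models; the response should include ${LLM_MODEL}.
curl -s "http://${HOST_IP}:${VLLM_SERVICE_PORT:-8008}/v1/models"

# Minimal chat completion request against the OpenAI-compatible API.
curl -s "http://${HOST_IP}:${VLLM_SERVICE_PORT:-8008}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
        \"model\": \"${LLM_MODEL}\",
        \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}],
        \"max_tokens\": 32
      }"
```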

EdgeCraftRAG/ui/gradio/ecrag_client.py

Lines changed: 10 additions & 7 deletions
```diff
@@ -156,13 +156,16 @@ def get_benchmark(name):
 
     if data.get("Benchmark enabled", False):
         benchmark_data = data.get("last_benchmark_data", {})
-        if benchmark_data.get("generator", "N/A"):
-            benchmark = (
-                f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
-                f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
-                f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
-            ).rstrip()
-            return benchmark
+        if benchmark_data and "generator" in benchmark_data:
+            if benchmark_data.get("generator", "N/A"):
+                benchmark = (
+                    f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
+                    f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
+                    f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
+                ).rstrip()
+                return benchmark
+            else:
+                return None
         else:
             return None
     else:
```

EdgeCraftRAG/ui/gradio/ecragui.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -85,9 +85,9 @@ def get_system_status():
 
 def get_benchmark():
     time.sleep(0.5)
-    active_pipeline_nam = get_actived_pipeline()
-    if active_pipeline_nam:
-        data = cli.get_benchmark(active_pipeline_nam)
+    active_pipeline_name = get_actived_pipeline()
+    if active_pipeline_name:
+        data = cli.get_benchmark(active_pipeline_name)
         if data:
             return gr.update(
                 visible=True,
```
