Commit 5db12a5

Docker compose updated
Signed-off-by: Ezequiel Lanza <ezequiel.lanza@gmail.com>
1 parent 6ebe408 commit 5db12a5

File tree

ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
ChatQnA/tests/test_compose_milvus_on_xeon.sh

2 files changed, 18 insertions(+), 13 deletions(-)

ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml

Lines changed: 13 additions & 8 deletions
@@ -138,22 +138,27 @@ services:
       HF_HUB_ENABLE_HF_TRANSFER: 0
     command: --model-id ${RERANK_MODEL_ID} --auto-truncate

-  tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
-    container_name: tgi-service
+  vllm-service:
+    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+    container_name: vllm-service
     ports:
       - "9009:80"
     volumes:
       - "./data:/data"
-    shm_size: 1g
+    shm_size: 128g
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://$host_ip:9009/health || exit 1"]
+      interval: 10s
+      timeout: 10s
+      retries: 100
+    command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80

   chatqna-xeon-backend-server:
     image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
@@ -164,7 +169,7 @@ services:
       - dataprep-milvus-service
       - retriever
       - tei-reranking-service
-      - tgi-service
+      - vllm-service
     ports:
       - "8888:8888"
     environment:
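
A minimal sketch of how the new healthcheck can be exercised by hand once the stack is up, assuming host_ip and the usual ChatQnA variables (HUGGINGFACEHUB_API_TOKEN, LLM_MODEL_ID, proxies) are already exported per the example's setup instructions; the endpoint and port mapping come straight from the compose change above.

    # Start the Milvus variant of the stack, then poll the same endpoint the
    # compose healthcheck probes: vLLM serves /health on port 80 in the
    # container, published as 9009 on the host.
    docker compose -f compose_milvus.yaml up -d

    until curl -sf "http://${host_ip}:9009/health" > /dev/null; do
      echo "waiting for vllm-service to become healthy..."
      sleep 10
    done
    echo "vllm-service is ready"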

ChatQnA/tests/test_compose_milvus_on_xeon.sh

Lines changed: 5 additions & 5 deletions
@@ -38,7 +38,7 @@ function build_docker_images() {
     cd ../

     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="chatqna chatqna-ui dataprep retriever nginx"
+    service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

     docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
@@ -59,8 +59,8 @@ function start_services() {

     n=0
     until [[ "$n" -ge 100 ]]; do
-        docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log
-        if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+        docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
+        if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
             break
         fi
         sleep 5s
@@ -171,8 +171,8 @@ function validate_microservices() {
     validate_service \
         "${ip_address}:9009/v1/chat/completions" \
         "content" \
-        "tgi-llm" \
-        "tgi-service" \
+        "vllm-llm" \
+        "vllm-service" \
         '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
 }
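
For reference, the LLM check in validate_microservices() can be reproduced by hand; ip_address stands in for the host IP as in the test script, and the URL and payload are exactly what the updated test sends to the OpenAI-compatible vLLM endpoint.

    # Manual version of the vllm-service check: POST the test's payload and
    # look for "content" in the JSON response, as validate_service does.
    curl -sS "http://${ip_address}:9009/v1/chat/completions" \
      -H "Content-Type: application/json" \
      -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'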