From cf60682c8290f1191c5d3e4609a8ad3b8d1b162a Mon Sep 17 00:00:00 2001
From: Chingis Yundunov
Date: Thu, 13 Feb 2025 10:02:03 +0700
Subject: [PATCH 01/22] DocSum - add files for deploy app with ROCm vLLM

Signed-off-by: Chingis Yundunov
---
 DocSum/Dockerfile-vllm-rocm                   |  18 ++
 .../amd/gpu/rocm-vllm/README.md               | 175 ++++++++++++
 .../amd/gpu/rocm-vllm/compose.yaml            | 107 ++++++++
 .../amd/gpu/rocm-vllm/set_env.sh              |  16 ++
 DocSum/docker_image_build/build.yaml          |   9 +
 DocSum/tests/test_compose_on_rocm_vllm.sh     | 249 ++++++++++++++++++
 6 files changed, 574 insertions(+)
 create mode 100644 DocSum/Dockerfile-vllm-rocm
 create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
 create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
 create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
 create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh

diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm
new file mode 100644
index 0000000000..f0e8a8743a
--- /dev/null
+++ b/DocSum/Dockerfile-vllm-rocm
@@ -0,0 +1,18 @@
+FROM rocm/vllm-dev:main
+
+# Set the working directory
+WORKDIR /workspace
+
+# Copy the api_server.py into the image
+ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py
+
+# Expose the port used by the API server
+EXPOSE 8011
+
+# Set environment variables
+ENV HUGGINGFACE_HUB_CACHE=/workspace
+ENV VLLM_USE_TRITON_FLASH_ATTENTION=0
+ENV PYTORCH_JIT=0
+
+# Set the entrypoint to the api_server.py script
+ENTRYPOINT ["python3", "/workspace/api_server.py"]
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
new file mode 100644
index 0000000000..4d41a5cd31
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md
@@ -0,0 +1,175 @@
+# Build and deploy the DocSum Application on AMD GPU (ROCm) with vLLM
+
+## 🚀 Build Docker Images
+
+First, build the required Docker images locally.
+
+### 1. Build LLM Image
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
+```
+
+After the build completes, running `docker images` should list the new image.
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image with the command below:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/DocSum/
+docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 3. Build UI Docker Image
+
+Build the frontend Docker image with the command below:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+```
+
+Then run `docker images`; you should see the following Docker images:
+
+1. `opea/llm-docsum:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+
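+In addition to the images above, this vLLM deployment pulls `opea/llm-vllm-rocm:latest` for the `docsum-vllm-service` container. A minimal build sketch, assuming the `Dockerfile-vllm-rocm` added at the root of the DocSum example (the image tag mirrors `docker_image_build/build.yaml`):
+
+```bash
+cd GenAIExamples/DocSum
+# Build the ROCm vLLM serving image referenced by compose.yaml
+docker build -t opea/llm-vllm-rocm:latest \
+  --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \
+  -f Dockerfile-vllm-rocm .
+```
+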
+### 4. Build React UI Docker Image
+
+Build the React frontend Docker image with the command below:
+
+```bash
+cd GenAIExamples/DocSum/ui
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+Then run `docker images`; you should see the following Docker images:
+
+1. `opea/llm-docsum:latest`
+2. `opea/docsum:latest`
+3. `opea/docsum-ui:latest`
+4. `opea/docsum-react-ui:latest`
+
+## 🚀 Start Microservices and MegaService
+
+### Required Models
+
+The default model is "Intel/neural-chat-7b-v3-3". Change "DOCSUM_LLM_MODEL_ID" in the environment variables below if you want to use another model.
+For gated models, you also need to provide a [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in the "DOCSUM_HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variables
+
+Since `compose.yaml` consumes several environment variables, set them up in advance as shown below (or source the provided `set_env.sh`).
+
+```bash
+export HOST_IP=${host_ip}
+export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export DOCSUM_VLLM_SERVICE_PORT="8008"
+export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export DOCSUM_LLM_SERVER_PORT="9000"
+export DOCSUM_WHISPER_PORT="7066"
+export DOCSUM_MAX_INPUT_TOKENS=2048
+export DOCSUM_MAX_TOTAL_TOKENS=4096
+export DOCSUM_BACKEND_SERVER_PORT="8888"
+export DOCSUM_FRONTEND_PORT="5173"
+export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
+```
+
+Note: Please replace `host_ip` with your external IP address; do not use localhost.
+
+Note: To limit access to a subset of GPUs, pass each device individually using one or more `--device /dev/dri/renderD<node>` entries, where `<node>` is the render node index, starting from 128 (see https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus).
+
+Example of device isolation for 1 GPU (entries for the `devices` section of the vLLM service in `compose.yaml`):
+
+```
+  - /dev/dri/card0:/dev/dri/card0
+  - /dev/dri/renderD128:/dev/dri/renderD128
+```
+
+Example of device isolation for 2 GPUs:
+
+```
+  - /dev/dri/card0:/dev/dri/card0
+  - /dev/dri/renderD128:/dev/dri/renderD128
+  - /dev/dri/card1:/dev/dri/card1
+  - /dev/dri/renderD129:/dev/dri/renderD129
+```
+
+More information about accessing and restricting AMD GPUs can be found in the [ROCm documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus).
+
+### Start Microservice Docker Containers
+
+```bash
+cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm-vllm
+docker compose up -d
+```
+
+After startup, `docker compose ps` should show all containers running; the vLLM container may need a few minutes to download and load the model before it serves requests.
+
+### Validate Microservices
+
+1. vLLM Service
+
+   ```bash
+   curl http://${host_ip}:8008/v1/chat/completions \
+     -X POST \
+     -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 64}' \
+     -H 'Content-Type: application/json'
+   ```
+
+2. LLM Microservice
+
+   ```bash
+   curl http://${host_ip}:9000/v1/docsum \
+     -X POST \
+     -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+     -H 'Content-Type: application/json'
+   ```
+
+3. MegaService
+
+   ```bash
+   curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{
+     "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false
+   }'
+   ```
+
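+The MegaService also accepts `multipart/form-data` requests; this is the form exercised by the test script `DocSum/tests/test_compose_on_rocm_vllm.sh` added in this patch. A sketch of an equivalent text summarization request (curl adds the multipart Content-Type header itself):
+
+```bash
+curl http://${host_ip}:8888/v1/docsum \
+  -X POST \
+  -F "type=text" \
+  -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models."
+```
+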
+## 🚀 Launch the UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+
+![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b)
+
+Here is an example of summarizing an article.
+
+![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/67ecb2ec-408d-4e81-b124-6ded6b833f55)
+
+## 🚀 Launch the React UI (Optional)
+
+To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace the `docsum-gradio-ui` service with the `docsum-rocm-react-ui-server` service as per the config below:
+
+```yaml
+docsum-rocm-react-ui-server:
+  image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+  container_name: docsum-rocm-react-ui-server
+  depends_on:
+    - docsum-backend-server
+  ports:
+    - "5174:80"
+  environment:
+    - no_proxy=${no_proxy}
+    - https_proxy=${https_proxy}
+    - http_proxy=${http_proxy}
+    - DOC_BASE_URL=${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+```
+
+Open this URL `http://{host_ip}:5174` in your browser to access the frontend.
+
+![project-screenshot](../../../../assets/img/docsum-ui-react.png)
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
new file mode 100644
index 0000000000..037aa06395
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml
@@ -0,0 +1,107 @@
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
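+#
+# DocSum on AMD GPU (ROCm) served by vLLM. A typical invocation, assuming the
+# layout of this example (set_env.sh exports HOST_IP and the DOCSUM_* variables
+# consumed below):
+#   cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm-vllm
+#   source set_env.sh
+#   docker compose up -d
+#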
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  docsum-vllm-service:
+    image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest}
+    container_name: docsum-vllm-service
+    ports:
+      - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011"
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+      HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+      HF_HUB_DISABLE_PROGRESS_BARS: 1
+      HF_HUB_ENABLE_HF_TRANSFER: 0
+      VLLM_USE_TRITON_FLASH_ATTENTION: 0
+      PYTORCH_JIT: 0
+    volumes:
+      - "./data:/data"
+    shm_size: 20G
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri/:/dev/dri/
+    cap_add:
+      - SYS_PTRACE
+    group_add:
+      - video
+    security_opt:
+      - seccomp:unconfined
+      - apparmor=unconfined
+    command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
+    ipc: host
+
+  docsum-llm-server:
+    image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+    container_name: docsum-llm-server
+    depends_on:
+      - docsum-vllm-service
+    ports:
+      - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000"
+    ipc: host
+    cap_add:
+      - SYS_PTRACE
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}"
+      HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+      HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
+      LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
+      LOGFLAG: ${DOCSUM_LOGFLAG:-False}
+      MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS}
+      MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS}
+    restart: unless-stopped
+
+  whisper-service:
+    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+    container_name: whisper-service
+    ports:
+      - "${DOCSUM_WHISPER_PORT:-7066}:7066"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+    restart: unless-stopped
+
+  docsum-backend-server:
+    image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+    container_name: docsum-backend-server
+    depends_on:
+      - docsum-vllm-service
+      - docsum-llm-server
+    ports:
+      - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888"
+    environment:
+      no_proxy: ${no_proxy}
+      https_proxy: ${https_proxy}
+      http_proxy: ${http_proxy}
+      MEGA_SERVICE_HOST_IP: ${HOST_IP}
+      LLM_SERVICE_HOST_IP: ${HOST_IP}
+      ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
+    ipc: host
+    restart: always
+
+  docsum-gradio-ui:
+    image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+    container_name: docsum-ui-server
+    depends_on:
+      - docsum-backend-server
+    ports:
+      - "${DOCSUM_FRONTEND_PORT:-5173}:5173"
+    environment:
+      no_proxy: ${no_proxy}
+      https_proxy: ${https_proxy}
+      http_proxy: ${http_proxy}
+      BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+      DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT}
+    ipc: host
+    restart: always
+
+networks:
+  default:
+    driver: bridge
diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
new file mode 100644
index 0000000000..43e71e0fbf
--- /dev/null
+++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0 + +export HOST_IP="" +export DOCSUM_MAX_INPUT_TOKENS=2048 +export DOCSUM_MAX_TOTAL_TOKENS=4096 +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export DOCSUM_VLLM_SERVICE_PORT="8008" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN="" +export DOCSUM_LLM_SERVER_PORT="9000" +export DOCSUM_WHISPER_PORT="7066" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml index 095fd28c93..dc0d546189 100644 --- a/DocSum/docker_image_build/build.yaml +++ b/DocSum/docker_image_build/build.yaml @@ -47,3 +47,12 @@ services: dockerfile: comps/llms/src/doc-summarization/Dockerfile extends: docsum image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} + vllm_rocm: + build: + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + context: ../ + dockerfile: ./Dockerfile-vllm-rocm + image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh new file mode 100644 index 0000000000..d0919a019a --- /dev/null +++ b/DocSum/tests/test_compose_on_rocm_vllm.sh @@ -0,0 +1,249 @@ +#!/bin/bash +# Copyright (C) 2024 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 + +set -xe +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +export MAX_INPUT_TOKENS=1024 +export MAX_TOTAL_TOKENS=2048 +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export HOST_IP=${ip_address} +export DOCSUM_VLLM_SERVICE_PORT="8008" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export DOCSUM_LLM_SERVER_PORT="9000" +export DOCSUM_WHISPER_PORT="7066" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export MEGA_SERVICE_HOST_IP=${HOST_IP} +export LLM_SERVICE_HOST_IP=${HOST_IP} +export ASR_SERVICE_HOST_IP=${HOST_IP} +export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" + +function build_docker_images() { + opea_branch=${opea_branch:-"main"} + # If the opea_branch isn't main, replace the git clone branch in Dockerfile. + if [[ "${opea_branch}" != "main" ]]; then + cd $WORKPATH + OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" + NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" + find . -type f -name "Dockerfile*" | while read -r file; do + echo "Processing file: $file" + sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file" + done + fi + + cd $WORKPATH/docker_image_build + git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git + + echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper" + docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log + + docker images && sleep 1s +} + +function start_services() { + cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm + sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env + # Start Docker Containers + docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log + sleep 1m +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "EXPECTED_RESULT==> $EXPECTED_RESULT" + echo "CONTENT==> $CONTENT" + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +get_base64_str() { + local file_name=$1 + base64 -w 0 "$file_name" +} + +# Function to generate input data for testing based on the document type +input_data_for_test() { + local document_type=$1 + case $document_type in + ("text") + echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." + ;; + ("audio") + get_base64_str "$WORKPATH/tests/data/test.wav" + ;; + ("video") + get_base64_str "$WORKPATH/tests/data/test.mp4" + ;; + (*) + echo "Invalid document type" >&2 + exit 1 + ;; + esac +} + +function validate_microservices() { + # Check if the microservices are running correctly. 
+ + # whisper microservice + ulimit -s 65536 + validate_services \ + "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \ + '{"asr_result":"well"}' \ + "whisper-service" \ + "whisper-service" \ + "{\"audio\": \"$(input_data_for_test "audio")\"}" + + # vLLM service + validate_services \ + "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \ + "generated_text" \ + "docsum-vllm-service" \ + "docsum-vllm-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + + # llm microservice + validate_services \ + "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \ + "text" \ + "docsum-llm-server" \ + "docsum-llm-server" \ + '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + +} + +function validate_megaservice() { + local SERVICE_NAME="docsum-backend-server" + local DOCKER_NAME="docsum-backend-server" + local EXPECTED_RESULT="[DONE]" + local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." + local URL="${host_ip}:8888/v1/docsum" + local DATA_TYPE="type=text" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_megaservice_json() { + # Curl the Mega Service + echo "" + echo ">>> Checking text data with Content-Type: application/json" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + + echo ">>> Checking audio data" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}" + + echo ">>> Checking video data" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}" + +} + +function stop_docker() { + cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/ + docker compose stop && docker compose rm -f +} + +function main() { + echo "===========================================" + echo ">>>> Stopping any running Docker containers..." + stop_docker + + echo "===========================================" + if [[ "$IMAGE_REPO" == "opea" ]]; then + echo ">>>> Building Docker images..." + build_docker_images + fi + + echo "===========================================" + echo ">>>> Starting Docker services..." + start_services + + echo "===========================================" + echo ">>>> Validating microservices..." + validate_microservices + + echo "===========================================" + echo ">>>> Validating megaservice..." + validate_megaservice + echo ">>>> Validating validate_megaservice_json..." + validate_megaservice_json + + echo "===========================================" + echo ">>>> Stopping Docker containers..." + stop_docker + + echo "===========================================" + echo ">>>> Pruning Docker system..." + echo y | docker system prune + echo ">>>> Docker system pruned successfully." 
+ echo "===========================================" +} + +main From 1fd1de1530328321d28aa6d9db85fffeb876574c Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 13 Feb 2025 10:07:05 +0700 Subject: [PATCH 02/22] DocSum - fix main Signed-off-by: Chingis Yundunov --- DocSum/Dockerfile-vllm-rocm | 18 -- .../amd/gpu/rocm-vllm/README.md | 175 ------------ .../amd/gpu/rocm-vllm/compose.yaml | 107 -------- .../amd/gpu/rocm-vllm/set_env.sh | 16 -- DocSum/docker_image_build/build.yaml | 9 - DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------ 6 files changed, 574 deletions(-) delete mode 100644 DocSum/Dockerfile-vllm-rocm delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm deleted file mode 100644 index f0e8a8743a..0000000000 --- a/DocSum/Dockerfile-vllm-rocm +++ /dev/null @@ -1,18 +0,0 @@ -FROM rocm/vllm-dev:main - -# Set the working directory -WORKDIR /workspace - -# Copy the api_server.py into the image -ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py - -# Expose the port used by the API server -EXPOSE 8011 - -# Set environment variables -ENV HUGGINGFACE_HUB_CACHE=/workspace -ENV WILM_USE_TRITON_FLASH_ATTENTION=0 -ENV PYTORCH_JIT=0 - -# Set the entrypoint to the api_server.py script -ENTRYPOINT ["python3", "/workspace/api_server.py"] diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md deleted file mode 100644 index 4d41a5cd31..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md +++ /dev/null @@ -1,175 +0,0 @@ -# Build and deploy DocSum Application on AMD GPU (ROCm) - -## Build images - -## 🚀 Build Docker Images - -First of all, you need to build Docker Images locally and install the python package of it. - -### 1. Build LLM Image - -```bash -git clone https://github.com/opea-project/GenAIComps.git -cd GenAIComps -docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . -``` - -Then run the command `docker images`, you will have the following four Docker Images: - -### 2. Build MegaService Docker Image - -To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command: - -```bash -git clone https://github.com/opea-project/GenAIExamples -cd GenAIExamples/DocSum/ -docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . -``` - -### 3. Build UI Docker Image - -Build the frontend Docker image via below command: - -```bash -cd GenAIExamples/DocSum/ui -docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . -``` - -Then run the command `docker images`, you will have the following Docker Images: - -1. `opea/llm-docsum-tgi:latest` -2. `opea/docsum:latest` -3. `opea/docsum-ui:latest` - -### 4. 
Build React UI Docker Image - -Build the frontend Docker image via below command: - -```bash -cd GenAIExamples/DocSum/ui -export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum" -docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react . - -docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . -``` - -Then run the command `docker images`, you will have the following Docker Images: - -1. `opea/llm-docsum-tgi:latest` -2. `opea/docsum:latest` -3. `opea/docsum-ui:latest` -4. `opea/docsum-react-ui:latest` - -## 🚀 Start Microservices and MegaService - -### Required Models - -Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model. -For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable. - -### Setup Environment Variables - -Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below. - -```bash -export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export HOST_IP=${host_ip} -export DOCSUM_TGI_SERVICE_PORT="18882" -export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} -export DOCSUM_LLM_SERVER_PORT="8008" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export DocSum_COMPONENT_NAME="OpeaDocSumTgi" -``` - -Note: Please replace with `host_ip` with your external IP address, do not use localhost. - -Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) - -Example for set isolation for 1 GPU - -``` - - /dev/dri/card0:/dev/dri/card0 - - /dev/dri/renderD128:/dev/dri/renderD128 -``` - -Example for set isolation for 2 GPUs - -``` - - /dev/dri/card0:/dev/dri/card0 - - /dev/dri/renderD128:/dev/dri/renderD128 - - /dev/dri/card1:/dev/dri/card1 - - /dev/dri/renderD129:/dev/dri/renderD129 -``` - -Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) - -### Start Microservice Docker Containers - -```bash -cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm -docker compose up -d -``` - -### Validate Microservices - -1. TGI Service - - ```bash - curl http://${host_ip}:8008/generate \ - -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ - -H 'Content-Type: application/json' - ``` - -2. LLM Microservice - - ```bash - curl http://${host_ip}:9000/v1/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ - -H 'Content-Type: application/json' - ``` - -3. 
MegaService - - ```bash - curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{ - "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false - }' - ``` - -## 🚀 Launch the Svelte UI - -Open this URL `http://{host_ip}:5173` in your browser to access the frontend. - -![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b) - -Here is an example for summarizing a article. - -![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/67ecb2ec-408d-4e81-b124-6ded6b833f55) - -## 🚀 Launch the React UI (Optional) - -To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below: - -```yaml -docsum-rocm-react-ui-server: - image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest} - container_name: docsum-rocm-react-ui-server - depends_on: - - docsum-rocm-backend-server - ports: - - "5174:80" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} -``` - -Open this URL `http://{host_ip}:5175` in your browser to access the frontend. - -![project-screenshot](../../../../assets/img/docsum-ui-react.png) diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml deleted file mode 100644 index 037aa06395..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (C) 2024 Advanced Micro Devices, Inc. 
-# SPDX-License-Identifier: Apache-2.0 - -services: - docsum-vllm-service: - image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} - container_name: docsum-vllm-service - ports: - - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - WILM_USE_TRITON_FLASH_ATTENTION: 0 - PYTORCH_JIT: 0 - volumes: - - "./data:/data" - shm_size: 20G - devices: - - /dev/kfd:/dev/kfd - - /dev/dri/:/dev/dri/ - cap_add: - - SYS_PTRACE - group_add: - - video - security_opt: - - seccomp:unconfined - - apparmor=unconfined - command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\"" - ipc: host - - docsum-llm-server: - image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} - container_name: docsum-llm-server - depends_on: - - docsum-vllm-service - ports: - - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000" - ipc: host - cap_add: - - SYS_PTRACE - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}" - HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID} - LOGFLAG: ${DOCSUM_LOGFLAG:-False} - MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS} - MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS} - restart: unless-stopped - - whisper-service: - image: ${REGISTRY:-opea}/whisper:${TAG:-latest} - container_name: whisper-service - ports: - - "${DOCSUM_WHISPER_PORT:-7066}:7066" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - restart: unless-stopped - - docsum-backend-server: - image: ${REGISTRY:-opea}/docsum:${TAG:-latest} - container_name: docsum-backend-server - depends_on: - - docsum-tgi-service - - docsum-llm-server - ports: - - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888" - environment: - no_proxy: ${no_proxy} - https_proxy: ${https_proxy} - http_proxy: ${http_proxy} - MEGA_SERVICE_HOST_IP: ${HOST_IP} - LLM_SERVICE_HOST_IP: ${HOST_IP} - ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP} - ipc: host - restart: always - - docsum-gradio-ui: - image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} - container_name: docsum-ui-server - depends_on: - - docsum-backend-server - ports: - - "${DOCSUM_FRONTEND_PORT:-5173}:5173" - environment: - no_proxy: ${no_proxy} - https_proxy: ${https_proxy} - http_proxy: ${http_proxy} - BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} - DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} - ipc: host - restart: always - -networks: - default: - driver: bridge diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh deleted file mode 100644 index 43e71e0fbf..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Advanced Micro Devices, Inc. 
-# SPDX-License-Identifier: Apache-2.0 - -export HOST_IP="" -export DOCSUM_MAX_INPUT_TOKENS=2048 -export DOCSUM_MAX_TOTAL_TOKENS=4096 -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export DOCSUM_VLLM_SERVICE_PORT="8008" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN="" -export DOCSUM_LLM_SERVER_PORT="9000" -export DOCSUM_WHISPER_PORT="7066" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml index dc0d546189..095fd28c93 100644 --- a/DocSum/docker_image_build/build.yaml +++ b/DocSum/docker_image_build/build.yaml @@ -47,12 +47,3 @@ services: dockerfile: comps/llms/src/doc-summarization/Dockerfile extends: docsum image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} - vllm_rocm: - build: - args: - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - no_proxy: ${no_proxy} - context: ../ - dockerfile: ./Dockerfile-vllm-rocm - image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh deleted file mode 100644 index d0919a019a..0000000000 --- a/DocSum/tests/test_compose_on_rocm_vllm.sh +++ /dev/null @@ -1,249 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Advanced Micro Devices, Inc. -# SPDX-License-Identifier: Apache-2.0 - -set -xe -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') -export MAX_INPUT_TOKENS=1024 -export MAX_TOTAL_TOKENS=2048 -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export HOST_IP=${ip_address} -export DOCSUM_VLLM_SERVICE_PORT="8008" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -export DOCSUM_LLM_SERVER_PORT="9000" -export DOCSUM_WHISPER_PORT="7066" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export MEGA_SERVICE_HOST_IP=${HOST_IP} -export LLM_SERVICE_HOST_IP=${HOST_IP} -export ASR_SERVICE_HOST_IP=${HOST_IP} -export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" - -function build_docker_images() { - opea_branch=${opea_branch:-"main"} - # If the opea_branch isn't main, replace the git clone branch in Dockerfile. - if [[ "${opea_branch}" != "main" ]]; then - cd $WORKPATH - OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" - NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" - find . -type f -name "Dockerfile*" | while read -r file; do - echo "Processing file: $file" - sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file" - done - fi - - cd $WORKPATH/docker_image_build - git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git - - echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker images && sleep 1s -} - -function start_services() { - cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm - sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env - # Start Docker Containers - docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log - sleep 1m -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - - echo "===========================================" - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "EXPECTED_RESULT==> $EXPECTED_RESULT" - echo "CONTENT==> $CONTENT" - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -get_base64_str() { - local file_name=$1 - base64 -w 0 "$file_name" -} - -# Function to generate input data for testing based on the document type -input_data_for_test() { - local document_type=$1 - case $document_type in - ("text") - echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." - ;; - ("audio") - get_base64_str "$WORKPATH/tests/data/test.wav" - ;; - ("video") - get_base64_str "$WORKPATH/tests/data/test.mp4" - ;; - (*) - echo "Invalid document type" >&2 - exit 1 - ;; - esac -} - -function validate_microservices() { - # Check if the microservices are running correctly. 
- - # whisper microservice - ulimit -s 65536 - validate_services \ - "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \ - '{"asr_result":"well"}' \ - "whisper-service" \ - "whisper-service" \ - "{\"audio\": \"$(input_data_for_test "audio")\"}" - - # vLLM service - validate_services \ - "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \ - "generated_text" \ - "docsum-vllm-service" \ - "docsum-vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' - - # llm microservice - validate_services \ - "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \ - "text" \ - "docsum-llm-server" \ - "docsum-llm-server" \ - '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' - -} - -function validate_megaservice() { - local SERVICE_NAME="docsum-backend-server" - local DOCKER_NAME="docsum-backend-server" - local EXPECTED_RESULT="[DONE]" - local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." - local URL="${host_ip}:8888/v1/docsum" - local DATA_TYPE="type=text" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_megaservice_json() { - # Curl the Mega Service - echo "" - echo ">>> Checking text data with Content-Type: application/json" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' - - echo ">>> Checking audio data" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}" - - echo ">>> Checking video data" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}" - -} - -function stop_docker() { - cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/ - docker compose stop && docker compose rm -f -} - -function main() { - echo "===========================================" - echo ">>>> Stopping any running Docker containers..." - stop_docker - - echo "===========================================" - if [[ "$IMAGE_REPO" == "opea" ]]; then - echo ">>>> Building Docker images..." - build_docker_images - fi - - echo "===========================================" - echo ">>>> Starting Docker services..." - start_services - - echo "===========================================" - echo ">>>> Validating microservices..." - validate_microservices - - echo "===========================================" - echo ">>>> Validating megaservice..." - validate_megaservice - echo ">>>> Validating validate_megaservice_json..." - validate_megaservice_json - - echo "===========================================" - echo ">>>> Stopping Docker containers..." - stop_docker - - echo "===========================================" - echo ">>>> Pruning Docker system..." - echo y | docker system prune - echo ">>>> Docker system pruned successfully." 
- echo "===========================================" -} - -main From bd2d47e7e53e1241c27aed0f823fa680d8ecf4e2 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 13 Feb 2025 10:02:03 +0700 Subject: [PATCH 03/22] DocSum - add files for deploy app with ROCm vLLM Signed-off-by: Chingis Yundunov --- DocSum/Dockerfile-vllm-rocm | 18 ++ .../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++ .../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++ .../amd/gpu/rocm-vllm/set_env.sh | 16 ++ DocSum/docker_image_build/build.yaml | 9 + DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++ 6 files changed, 574 insertions(+) create mode 100644 DocSum/Dockerfile-vllm-rocm create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm new file mode 100644 index 0000000000..f0e8a8743a --- /dev/null +++ b/DocSum/Dockerfile-vllm-rocm @@ -0,0 +1,18 @@ +FROM rocm/vllm-dev:main + +# Set the working directory +WORKDIR /workspace + +# Copy the api_server.py into the image +ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py + +# Expose the port used by the API server +EXPOSE 8011 + +# Set environment variables +ENV HUGGINGFACE_HUB_CACHE=/workspace +ENV WILM_USE_TRITON_FLASH_ATTENTION=0 +ENV PYTORCH_JIT=0 + +# Set the entrypoint to the api_server.py script +ENTRYPOINT ["python3", "/workspace/api_server.py"] diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md new file mode 100644 index 0000000000..4d41a5cd31 --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md @@ -0,0 +1,175 @@ +# Build and deploy DocSum Application on AMD GPU (ROCm) + +## Build images + +## 🚀 Build Docker Images + +First of all, you need to build Docker Images locally and install the python package of it. + +### 1. Build LLM Image + +```bash +git clone https://github.com/opea-project/GenAIComps.git +cd GenAIComps +docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . +``` + +Then run the command `docker images`, you will have the following four Docker Images: + +### 2. Build MegaService Docker Image + +To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command: + +```bash +git clone https://github.com/opea-project/GenAIExamples +cd GenAIExamples/DocSum/ +docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . +``` + +### 3. Build UI Docker Image + +Build the frontend Docker image via below command: + +```bash +cd GenAIExamples/DocSum/ui +docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . +``` + +Then run the command `docker images`, you will have the following Docker Images: + +1. `opea/llm-docsum-tgi:latest` +2. `opea/docsum:latest` +3. `opea/docsum-ui:latest` + +### 4. 
Build React UI Docker Image + +Build the frontend Docker image via below command: + +```bash +cd GenAIExamples/DocSum/ui +export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum" +docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react . + +docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . +``` + +Then run the command `docker images`, you will have the following Docker Images: + +1. `opea/llm-docsum-tgi:latest` +2. `opea/docsum:latest` +3. `opea/docsum-ui:latest` +4. `opea/docsum-react-ui:latest` + +## 🚀 Start Microservices and MegaService + +### Required Models + +Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model. +For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable. + +### Setup Environment Variables + +Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below. + +```bash +export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export HOST_IP=${host_ip} +export DOCSUM_TGI_SERVICE_PORT="18882" +export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +export DOCSUM_LLM_SERVER_PORT="8008" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export DocSum_COMPONENT_NAME="OpeaDocSumTgi" +``` + +Note: Please replace with `host_ip` with your external IP address, do not use localhost. + +Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) + +Example for set isolation for 1 GPU + +``` + - /dev/dri/card0:/dev/dri/card0 + - /dev/dri/renderD128:/dev/dri/renderD128 +``` + +Example for set isolation for 2 GPUs + +``` + - /dev/dri/card0:/dev/dri/card0 + - /dev/dri/renderD128:/dev/dri/renderD128 + - /dev/dri/card1:/dev/dri/card1 + - /dev/dri/renderD129:/dev/dri/renderD129 +``` + +Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) + +### Start Microservice Docker Containers + +```bash +cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm +docker compose up -d +``` + +### Validate Microservices + +1. TGI Service + + ```bash + curl http://${host_ip}:8008/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ + -H 'Content-Type: application/json' + ``` + +2. LLM Microservice + + ```bash + curl http://${host_ip}:9000/v1/docsum \ + -X POST \ + -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ + -H 'Content-Type: application/json' + ``` + +3. 
MegaService + + ```bash + curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{ + "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false + }' + ``` + +## 🚀 Launch the Svelte UI + +Open this URL `http://{host_ip}:5173` in your browser to access the frontend. + +![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b) + +Here is an example for summarizing a article. + +![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/67ecb2ec-408d-4e81-b124-6ded6b833f55) + +## 🚀 Launch the React UI (Optional) + +To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below: + +```yaml +docsum-rocm-react-ui-server: + image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest} + container_name: docsum-rocm-react-ui-server + depends_on: + - docsum-rocm-backend-server + ports: + - "5174:80" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} +``` + +Open this URL `http://{host_ip}:5175` in your browser to access the frontend. + +![project-screenshot](../../../../assets/img/docsum-ui-react.png) diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml new file mode 100644 index 0000000000..037aa06395 --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml @@ -0,0 +1,107 @@ +# Copyright (C) 2024 Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +services: + docsum-vllm-service: + image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} + container_name: docsum-vllm-service + ports: + - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + WILM_USE_TRITON_FLASH_ATTENTION: 0 + PYTORCH_JIT: 0 + volumes: + - "./data:/data" + shm_size: 20G + devices: + - /dev/kfd:/dev/kfd + - /dev/dri/:/dev/dri/ + cap_add: + - SYS_PTRACE + group_add: + - video + security_opt: + - seccomp:unconfined + - apparmor=unconfined + command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\"" + ipc: host + + docsum-llm-server: + image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} + container_name: docsum-llm-server + depends_on: + - docsum-vllm-service + ports: + - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000" + ipc: host + cap_add: + - SYS_PTRACE + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}" + HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID} + LOGFLAG: ${DOCSUM_LOGFLAG:-False} + MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS} + restart: unless-stopped + + whisper-service: + image: ${REGISTRY:-opea}/whisper:${TAG:-latest} + container_name: whisper-service + ports: + - "${DOCSUM_WHISPER_PORT:-7066}:7066" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + + docsum-backend-server: + image: ${REGISTRY:-opea}/docsum:${TAG:-latest} + container_name: docsum-backend-server + depends_on: + - docsum-tgi-service + - docsum-llm-server + ports: + - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888" + environment: + no_proxy: ${no_proxy} + https_proxy: ${https_proxy} + http_proxy: ${http_proxy} + MEGA_SERVICE_HOST_IP: ${HOST_IP} + LLM_SERVICE_HOST_IP: ${HOST_IP} + ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP} + ipc: host + restart: always + + docsum-gradio-ui: + image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} + container_name: docsum-ui-server + depends_on: + - docsum-backend-server + ports: + - "${DOCSUM_FRONTEND_PORT:-5173}:5173" + environment: + no_proxy: ${no_proxy} + https_proxy: ${https_proxy} + http_proxy: ${http_proxy} + BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} + DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} + ipc: host + restart: always + +networks: + default: + driver: bridge diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh new file mode 100644 index 0000000000..43e71e0fbf --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +export HOST_IP="" +export DOCSUM_MAX_INPUT_TOKENS=2048 +export DOCSUM_MAX_TOTAL_TOKENS=4096 +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export DOCSUM_VLLM_SERVICE_PORT="8008" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN="" +export DOCSUM_LLM_SERVER_PORT="9000" +export DOCSUM_WHISPER_PORT="7066" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml index 095fd28c93..dc0d546189 100644 --- a/DocSum/docker_image_build/build.yaml +++ b/DocSum/docker_image_build/build.yaml @@ -47,3 +47,12 @@ services: dockerfile: comps/llms/src/doc-summarization/Dockerfile extends: docsum image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} + vllm_rocm: + build: + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + context: ../ + dockerfile: ./Dockerfile-vllm-rocm + image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh new file mode 100644 index 0000000000..d0919a019a --- /dev/null +++ b/DocSum/tests/test_compose_on_rocm_vllm.sh @@ -0,0 +1,249 @@ +#!/bin/bash +# Copyright (C) 2024 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 + +set -xe +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +export MAX_INPUT_TOKENS=1024 +export MAX_TOTAL_TOKENS=2048 +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export HOST_IP=${ip_address} +export DOCSUM_VLLM_SERVICE_PORT="8008" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export DOCSUM_LLM_SERVER_PORT="9000" +export DOCSUM_WHISPER_PORT="7066" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export MEGA_SERVICE_HOST_IP=${HOST_IP} +export LLM_SERVICE_HOST_IP=${HOST_IP} +export ASR_SERVICE_HOST_IP=${HOST_IP} +export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" + +function build_docker_images() { + opea_branch=${opea_branch:-"main"} + # If the opea_branch isn't main, replace the git clone branch in Dockerfile. + if [[ "${opea_branch}" != "main" ]]; then + cd $WORKPATH + OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" + NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" + find . -type f -name "Dockerfile*" | while read -r file; do + echo "Processing file: $file" + sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file" + done + fi + + cd $WORKPATH/docker_image_build + git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git + + echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
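+    # Illustrative guard (assumes the CI environment provides HUGGINGFACEHUB_API_TOKEN,
+    # which the exports at the top of this script consume): fail fast before the long
+    # no-cache build instead of failing later during the gated model download.
+    if [[ -z "${HUGGINGFACEHUB_API_TOKEN}" ]]; then
+        echo "HUGGINGFACEHUB_API_TOKEN is not set" >&2
+        exit 1
+    fi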
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper" + docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log + + docker images && sleep 1s +} + +function start_services() { + cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm + sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env + # Start Docker Containers + docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log + sleep 1m +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "EXPECTED_RESULT==> $EXPECTED_RESULT" + echo "CONTENT==> $CONTENT" + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +get_base64_str() { + local file_name=$1 + base64 -w 0 "$file_name" +} + +# Function to generate input data for testing based on the document type +input_data_for_test() { + local document_type=$1 + case $document_type in + ("text") + echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." + ;; + ("audio") + get_base64_str "$WORKPATH/tests/data/test.wav" + ;; + ("video") + get_base64_str "$WORKPATH/tests/data/test.mp4" + ;; + (*) + echo "Invalid document type" >&2 + exit 1 + ;; + esac +} + +function validate_microservices() { + # Check if the microservices are running correctly. 
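+    # Minimal readiness poll (illustrative; assumes the vLLM OpenAI-compatible
+    # server answers GET /v1/models once the model has loaded). It supplements
+    # the fixed one-minute sleep in start_services() when model download is slow.
+    local retries=0
+    until curl -sf -o /dev/null "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/models" || [ "$retries" -ge 30 ]; do
+        retries=$((retries + 1))
+        sleep 10
+    done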
+ + # whisper microservice + ulimit -s 65536 + validate_services \ + "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \ + '{"asr_result":"well"}' \ + "whisper-service" \ + "whisper-service" \ + "{\"audio\": \"$(input_data_for_test "audio")\"}" + + # vLLM service + validate_services \ + "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \ + "generated_text" \ + "docsum-vllm-service" \ + "docsum-vllm-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + + # llm microservice + validate_services \ + "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \ + "text" \ + "docsum-llm-server" \ + "docsum-llm-server" \ + '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + +} + +function validate_megaservice() { + local SERVICE_NAME="docsum-backend-server" + local DOCKER_NAME="docsum-backend-server" + local EXPECTED_RESULT="[DONE]" + local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." + local URL="${host_ip}:8888/v1/docsum" + local DATA_TYPE="type=text" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_megaservice_json() { + # Curl the Mega Service + echo "" + echo ">>> Checking text data with Content-Type: application/json" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + + echo ">>> Checking audio data" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}" + + echo ">>> Checking video data" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}" + +} + +function stop_docker() { + cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/ + docker compose stop && docker compose rm -f +} + +function main() { + echo "===========================================" + echo ">>>> Stopping any running Docker containers..." + stop_docker + + echo "===========================================" + if [[ "$IMAGE_REPO" == "opea" ]]; then + echo ">>>> Building Docker images..." + build_docker_images + fi + + echo "===========================================" + echo ">>>> Starting Docker services..." + start_services + + echo "===========================================" + echo ">>>> Validating microservices..." + validate_microservices + + echo "===========================================" + echo ">>>> Validating megaservice..." + validate_megaservice + echo ">>>> Validating validate_megaservice_json..." + validate_megaservice_json + + echo "===========================================" + echo ">>>> Stopping Docker containers..." + stop_docker + + echo "===========================================" + echo ">>>> Pruning Docker system..." + echo y | docker system prune + echo ">>>> Docker system pruned successfully." 
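+    # Cleanup sketch: the validate_* helpers call "exit 1" on failure, which skips
+    # the stop_docker call above. Registering the handler early in the script would
+    # cover that path as well, e.g.:
+    #   trap stop_docker EXIT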
+ echo "===========================================" +} + +main From 2459ecbc53fdb7c9c449930700cff290de15c152 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 13 Feb 2025 10:07:05 +0700 Subject: [PATCH 04/22] DocSum - fix main Signed-off-by: Chingis Yundunov --- DocSum/Dockerfile-vllm-rocm | 18 -- .../amd/gpu/rocm-vllm/README.md | 175 ------------ .../amd/gpu/rocm-vllm/compose.yaml | 107 -------- .../amd/gpu/rocm-vllm/set_env.sh | 16 -- DocSum/docker_image_build/build.yaml | 9 - DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------ 6 files changed, 574 deletions(-) delete mode 100644 DocSum/Dockerfile-vllm-rocm delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm deleted file mode 100644 index f0e8a8743a..0000000000 --- a/DocSum/Dockerfile-vllm-rocm +++ /dev/null @@ -1,18 +0,0 @@ -FROM rocm/vllm-dev:main - -# Set the working directory -WORKDIR /workspace - -# Copy the api_server.py into the image -ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py - -# Expose the port used by the API server -EXPOSE 8011 - -# Set environment variables -ENV HUGGINGFACE_HUB_CACHE=/workspace -ENV WILM_USE_TRITON_FLASH_ATTENTION=0 -ENV PYTORCH_JIT=0 - -# Set the entrypoint to the api_server.py script -ENTRYPOINT ["python3", "/workspace/api_server.py"] diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md deleted file mode 100644 index 4d41a5cd31..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md +++ /dev/null @@ -1,175 +0,0 @@ -# Build and deploy DocSum Application on AMD GPU (ROCm) - -## Build images - -## 🚀 Build Docker Images - -First of all, you need to build Docker Images locally and install the python package of it. - -### 1. Build LLM Image - -```bash -git clone https://github.com/opea-project/GenAIComps.git -cd GenAIComps -docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . -``` - -Then run the command `docker images`, you will have the following four Docker Images: - -### 2. Build MegaService Docker Image - -To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command: - -```bash -git clone https://github.com/opea-project/GenAIExamples -cd GenAIExamples/DocSum/ -docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . -``` - -### 3. Build UI Docker Image - -Build the frontend Docker image via below command: - -```bash -cd GenAIExamples/DocSum/ui -docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . -``` - -Then run the command `docker images`, you will have the following Docker Images: - -1. `opea/llm-docsum-tgi:latest` -2. `opea/docsum:latest` -3. `opea/docsum-ui:latest` - -### 4. 
Build React UI Docker Image - -Build the frontend Docker image via below command: - -```bash -cd GenAIExamples/DocSum/ui -export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum" -docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react . - -docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . -``` - -Then run the command `docker images`, you will have the following Docker Images: - -1. `opea/llm-docsum-tgi:latest` -2. `opea/docsum:latest` -3. `opea/docsum-ui:latest` -4. `opea/docsum-react-ui:latest` - -## 🚀 Start Microservices and MegaService - -### Required Models - -Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model. -For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable. - -### Setup Environment Variables - -Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below. - -```bash -export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export HOST_IP=${host_ip} -export DOCSUM_TGI_SERVICE_PORT="18882" -export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} -export DOCSUM_LLM_SERVER_PORT="8008" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export DocSum_COMPONENT_NAME="OpeaDocSumTgi" -``` - -Note: Please replace with `host_ip` with your external IP address, do not use localhost. - -Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) - -Example for set isolation for 1 GPU - -``` - - /dev/dri/card0:/dev/dri/card0 - - /dev/dri/renderD128:/dev/dri/renderD128 -``` - -Example for set isolation for 2 GPUs - -``` - - /dev/dri/card0:/dev/dri/card0 - - /dev/dri/renderD128:/dev/dri/renderD128 - - /dev/dri/card1:/dev/dri/card1 - - /dev/dri/renderD129:/dev/dri/renderD129 -``` - -Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) - -### Start Microservice Docker Containers - -```bash -cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm -docker compose up -d -``` - -### Validate Microservices - -1. TGI Service - - ```bash - curl http://${host_ip}:8008/generate \ - -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ - -H 'Content-Type: application/json' - ``` - -2. LLM Microservice - - ```bash - curl http://${host_ip}:9000/v1/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ - -H 'Content-Type: application/json' - ``` - -3. 
MegaService - - ```bash - curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{ - "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false - }' - ``` - -## 🚀 Launch the Svelte UI - -Open this URL `http://{host_ip}:5173` in your browser to access the frontend. - -![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b) - -Here is an example for summarizing a article. - -![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/67ecb2ec-408d-4e81-b124-6ded6b833f55) - -## 🚀 Launch the React UI (Optional) - -To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below: - -```yaml -docsum-rocm-react-ui-server: - image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest} - container_name: docsum-rocm-react-ui-server - depends_on: - - docsum-rocm-backend-server - ports: - - "5174:80" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} -``` - -Open this URL `http://{host_ip}:5175` in your browser to access the frontend. - -![project-screenshot](../../../../assets/img/docsum-ui-react.png) diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml deleted file mode 100644 index 037aa06395..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (C) 2024 Advanced Micro Devices, Inc. 
-# SPDX-License-Identifier: Apache-2.0 - -services: - docsum-vllm-service: - image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} - container_name: docsum-vllm-service - ports: - - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - WILM_USE_TRITON_FLASH_ATTENTION: 0 - PYTORCH_JIT: 0 - volumes: - - "./data:/data" - shm_size: 20G - devices: - - /dev/kfd:/dev/kfd - - /dev/dri/:/dev/dri/ - cap_add: - - SYS_PTRACE - group_add: - - video - security_opt: - - seccomp:unconfined - - apparmor=unconfined - command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\"" - ipc: host - - docsum-llm-server: - image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} - container_name: docsum-llm-server - depends_on: - - docsum-vllm-service - ports: - - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000" - ipc: host - cap_add: - - SYS_PTRACE - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}" - HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID} - LOGFLAG: ${DOCSUM_LOGFLAG:-False} - MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS} - MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS} - restart: unless-stopped - - whisper-service: - image: ${REGISTRY:-opea}/whisper:${TAG:-latest} - container_name: whisper-service - ports: - - "${DOCSUM_WHISPER_PORT:-7066}:7066" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - restart: unless-stopped - - docsum-backend-server: - image: ${REGISTRY:-opea}/docsum:${TAG:-latest} - container_name: docsum-backend-server - depends_on: - - docsum-tgi-service - - docsum-llm-server - ports: - - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888" - environment: - no_proxy: ${no_proxy} - https_proxy: ${https_proxy} - http_proxy: ${http_proxy} - MEGA_SERVICE_HOST_IP: ${HOST_IP} - LLM_SERVICE_HOST_IP: ${HOST_IP} - ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP} - ipc: host - restart: always - - docsum-gradio-ui: - image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} - container_name: docsum-ui-server - depends_on: - - docsum-backend-server - ports: - - "${DOCSUM_FRONTEND_PORT:-5173}:5173" - environment: - no_proxy: ${no_proxy} - https_proxy: ${https_proxy} - http_proxy: ${http_proxy} - BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} - DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} - ipc: host - restart: always - -networks: - default: - driver: bridge diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh deleted file mode 100644 index 43e71e0fbf..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Advanced Micro Devices, Inc. 
-# SPDX-License-Identifier: Apache-2.0 - -export HOST_IP="" -export DOCSUM_MAX_INPUT_TOKENS=2048 -export DOCSUM_MAX_TOTAL_TOKENS=4096 -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export DOCSUM_VLLM_SERVICE_PORT="8008" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN="" -export DOCSUM_LLM_SERVER_PORT="9000" -export DOCSUM_WHISPER_PORT="7066" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml index dc0d546189..095fd28c93 100644 --- a/DocSum/docker_image_build/build.yaml +++ b/DocSum/docker_image_build/build.yaml @@ -47,12 +47,3 @@ services: dockerfile: comps/llms/src/doc-summarization/Dockerfile extends: docsum image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} - vllm_rocm: - build: - args: - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - no_proxy: ${no_proxy} - context: ../ - dockerfile: ./Dockerfile-vllm-rocm - image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh deleted file mode 100644 index d0919a019a..0000000000 --- a/DocSum/tests/test_compose_on_rocm_vllm.sh +++ /dev/null @@ -1,249 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Advanced Micro Devices, Inc. -# SPDX-License-Identifier: Apache-2.0 - -set -xe -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') -export MAX_INPUT_TOKENS=1024 -export MAX_TOTAL_TOKENS=2048 -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export HOST_IP=${ip_address} -export DOCSUM_VLLM_SERVICE_PORT="8008" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -export DOCSUM_LLM_SERVER_PORT="9000" -export DOCSUM_WHISPER_PORT="7066" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export MEGA_SERVICE_HOST_IP=${HOST_IP} -export LLM_SERVICE_HOST_IP=${HOST_IP} -export ASR_SERVICE_HOST_IP=${HOST_IP} -export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" - -function build_docker_images() { - opea_branch=${opea_branch:-"main"} - # If the opea_branch isn't main, replace the git clone branch in Dockerfile. - if [[ "${opea_branch}" != "main" ]]; then - cd $WORKPATH - OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" - NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" - find . -type f -name "Dockerfile*" | while read -r file; do - echo "Processing file: $file" - sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file" - done - fi - - cd $WORKPATH/docker_image_build - git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git - - echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker images && sleep 1s -} - -function start_services() { - cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm - sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env - # Start Docker Containers - docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log - sleep 1m -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - - echo "===========================================" - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "EXPECTED_RESULT==> $EXPECTED_RESULT" - echo "CONTENT==> $CONTENT" - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -get_base64_str() { - local file_name=$1 - base64 -w 0 "$file_name" -} - -# Function to generate input data for testing based on the document type -input_data_for_test() { - local document_type=$1 - case $document_type in - ("text") - echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." - ;; - ("audio") - get_base64_str "$WORKPATH/tests/data/test.wav" - ;; - ("video") - get_base64_str "$WORKPATH/tests/data/test.mp4" - ;; - (*) - echo "Invalid document type" >&2 - exit 1 - ;; - esac -} - -function validate_microservices() { - # Check if the microservices are running correctly. 
- - # whisper microservice - ulimit -s 65536 - validate_services \ - "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \ - '{"asr_result":"well"}' \ - "whisper-service" \ - "whisper-service" \ - "{\"audio\": \"$(input_data_for_test "audio")\"}" - - # vLLM service - validate_services \ - "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \ - "generated_text" \ - "docsum-vllm-service" \ - "docsum-vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' - - # llm microservice - validate_services \ - "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \ - "text" \ - "docsum-llm-server" \ - "docsum-llm-server" \ - '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' - -} - -function validate_megaservice() { - local SERVICE_NAME="docsum-backend-server" - local DOCKER_NAME="docsum-backend-server" - local EXPECTED_RESULT="[DONE]" - local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." - local URL="${host_ip}:8888/v1/docsum" - local DATA_TYPE="type=text" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_megaservice_json() { - # Curl the Mega Service - echo "" - echo ">>> Checking text data with Content-Type: application/json" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' - - echo ">>> Checking audio data" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}" - - echo ">>> Checking video data" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}" - -} - -function stop_docker() { - cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/ - docker compose stop && docker compose rm -f -} - -function main() { - echo "===========================================" - echo ">>>> Stopping any running Docker containers..." - stop_docker - - echo "===========================================" - if [[ "$IMAGE_REPO" == "opea" ]]; then - echo ">>>> Building Docker images..." - build_docker_images - fi - - echo "===========================================" - echo ">>>> Starting Docker services..." - start_services - - echo "===========================================" - echo ">>>> Validating microservices..." - validate_microservices - - echo "===========================================" - echo ">>>> Validating megaservice..." - validate_megaservice - echo ">>>> Validating validate_megaservice_json..." - validate_megaservice_json - - echo "===========================================" - echo ">>>> Stopping Docker containers..." - stop_docker - - echo "===========================================" - echo ">>>> Pruning Docker system..." - echo y | docker system prune - echo ">>>> Docker system pruned successfully." 
- echo "===========================================" -} - -main From 6d5049dd1c6bb3e201c4ca807da6950e0ab4b9d2 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 13 Feb 2025 10:02:03 +0700 Subject: [PATCH 05/22] DocSum - add files for deploy app with ROCm vLLM Signed-off-by: Chingis Yundunov --- DocSum/Dockerfile-vllm-rocm | 18 ++ .../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++ .../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++ .../amd/gpu/rocm-vllm/set_env.sh | 16 ++ DocSum/docker_image_build/build.yaml | 9 + DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++ 6 files changed, 574 insertions(+) create mode 100644 DocSum/Dockerfile-vllm-rocm create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm new file mode 100644 index 0000000000..f0e8a8743a --- /dev/null +++ b/DocSum/Dockerfile-vllm-rocm @@ -0,0 +1,18 @@ +FROM rocm/vllm-dev:main + +# Set the working directory +WORKDIR /workspace + +# Copy the api_server.py into the image +ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py + +# Expose the port used by the API server +EXPOSE 8011 + +# Set environment variables +ENV HUGGINGFACE_HUB_CACHE=/workspace +ENV WILM_USE_TRITON_FLASH_ATTENTION=0 +ENV PYTORCH_JIT=0 + +# Set the entrypoint to the api_server.py script +ENTRYPOINT ["python3", "/workspace/api_server.py"] diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md new file mode 100644 index 0000000000..4d41a5cd31 --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md @@ -0,0 +1,175 @@ +# Build and deploy DocSum Application on AMD GPU (ROCm) + +## Build images + +## 🚀 Build Docker Images + +First of all, you need to build Docker Images locally and install the python package of it. + +### 1. Build LLM Image + +```bash +git clone https://github.com/opea-project/GenAIComps.git +cd GenAIComps +docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . +``` + +Then run the command `docker images`, you will have the following four Docker Images: + +### 2. Build MegaService Docker Image + +To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command: + +```bash +git clone https://github.com/opea-project/GenAIExamples +cd GenAIExamples/DocSum/ +docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . +``` + +### 3. Build UI Docker Image + +Build the frontend Docker image via below command: + +```bash +cd GenAIExamples/DocSum/ui +docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . +``` + +Then run the command `docker images`, you will have the following Docker Images: + +1. `opea/llm-docsum-tgi:latest` +2. `opea/docsum:latest` +3. `opea/docsum-ui:latest` + +### 4. 
Build React UI Docker Image + +Build the frontend Docker image via below command: + +```bash +cd GenAIExamples/DocSum/ui +export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum" +docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react . + +docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . +``` + +Then run the command `docker images`, you will have the following Docker Images: + +1. `opea/llm-docsum-tgi:latest` +2. `opea/docsum:latest` +3. `opea/docsum-ui:latest` +4. `opea/docsum-react-ui:latest` + +## 🚀 Start Microservices and MegaService + +### Required Models + +Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model. +For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable. + +### Setup Environment Variables + +Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below. + +```bash +export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export HOST_IP=${host_ip} +export DOCSUM_TGI_SERVICE_PORT="18882" +export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +export DOCSUM_LLM_SERVER_PORT="8008" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export DocSum_COMPONENT_NAME="OpeaDocSumTgi" +``` + +Note: Please replace with `host_ip` with your external IP address, do not use localhost. + +Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) + +Example for set isolation for 1 GPU + +``` + - /dev/dri/card0:/dev/dri/card0 + - /dev/dri/renderD128:/dev/dri/renderD128 +``` + +Example for set isolation for 2 GPUs + +``` + - /dev/dri/card0:/dev/dri/card0 + - /dev/dri/renderD128:/dev/dri/renderD128 + - /dev/dri/card1:/dev/dri/card1 + - /dev/dri/renderD129:/dev/dri/renderD129 +``` + +Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) + +### Start Microservice Docker Containers + +```bash +cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm +docker compose up -d +``` + +### Validate Microservices + +1. TGI Service + + ```bash + curl http://${host_ip}:8008/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ + -H 'Content-Type: application/json' + ``` + +2. LLM Microservice + + ```bash + curl http://${host_ip}:9000/v1/docsum \ + -X POST \ + -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ + -H 'Content-Type: application/json' + ``` + +3. 
MegaService + + ```bash + curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{ + "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false + }' + ``` + +## 🚀 Launch the Svelte UI + +Open this URL `http://{host_ip}:5173` in your browser to access the frontend. + +![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b) + +Here is an example for summarizing a article. + +![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/67ecb2ec-408d-4e81-b124-6ded6b833f55) + +## 🚀 Launch the React UI (Optional) + +To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below: + +```yaml +docsum-rocm-react-ui-server: + image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest} + container_name: docsum-rocm-react-ui-server + depends_on: + - docsum-rocm-backend-server + ports: + - "5174:80" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} +``` + +Open this URL `http://{host_ip}:5175` in your browser to access the frontend. + +![project-screenshot](../../../../assets/img/docsum-ui-react.png) diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml new file mode 100644 index 0000000000..037aa06395 --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml @@ -0,0 +1,107 @@ +# Copyright (C) 2024 Advanced Micro Devices, Inc. 
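+#
+# Sizing sketch (illustrative): the vLLM command below passes
+# "--tensor-parallel-size 4", which assumes four visible GPUs. On a single-GPU
+# host the flag would be reduced accordingly, e.g.:
+#   command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --dtype float16 --tensor-parallel-size 1 --host 0.0.0.0 --port 8011"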
+# SPDX-License-Identifier: Apache-2.0 + +services: + docsum-vllm-service: + image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} + container_name: docsum-vllm-service + ports: + - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + WILM_USE_TRITON_FLASH_ATTENTION: 0 + PYTORCH_JIT: 0 + volumes: + - "./data:/data" + shm_size: 20G + devices: + - /dev/kfd:/dev/kfd + - /dev/dri/:/dev/dri/ + cap_add: + - SYS_PTRACE + group_add: + - video + security_opt: + - seccomp:unconfined + - apparmor=unconfined + command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\"" + ipc: host + + docsum-llm-server: + image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} + container_name: docsum-llm-server + depends_on: + - docsum-vllm-service + ports: + - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000" + ipc: host + cap_add: + - SYS_PTRACE + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}" + HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID} + LOGFLAG: ${DOCSUM_LOGFLAG:-False} + MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS} + restart: unless-stopped + + whisper-service: + image: ${REGISTRY:-opea}/whisper:${TAG:-latest} + container_name: whisper-service + ports: + - "${DOCSUM_WHISPER_PORT:-7066}:7066" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + + docsum-backend-server: + image: ${REGISTRY:-opea}/docsum:${TAG:-latest} + container_name: docsum-backend-server + depends_on: + - docsum-tgi-service + - docsum-llm-server + ports: + - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888" + environment: + no_proxy: ${no_proxy} + https_proxy: ${https_proxy} + http_proxy: ${http_proxy} + MEGA_SERVICE_HOST_IP: ${HOST_IP} + LLM_SERVICE_HOST_IP: ${HOST_IP} + ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP} + ipc: host + restart: always + + docsum-gradio-ui: + image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} + container_name: docsum-ui-server + depends_on: + - docsum-backend-server + ports: + - "${DOCSUM_FRONTEND_PORT:-5173}:5173" + environment: + no_proxy: ${no_proxy} + https_proxy: ${https_proxy} + http_proxy: ${http_proxy} + BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} + DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} + ipc: host + restart: always + +networks: + default: + driver: bridge diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh new file mode 100644 index 0000000000..43e71e0fbf --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Advanced Micro Devices, Inc. 
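+#
+# Usage sketch: from DocSum/docker_compose/amd/gpu/rocm-vllm, these variables are
+# expected to be loaded into the shell before starting the stack, e.g.:
+#   source set_env.sh
+#   docker compose up -d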
+# SPDX-License-Identifier: Apache-2.0 + +export HOST_IP="" +export DOCSUM_MAX_INPUT_TOKENS=2048 +export DOCSUM_MAX_TOTAL_TOKENS=4096 +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export DOCSUM_VLLM_SERVICE_PORT="8008" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN="" +export DOCSUM_LLM_SERVER_PORT="9000" +export DOCSUM_WHISPER_PORT="7066" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml index 095fd28c93..dc0d546189 100644 --- a/DocSum/docker_image_build/build.yaml +++ b/DocSum/docker_image_build/build.yaml @@ -47,3 +47,12 @@ services: dockerfile: comps/llms/src/doc-summarization/Dockerfile extends: docsum image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} + vllm_rocm: + build: + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + context: ../ + dockerfile: ./Dockerfile-vllm-rocm + image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh new file mode 100644 index 0000000000..d0919a019a --- /dev/null +++ b/DocSum/tests/test_compose_on_rocm_vllm.sh @@ -0,0 +1,249 @@ +#!/bin/bash +# Copyright (C) 2024 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 + +set -xe +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +export MAX_INPUT_TOKENS=1024 +export MAX_TOTAL_TOKENS=2048 +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export HOST_IP=${ip_address} +export DOCSUM_VLLM_SERVICE_PORT="8008" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export DOCSUM_LLM_SERVER_PORT="9000" +export DOCSUM_WHISPER_PORT="7066" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export MEGA_SERVICE_HOST_IP=${HOST_IP} +export LLM_SERVICE_HOST_IP=${HOST_IP} +export ASR_SERVICE_HOST_IP=${HOST_IP} +export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" + +function build_docker_images() { + opea_branch=${opea_branch:-"main"} + # If the opea_branch isn't main, replace the git clone branch in Dockerfile. + if [[ "${opea_branch}" != "main" ]]; then + cd $WORKPATH + OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" + NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" + find . -type f -name "Dockerfile*" | while read -r file; do + echo "Processing file: $file" + sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file" + done + fi + + cd $WORKPATH/docker_image_build + git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git + + echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper" + docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log + + docker images && sleep 1s +} + +function start_services() { + cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm + sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env + # Start Docker Containers + docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log + sleep 1m +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "EXPECTED_RESULT==> $EXPECTED_RESULT" + echo "CONTENT==> $CONTENT" + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +get_base64_str() { + local file_name=$1 + base64 -w 0 "$file_name" +} + +# Function to generate input data for testing based on the document type +input_data_for_test() { + local document_type=$1 + case $document_type in + ("text") + echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." + ;; + ("audio") + get_base64_str "$WORKPATH/tests/data/test.wav" + ;; + ("video") + get_base64_str "$WORKPATH/tests/data/test.mp4" + ;; + (*) + echo "Invalid document type" >&2 + exit 1 + ;; + esac +} + +function validate_microservices() { + # Check if the microservices are running correctly. 
+ + # whisper microservice + ulimit -s 65536 + validate_services \ + "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \ + '{"asr_result":"well"}' \ + "whisper-service" \ + "whisper-service" \ + "{\"audio\": \"$(input_data_for_test "audio")\"}" + + # vLLM service + validate_services \ + "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \ + "generated_text" \ + "docsum-vllm-service" \ + "docsum-vllm-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + + # llm microservice + validate_services \ + "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \ + "text" \ + "docsum-llm-server" \ + "docsum-llm-server" \ + '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + +} + +function validate_megaservice() { + local SERVICE_NAME="docsum-backend-server" + local DOCKER_NAME="docsum-backend-server" + local EXPECTED_RESULT="[DONE]" + local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." + local URL="${host_ip}:8888/v1/docsum" + local DATA_TYPE="type=text" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_megaservice_json() { + # Curl the Mega Service + echo "" + echo ">>> Checking text data with Content-Type: application/json" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + + echo ">>> Checking audio data" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}" + + echo ">>> Checking video data" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}" + +} + +function stop_docker() { + cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/ + docker compose stop && docker compose rm -f +} + +function main() { + echo "===========================================" + echo ">>>> Stopping any running Docker containers..." + stop_docker + + echo "===========================================" + if [[ "$IMAGE_REPO" == "opea" ]]; then + echo ">>>> Building Docker images..." + build_docker_images + fi + + echo "===========================================" + echo ">>>> Starting Docker services..." + start_services + + echo "===========================================" + echo ">>>> Validating microservices..." + validate_microservices + + echo "===========================================" + echo ">>>> Validating megaservice..." + validate_megaservice + echo ">>>> Validating validate_megaservice_json..." + validate_megaservice_json + + echo "===========================================" + echo ">>>> Stopping Docker containers..." + stop_docker + + echo "===========================================" + echo ">>>> Pruning Docker system..." + echo y | docker system prune + echo ">>>> Docker system pruned successfully." 
+ echo "===========================================" +} + +main From 9dfbdc5cffe708b084e7367d6df2910908f5e76a Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 13 Feb 2025 10:07:05 +0700 Subject: [PATCH 06/22] DocSum - fix main Signed-off-by: Chingis Yundunov --- DocSum/Dockerfile-vllm-rocm | 18 -- .../amd/gpu/rocm-vllm/README.md | 175 ------------ .../amd/gpu/rocm-vllm/compose.yaml | 107 -------- .../amd/gpu/rocm-vllm/set_env.sh | 16 -- DocSum/docker_image_build/build.yaml | 9 - DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------ 6 files changed, 574 deletions(-) delete mode 100644 DocSum/Dockerfile-vllm-rocm delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm deleted file mode 100644 index f0e8a8743a..0000000000 --- a/DocSum/Dockerfile-vllm-rocm +++ /dev/null @@ -1,18 +0,0 @@ -FROM rocm/vllm-dev:main - -# Set the working directory -WORKDIR /workspace - -# Copy the api_server.py into the image -ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py - -# Expose the port used by the API server -EXPOSE 8011 - -# Set environment variables -ENV HUGGINGFACE_HUB_CACHE=/workspace -ENV WILM_USE_TRITON_FLASH_ATTENTION=0 -ENV PYTORCH_JIT=0 - -# Set the entrypoint to the api_server.py script -ENTRYPOINT ["python3", "/workspace/api_server.py"] diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md deleted file mode 100644 index 4d41a5cd31..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md +++ /dev/null @@ -1,175 +0,0 @@ -# Build and deploy DocSum Application on AMD GPU (ROCm) - -## Build images - -## 🚀 Build Docker Images - -First of all, you need to build Docker Images locally and install the python package of it. - -### 1. Build LLM Image - -```bash -git clone https://github.com/opea-project/GenAIComps.git -cd GenAIComps -docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . -``` - -Then run the command `docker images`, you will have the following four Docker Images: - -### 2. Build MegaService Docker Image - -To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command: - -```bash -git clone https://github.com/opea-project/GenAIExamples -cd GenAIExamples/DocSum/ -docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . -``` - -### 3. Build UI Docker Image - -Build the frontend Docker image via below command: - -```bash -cd GenAIExamples/DocSum/ui -docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . -``` - -Then run the command `docker images`, you will have the following Docker Images: - -1. `opea/llm-docsum-tgi:latest` -2. `opea/docsum:latest` -3. `opea/docsum-ui:latest` - -### 4. 
Build React UI Docker Image - -Build the frontend Docker image via below command: - -```bash -cd GenAIExamples/DocSum/ui -export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum" -docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react . - -docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . -``` - -Then run the command `docker images`, you will have the following Docker Images: - -1. `opea/llm-docsum-tgi:latest` -2. `opea/docsum:latest` -3. `opea/docsum-ui:latest` -4. `opea/docsum-react-ui:latest` - -## 🚀 Start Microservices and MegaService - -### Required Models - -Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model. -For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable. - -### Setup Environment Variables - -Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below. - -```bash -export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export HOST_IP=${host_ip} -export DOCSUM_TGI_SERVICE_PORT="18882" -export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} -export DOCSUM_LLM_SERVER_PORT="8008" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export DocSum_COMPONENT_NAME="OpeaDocSumTgi" -``` - -Note: Please replace with `host_ip` with your external IP address, do not use localhost. - -Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) - -Example for set isolation for 1 GPU - -``` - - /dev/dri/card0:/dev/dri/card0 - - /dev/dri/renderD128:/dev/dri/renderD128 -``` - -Example for set isolation for 2 GPUs - -``` - - /dev/dri/card0:/dev/dri/card0 - - /dev/dri/renderD128:/dev/dri/renderD128 - - /dev/dri/card1:/dev/dri/card1 - - /dev/dri/renderD129:/dev/dri/renderD129 -``` - -Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) - -### Start Microservice Docker Containers - -```bash -cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm -docker compose up -d -``` - -### Validate Microservices - -1. TGI Service - - ```bash - curl http://${host_ip}:8008/generate \ - -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ - -H 'Content-Type: application/json' - ``` - -2. LLM Microservice - - ```bash - curl http://${host_ip}:9000/v1/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ - -H 'Content-Type: application/json' - ``` - -3. 
MegaService - - ```bash - curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{ - "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false - }' - ``` - -## 🚀 Launch the Svelte UI - -Open this URL `http://{host_ip}:5173` in your browser to access the frontend. - -![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b) - -Here is an example for summarizing a article. - -![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/67ecb2ec-408d-4e81-b124-6ded6b833f55) - -## 🚀 Launch the React UI (Optional) - -To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below: - -```yaml -docsum-rocm-react-ui-server: - image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest} - container_name: docsum-rocm-react-ui-server - depends_on: - - docsum-rocm-backend-server - ports: - - "5174:80" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} -``` - -Open this URL `http://{host_ip}:5175` in your browser to access the frontend. - -![project-screenshot](../../../../assets/img/docsum-ui-react.png) diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml deleted file mode 100644 index 037aa06395..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (C) 2024 Advanced Micro Devices, Inc. 
-# SPDX-License-Identifier: Apache-2.0 - -services: - docsum-vllm-service: - image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} - container_name: docsum-vllm-service - ports: - - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - WILM_USE_TRITON_FLASH_ATTENTION: 0 - PYTORCH_JIT: 0 - volumes: - - "./data:/data" - shm_size: 20G - devices: - - /dev/kfd:/dev/kfd - - /dev/dri/:/dev/dri/ - cap_add: - - SYS_PTRACE - group_add: - - video - security_opt: - - seccomp:unconfined - - apparmor=unconfined - command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\"" - ipc: host - - docsum-llm-server: - image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} - container_name: docsum-llm-server - depends_on: - - docsum-vllm-service - ports: - - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000" - ipc: host - cap_add: - - SYS_PTRACE - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}" - HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID} - LOGFLAG: ${DOCSUM_LOGFLAG:-False} - MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS} - MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS} - restart: unless-stopped - - whisper-service: - image: ${REGISTRY:-opea}/whisper:${TAG:-latest} - container_name: whisper-service - ports: - - "${DOCSUM_WHISPER_PORT:-7066}:7066" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - restart: unless-stopped - - docsum-backend-server: - image: ${REGISTRY:-opea}/docsum:${TAG:-latest} - container_name: docsum-backend-server - depends_on: - - docsum-tgi-service - - docsum-llm-server - ports: - - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888" - environment: - no_proxy: ${no_proxy} - https_proxy: ${https_proxy} - http_proxy: ${http_proxy} - MEGA_SERVICE_HOST_IP: ${HOST_IP} - LLM_SERVICE_HOST_IP: ${HOST_IP} - ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP} - ipc: host - restart: always - - docsum-gradio-ui: - image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} - container_name: docsum-ui-server - depends_on: - - docsum-backend-server - ports: - - "${DOCSUM_FRONTEND_PORT:-5173}:5173" - environment: - no_proxy: ${no_proxy} - https_proxy: ${https_proxy} - http_proxy: ${http_proxy} - BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} - DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} - ipc: host - restart: always - -networks: - default: - driver: bridge diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh deleted file mode 100644 index 43e71e0fbf..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Advanced Micro Devices, Inc. 
-# SPDX-License-Identifier: Apache-2.0 - -export HOST_IP="" -export DOCSUM_MAX_INPUT_TOKENS=2048 -export DOCSUM_MAX_TOTAL_TOKENS=4096 -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export DOCSUM_VLLM_SERVICE_PORT="8008" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN="" -export DOCSUM_LLM_SERVER_PORT="9000" -export DOCSUM_WHISPER_PORT="7066" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml index dc0d546189..095fd28c93 100644 --- a/DocSum/docker_image_build/build.yaml +++ b/DocSum/docker_image_build/build.yaml @@ -47,12 +47,3 @@ services: dockerfile: comps/llms/src/doc-summarization/Dockerfile extends: docsum image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} - vllm_rocm: - build: - args: - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - no_proxy: ${no_proxy} - context: ../ - dockerfile: ./Dockerfile-vllm-rocm - image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh deleted file mode 100644 index d0919a019a..0000000000 --- a/DocSum/tests/test_compose_on_rocm_vllm.sh +++ /dev/null @@ -1,249 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Advanced Micro Devices, Inc. -# SPDX-License-Identifier: Apache-2.0 - -set -xe -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') -export MAX_INPUT_TOKENS=1024 -export MAX_TOTAL_TOKENS=2048 -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export HOST_IP=${ip_address} -export DOCSUM_VLLM_SERVICE_PORT="8008" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -export DOCSUM_LLM_SERVER_PORT="9000" -export DOCSUM_WHISPER_PORT="7066" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export MEGA_SERVICE_HOST_IP=${HOST_IP} -export LLM_SERVICE_HOST_IP=${HOST_IP} -export ASR_SERVICE_HOST_IP=${HOST_IP} -export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" - -function build_docker_images() { - opea_branch=${opea_branch:-"main"} - # If the opea_branch isn't main, replace the git clone branch in Dockerfile. - if [[ "${opea_branch}" != "main" ]]; then - cd $WORKPATH - OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" - NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" - find . -type f -name "Dockerfile*" | while read -r file; do - echo "Processing file: $file" - sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file" - done - fi - - cd $WORKPATH/docker_image_build - git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git - - echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker images && sleep 1s -} - -function start_services() { - cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm - sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env - # Start Docker Containers - docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log - sleep 1m -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - - echo "===========================================" - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "EXPECTED_RESULT==> $EXPECTED_RESULT" - echo "CONTENT==> $CONTENT" - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -get_base64_str() { - local file_name=$1 - base64 -w 0 "$file_name" -} - -# Function to generate input data for testing based on the document type -input_data_for_test() { - local document_type=$1 - case $document_type in - ("text") - echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." - ;; - ("audio") - get_base64_str "$WORKPATH/tests/data/test.wav" - ;; - ("video") - get_base64_str "$WORKPATH/tests/data/test.mp4" - ;; - (*) - echo "Invalid document type" >&2 - exit 1 - ;; - esac -} - -function validate_microservices() { - # Check if the microservices are running correctly. 
- - # whisper microservice - ulimit -s 65536 - validate_services \ - "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \ - '{"asr_result":"well"}' \ - "whisper-service" \ - "whisper-service" \ - "{\"audio\": \"$(input_data_for_test "audio")\"}" - - # vLLM service - validate_services \ - "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \ - "generated_text" \ - "docsum-vllm-service" \ - "docsum-vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' - - # llm microservice - validate_services \ - "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \ - "text" \ - "docsum-llm-server" \ - "docsum-llm-server" \ - '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' - -} - -function validate_megaservice() { - local SERVICE_NAME="docsum-backend-server" - local DOCKER_NAME="docsum-backend-server" - local EXPECTED_RESULT="[DONE]" - local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." - local URL="${host_ip}:8888/v1/docsum" - local DATA_TYPE="type=text" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_megaservice_json() { - # Curl the Mega Service - echo "" - echo ">>> Checking text data with Content-Type: application/json" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' - - echo ">>> Checking audio data" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}" - - echo ">>> Checking video data" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}" - -} - -function stop_docker() { - cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/ - docker compose stop && docker compose rm -f -} - -function main() { - echo "===========================================" - echo ">>>> Stopping any running Docker containers..." - stop_docker - - echo "===========================================" - if [[ "$IMAGE_REPO" == "opea" ]]; then - echo ">>>> Building Docker images..." - build_docker_images - fi - - echo "===========================================" - echo ">>>> Starting Docker services..." - start_services - - echo "===========================================" - echo ">>>> Validating microservices..." - validate_microservices - - echo "===========================================" - echo ">>>> Validating megaservice..." - validate_megaservice - echo ">>>> Validating validate_megaservice_json..." - validate_megaservice_json - - echo "===========================================" - echo ">>>> Stopping Docker containers..." - stop_docker - - echo "===========================================" - echo ">>>> Pruning Docker system..." - echo y | docker system prune - echo ">>>> Docker system pruned successfully." 
- echo "===========================================" -} - -main From a8857ae326b2d71ca66bc6f86715ac9ab467ac85 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 13 Feb 2025 10:02:03 +0700 Subject: [PATCH 07/22] DocSum - add files for deploy app with ROCm vLLM Signed-off-by: Chingis Yundunov --- DocSum/Dockerfile-vllm-rocm | 18 ++ .../amd/gpu/rocm-vllm/README.md | 175 ++++++++++++ .../amd/gpu/rocm-vllm/compose.yaml | 107 ++++++++ .../amd/gpu/rocm-vllm/set_env.sh | 16 ++ DocSum/docker_image_build/build.yaml | 9 + DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ++++++++++++++++++ 6 files changed, 574 insertions(+) create mode 100644 DocSum/Dockerfile-vllm-rocm create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml create mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh create mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm new file mode 100644 index 0000000000..f0e8a8743a --- /dev/null +++ b/DocSum/Dockerfile-vllm-rocm @@ -0,0 +1,18 @@ +FROM rocm/vllm-dev:main + +# Set the working directory +WORKDIR /workspace + +# Copy the api_server.py into the image +ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py + +# Expose the port used by the API server +EXPOSE 8011 + +# Set environment variables +ENV HUGGINGFACE_HUB_CACHE=/workspace +ENV WILM_USE_TRITON_FLASH_ATTENTION=0 +ENV PYTORCH_JIT=0 + +# Set the entrypoint to the api_server.py script +ENTRYPOINT ["python3", "/workspace/api_server.py"] diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md new file mode 100644 index 0000000000..4d41a5cd31 --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md @@ -0,0 +1,175 @@ +# Build and deploy DocSum Application on AMD GPU (ROCm) + +## Build images + +## 🚀 Build Docker Images + +First of all, you need to build Docker Images locally and install the python package of it. + +### 1. Build LLM Image + +```bash +git clone https://github.com/opea-project/GenAIComps.git +cd GenAIComps +docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . +``` + +Then run the command `docker images`, you will have the following four Docker Images: + +### 2. Build MegaService Docker Image + +To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command: + +```bash +git clone https://github.com/opea-project/GenAIExamples +cd GenAIExamples/DocSum/ +docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . +``` + +### 3. Build UI Docker Image + +Build the frontend Docker image via below command: + +```bash +cd GenAIExamples/DocSum/ui +docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . +``` + +Then run the command `docker images`, you will have the following Docker Images: + +1. `opea/llm-docsum-tgi:latest` +2. `opea/docsum:latest` +3. `opea/docsum-ui:latest` + +### 4. 
Build React UI Docker Image + +Build the frontend Docker image via below command: + +```bash +cd GenAIExamples/DocSum/ui +export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum" +docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react . + +docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . +``` + +Then run the command `docker images`, you will have the following Docker Images: + +1. `opea/llm-docsum-tgi:latest` +2. `opea/docsum:latest` +3. `opea/docsum-ui:latest` +4. `opea/docsum-react-ui:latest` + +## 🚀 Start Microservices and MegaService + +### Required Models + +Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model. +For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable. + +### Setup Environment Variables + +Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below. + +```bash +export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export HOST_IP=${host_ip} +export DOCSUM_TGI_SERVICE_PORT="18882" +export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +export DOCSUM_LLM_SERVER_PORT="8008" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export DocSum_COMPONENT_NAME="OpeaDocSumTgi" +``` + +Note: Please replace with `host_ip` with your external IP address, do not use localhost. + +Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) + +Example for set isolation for 1 GPU + +``` + - /dev/dri/card0:/dev/dri/card0 + - /dev/dri/renderD128:/dev/dri/renderD128 +``` + +Example for set isolation for 2 GPUs + +``` + - /dev/dri/card0:/dev/dri/card0 + - /dev/dri/renderD128:/dev/dri/renderD128 + - /dev/dri/card1:/dev/dri/card1 + - /dev/dri/renderD129:/dev/dri/renderD129 +``` + +Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) + +### Start Microservice Docker Containers + +```bash +cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm +docker compose up -d +``` + +### Validate Microservices + +1. TGI Service + + ```bash + curl http://${host_ip}:8008/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ + -H 'Content-Type: application/json' + ``` + +2. LLM Microservice + + ```bash + curl http://${host_ip}:9000/v1/docsum \ + -X POST \ + -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ + -H 'Content-Type: application/json' + ``` + +3. 
MegaService + + ```bash + curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{ + "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false + }' + ``` + +## 🚀 Launch the Svelte UI + +Open this URL `http://{host_ip}:5173` in your browser to access the frontend. + +![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b) + +Here is an example for summarizing a article. + +![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/67ecb2ec-408d-4e81-b124-6ded6b833f55) + +## 🚀 Launch the React UI (Optional) + +To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below: + +```yaml +docsum-rocm-react-ui-server: + image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest} + container_name: docsum-rocm-react-ui-server + depends_on: + - docsum-rocm-backend-server + ports: + - "5174:80" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} +``` + +Open this URL `http://{host_ip}:5175` in your browser to access the frontend. + +![project-screenshot](../../../../assets/img/docsum-ui-react.png) diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml new file mode 100644 index 0000000000..037aa06395 --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml @@ -0,0 +1,107 @@ +# Copyright (C) 2024 Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +services: + docsum-vllm-service: + image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} + container_name: docsum-vllm-service + ports: + - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + WILM_USE_TRITON_FLASH_ATTENTION: 0 + PYTORCH_JIT: 0 + volumes: + - "./data:/data" + shm_size: 20G + devices: + - /dev/kfd:/dev/kfd + - /dev/dri/:/dev/dri/ + cap_add: + - SYS_PTRACE + group_add: + - video + security_opt: + - seccomp:unconfined + - apparmor=unconfined + command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\"" + ipc: host + + docsum-llm-server: + image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} + container_name: docsum-llm-server + depends_on: + - docsum-vllm-service + ports: + - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000" + ipc: host + cap_add: + - SYS_PTRACE + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}" + HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} + LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID} + LOGFLAG: ${DOCSUM_LOGFLAG:-False} + MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS} + MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS} + restart: unless-stopped + + whisper-service: + image: ${REGISTRY:-opea}/whisper:${TAG:-latest} + container_name: whisper-service + ports: + - "${DOCSUM_WHISPER_PORT:-7066}:7066" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + restart: unless-stopped + + docsum-backend-server: + image: ${REGISTRY:-opea}/docsum:${TAG:-latest} + container_name: docsum-backend-server + depends_on: + - docsum-tgi-service + - docsum-llm-server + ports: + - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888" + environment: + no_proxy: ${no_proxy} + https_proxy: ${https_proxy} + http_proxy: ${http_proxy} + MEGA_SERVICE_HOST_IP: ${HOST_IP} + LLM_SERVICE_HOST_IP: ${HOST_IP} + ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP} + ipc: host + restart: always + + docsum-gradio-ui: + image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} + container_name: docsum-ui-server + depends_on: + - docsum-backend-server + ports: + - "${DOCSUM_FRONTEND_PORT:-5173}:5173" + environment: + no_proxy: ${no_proxy} + https_proxy: ${https_proxy} + http_proxy: ${http_proxy} + BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} + DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} + ipc: host + restart: always + +networks: + default: + driver: bridge diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh new file mode 100644 index 0000000000..43e71e0fbf --- /dev/null +++ b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +export HOST_IP="" +export DOCSUM_MAX_INPUT_TOKENS=2048 +export DOCSUM_MAX_TOTAL_TOKENS=4096 +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export DOCSUM_VLLM_SERVICE_PORT="8008" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN="" +export DOCSUM_LLM_SERVER_PORT="9000" +export DOCSUM_WHISPER_PORT="7066" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml index 095fd28c93..dc0d546189 100644 --- a/DocSum/docker_image_build/build.yaml +++ b/DocSum/docker_image_build/build.yaml @@ -47,3 +47,12 @@ services: dockerfile: comps/llms/src/doc-summarization/Dockerfile extends: docsum image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} + vllm_rocm: + build: + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + context: ../ + dockerfile: ./Dockerfile-vllm-rocm + image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh new file mode 100644 index 0000000000..d0919a019a --- /dev/null +++ b/DocSum/tests/test_compose_on_rocm_vllm.sh @@ -0,0 +1,249 @@ +#!/bin/bash +# Copyright (C) 2024 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 + +set -xe +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +export MAX_INPUT_TOKENS=1024 +export MAX_TOTAL_TOKENS=2048 +export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" +export HOST_IP=${ip_address} +export DOCSUM_VLLM_SERVICE_PORT="8008" +export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export DOCSUM_LLM_SERVER_PORT="9000" +export DOCSUM_WHISPER_PORT="7066" +export DOCSUM_BACKEND_SERVER_PORT="8888" +export DOCSUM_FRONTEND_PORT="5173" +export MEGA_SERVICE_HOST_IP=${HOST_IP} +export LLM_SERVICE_HOST_IP=${HOST_IP} +export ASR_SERVICE_HOST_IP=${HOST_IP} +export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" + +function build_docker_images() { + opea_branch=${opea_branch:-"main"} + # If the opea_branch isn't main, replace the git clone branch in Dockerfile. + if [[ "${opea_branch}" != "main" ]]; then + cd $WORKPATH + OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" + NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" + find . -type f -name "Dockerfile*" | while read -r file; do + echo "Processing file: $file" + sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file" + done + fi + + cd $WORKPATH/docker_image_build + git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git + + echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
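+    # The names below must match build targets in docker_image_build/build.yaml:
+    # "vllm_rocm" builds the ROCm vLLM serving image from Dockerfile-vllm-rocm
+    # (tagged opea/llm-vllm-rocm by default); the remaining entries are assumed to be
+    # the existing DocSum targets (llm-docsum wrapper, docsum megaservice, Gradio UI, whisper).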
+ service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper" + docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log + + docker images && sleep 1s +} + +function start_services() { + cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm + sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env + # Start Docker Containers + docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log + sleep 1m +} + +function validate_services() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + + echo "===========================================" + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "EXPECTED_RESULT==> $EXPECTED_RESULT" + echo "CONTENT==> $CONTENT" + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +get_base64_str() { + local file_name=$1 + base64 -w 0 "$file_name" +} + +# Function to generate input data for testing based on the document type +input_data_for_test() { + local document_type=$1 + case $document_type in + ("text") + echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." + ;; + ("audio") + get_base64_str "$WORKPATH/tests/data/test.wav" + ;; + ("video") + get_base64_str "$WORKPATH/tests/data/test.mp4" + ;; + (*) + echo "Invalid document type" >&2 + exit 1 + ;; + esac +} + +function validate_microservices() { + # Check if the microservices are running correctly. 
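+    # Three endpoints are exercised below: the Whisper ASR service (base64 WAV in,
+    # transcript out), the vLLM OpenAI-compatible /v1/chat/completions endpoint, and the
+    # llm-docsum wrapper at /v1/docsum. The ulimit bump is presumably needed so the large
+    # base64-encoded audio payload fits within the kernel's command-line argument limit,
+    # which scales with the stack size.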
+ + # whisper microservice + ulimit -s 65536 + validate_services \ + "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \ + '{"asr_result":"well"}' \ + "whisper-service" \ + "whisper-service" \ + "{\"audio\": \"$(input_data_for_test "audio")\"}" + + # vLLM service + validate_services \ + "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \ + "generated_text" \ + "docsum-vllm-service" \ + "docsum-vllm-service" \ + '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' + + # llm microservice + validate_services \ + "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \ + "text" \ + "docsum-llm-server" \ + "docsum-llm-server" \ + '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + +} + +function validate_megaservice() { + local SERVICE_NAME="docsum-backend-server" + local DOCKER_NAME="docsum-backend-server" + local EXPECTED_RESULT="[DONE]" + local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." + local URL="${host_ip}:8888/v1/docsum" + local DATA_TYPE="type=text" + + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") + + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s +} + +function validate_megaservice_json() { + # Curl the Mega Service + echo "" + echo ">>> Checking text data with Content-Type: application/json" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + + echo ">>> Checking audio data" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}" + + echo ">>> Checking video data" + validate_services \ + "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ + "[DONE]" \ + "docsum-backend-server" \ + "docsum-backend-server" \ + "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}" + +} + +function stop_docker() { + cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/ + docker compose stop && docker compose rm -f +} + +function main() { + echo "===========================================" + echo ">>>> Stopping any running Docker containers..." + stop_docker + + echo "===========================================" + if [[ "$IMAGE_REPO" == "opea" ]]; then + echo ">>>> Building Docker images..." + build_docker_images + fi + + echo "===========================================" + echo ">>>> Starting Docker services..." + start_services + + echo "===========================================" + echo ">>>> Validating microservices..." + validate_microservices + + echo "===========================================" + echo ">>>> Validating megaservice..." + validate_megaservice + echo ">>>> Validating validate_megaservice_json..." + validate_megaservice_json + + echo "===========================================" + echo ">>>> Stopping Docker containers..." + stop_docker + + echo "===========================================" + echo ">>>> Pruning Docker system..." + echo y | docker system prune + echo ">>>> Docker system pruned successfully." 
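+    # Note: because this script runs with "set -e" and the validators call "exit 1" on
+    # failure, this cleanup only runs when every check above passed; after a failed run,
+    # leftover containers may need to be removed manually (docker compose stop &&
+    # docker compose rm -f in the rocm-vllm compose directory).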
+ echo "===========================================" +} + +main From 5a38b266ac77a2bf0766cefab14ec62f28633a8d Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 13 Feb 2025 10:07:05 +0700 Subject: [PATCH 08/22] DocSum - fix main Signed-off-by: Chingis Yundunov --- DocSum/Dockerfile-vllm-rocm | 18 -- .../amd/gpu/rocm-vllm/README.md | 175 ------------ .../amd/gpu/rocm-vllm/compose.yaml | 107 -------- .../amd/gpu/rocm-vllm/set_env.sh | 16 -- DocSum/docker_image_build/build.yaml | 9 - DocSum/tests/test_compose_on_rocm_vllm.sh | 249 ------------------ 6 files changed, 574 deletions(-) delete mode 100644 DocSum/Dockerfile-vllm-rocm delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/README.md delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml delete mode 100644 DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh delete mode 100644 DocSum/tests/test_compose_on_rocm_vllm.sh diff --git a/DocSum/Dockerfile-vllm-rocm b/DocSum/Dockerfile-vllm-rocm deleted file mode 100644 index f0e8a8743a..0000000000 --- a/DocSum/Dockerfile-vllm-rocm +++ /dev/null @@ -1,18 +0,0 @@ -FROM rocm/vllm-dev:main - -# Set the working directory -WORKDIR /workspace - -# Copy the api_server.py into the image -ADD https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.7.0/vllm/entrypoints/openai/api_server.py /workspace/api_server.py - -# Expose the port used by the API server -EXPOSE 8011 - -# Set environment variables -ENV HUGGINGFACE_HUB_CACHE=/workspace -ENV WILM_USE_TRITON_FLASH_ATTENTION=0 -ENV PYTORCH_JIT=0 - -# Set the entrypoint to the api_server.py script -ENTRYPOINT ["python3", "/workspace/api_server.py"] diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md b/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md deleted file mode 100644 index 4d41a5cd31..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/README.md +++ /dev/null @@ -1,175 +0,0 @@ -# Build and deploy DocSum Application on AMD GPU (ROCm) - -## Build images - -## 🚀 Build Docker Images - -First of all, you need to build Docker Images locally and install the python package of it. - -### 1. Build LLM Image - -```bash -git clone https://github.com/opea-project/GenAIComps.git -cd GenAIComps -docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . -``` - -Then run the command `docker images`, you will have the following four Docker Images: - -### 2. Build MegaService Docker Image - -To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command: - -```bash -git clone https://github.com/opea-project/GenAIExamples -cd GenAIExamples/DocSum/ -docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . -``` - -### 3. Build UI Docker Image - -Build the frontend Docker image via below command: - -```bash -cd GenAIExamples/DocSum/ui -docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile . -``` - -Then run the command `docker images`, you will have the following Docker Images: - -1. `opea/llm-docsum-tgi:latest` -2. `opea/docsum:latest` -3. `opea/docsum-ui:latest` - -### 4. 
Build React UI Docker Image - -Build the frontend Docker image via below command: - -```bash -cd GenAIExamples/DocSum/ui -export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum" -docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react . - -docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . -``` - -Then run the command `docker images`, you will have the following Docker Images: - -1. `opea/llm-docsum-tgi:latest` -2. `opea/docsum:latest` -3. `opea/docsum-ui:latest` -4. `opea/docsum-react-ui:latest` - -## 🚀 Start Microservices and MegaService - -### Required Models - -Default model is "Intel/neural-chat-7b-v3-3". Change "LLM_MODEL_ID" in environment variables below if you want to use another model. -For gated models, you also need to provide [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) in "HUGGINGFACEHUB_API_TOKEN" environment variable. - -### Setup Environment Variables - -Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below. - -```bash -export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm" -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export HOST_IP=${host_ip} -export DOCSUM_TGI_SERVICE_PORT="18882" -export DOCSUM_TGI_LLM_ENDPOINT="http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} -export DOCSUM_LLM_SERVER_PORT="8008" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export DocSum_COMPONENT_NAME="OpeaDocSumTgi" -``` - -Note: Please replace with `host_ip` with your external IP address, do not use localhost. - -Note: In order to limit access to a subset of GPUs, please pass each device individually using one or more -device /dev/dri/rendered, where is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) - -Example for set isolation for 1 GPU - -``` - - /dev/dri/card0:/dev/dri/card0 - - /dev/dri/renderD128:/dev/dri/renderD128 -``` - -Example for set isolation for 2 GPUs - -``` - - /dev/dri/card0:/dev/dri/card0 - - /dev/dri/renderD128:/dev/dri/renderD128 - - /dev/dri/card1:/dev/dri/card1 - - /dev/dri/renderD129:/dev/dri/renderD129 -``` - -Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus) - -### Start Microservice Docker Containers - -```bash -cd GenAIExamples/DocSum/docker_compose/amd/gpu/rocm -docker compose up -d -``` - -### Validate Microservices - -1. TGI Service - - ```bash - curl http://${host_ip}:8008/generate \ - -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ - -H 'Content-Type: application/json' - ``` - -2. LLM Microservice - - ```bash - curl http://${host_ip}:9000/v1/docsum \ - -X POST \ - -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \ - -H 'Content-Type: application/json' - ``` - -3. 
MegaService - - ```bash - curl http://${host_ip}:8888/v1/docsum -H "Content-Type: application/json" -d '{ - "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32, "language":"en", "stream":false - }' - ``` - -## 🚀 Launch the Svelte UI - -Open this URL `http://{host_ip}:5173` in your browser to access the frontend. - -![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b) - -Here is an example for summarizing a article. - -![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/67ecb2ec-408d-4e81-b124-6ded6b833f55) - -## 🚀 Launch the React UI (Optional) - -To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-rocm-ui-server` service with the `docsum-rocm-react-ui-server` service as per the config below: - -```yaml -docsum-rocm-react-ui-server: - image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest} - container_name: docsum-rocm-react-ui-server - depends_on: - - docsum-rocm-backend-server - ports: - - "5174:80" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} -``` - -Open this URL `http://{host_ip}:5175` in your browser to access the frontend. - -![project-screenshot](../../../../assets/img/docsum-ui-react.png) diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml deleted file mode 100644 index 037aa06395..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/compose.yaml +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (C) 2024 Advanced Micro Devices, Inc. 
-# SPDX-License-Identifier: Apache-2.0 - -services: - docsum-vllm-service: - image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} - container_name: docsum-vllm-service - ports: - - "${DOCSUM_VLLM_SERVICE_PORT:-8081}:8011" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - WILM_USE_TRITON_FLASH_ATTENTION: 0 - PYTORCH_JIT: 0 - volumes: - - "./data:/data" - shm_size: 20G - devices: - - /dev/kfd:/dev/kfd - - /dev/dri/:/dev/dri/ - cap_add: - - SYS_PTRACE - group_add: - - video - security_opt: - - seccomp:unconfined - - apparmor=unconfined - command: "--model ${DOCSUM_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\"" - ipc: host - - docsum-llm-server: - image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} - container_name: docsum-llm-server - depends_on: - - docsum-vllm-service - ports: - - "${DOCSUM_LLM_SERVER_PORT:-9000}:9000" - ipc: host - cap_add: - - SYS_PTRACE - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}" - HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - HF_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN} - LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID} - LOGFLAG: ${DOCSUM_LOGFLAG:-False} - MAX_INPUT_TOKENS: ${DOCSUM_MAX_INPUT_TOKENS} - MAX_TOTAL_TOKENS: ${DOCSUM_MAX_TOTAL_TOKENS} - restart: unless-stopped - - whisper-service: - image: ${REGISTRY:-opea}/whisper:${TAG:-latest} - container_name: whisper-service - ports: - - "${DOCSUM_WHISPER_PORT:-7066}:7066" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - restart: unless-stopped - - docsum-backend-server: - image: ${REGISTRY:-opea}/docsum:${TAG:-latest} - container_name: docsum-backend-server - depends_on: - - docsum-tgi-service - - docsum-llm-server - ports: - - "${DOCSUM_BACKEND_SERVER_PORT:-8888}:8888" - environment: - no_proxy: ${no_proxy} - https_proxy: ${https_proxy} - http_proxy: ${http_proxy} - MEGA_SERVICE_HOST_IP: ${HOST_IP} - LLM_SERVICE_HOST_IP: ${HOST_IP} - ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP} - ipc: host - restart: always - - docsum-gradio-ui: - image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest} - container_name: docsum-ui-server - depends_on: - - docsum-backend-server - ports: - - "${DOCSUM_FRONTEND_PORT:-5173}:5173" - environment: - no_proxy: ${no_proxy} - https_proxy: ${https_proxy} - http_proxy: ${http_proxy} - BACKEND_SERVICE_ENDPOINT: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} - DOC_BASE_URL: ${DOCSUM_BACKEND_SERVICE_ENDPOINT} - ipc: host - restart: always - -networks: - default: - driver: bridge diff --git a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh deleted file mode 100644 index 43e71e0fbf..0000000000 --- a/DocSum/docker_compose/amd/gpu/rocm-vllm/set_env.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (C) 2024 Advanced Micro Devices, Inc. 
-# SPDX-License-Identifier: Apache-2.0 - -export HOST_IP="" -export DOCSUM_MAX_INPUT_TOKENS=2048 -export DOCSUM_MAX_TOTAL_TOKENS=4096 -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export DOCSUM_VLLM_SERVICE_PORT="8008" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN="" -export DOCSUM_LLM_SERVER_PORT="9000" -export DOCSUM_WHISPER_PORT="7066" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export DOCSUM_BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml index dc0d546189..095fd28c93 100644 --- a/DocSum/docker_image_build/build.yaml +++ b/DocSum/docker_image_build/build.yaml @@ -47,12 +47,3 @@ services: dockerfile: comps/llms/src/doc-summarization/Dockerfile extends: docsum image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest} - vllm_rocm: - build: - args: - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - no_proxy: ${no_proxy} - context: ../ - dockerfile: ./Dockerfile-vllm-rocm - image: ${REGISTRY:-opea}/llm-vllm-rocm:${TAG:-latest} diff --git a/DocSum/tests/test_compose_on_rocm_vllm.sh b/DocSum/tests/test_compose_on_rocm_vllm.sh deleted file mode 100644 index d0919a019a..0000000000 --- a/DocSum/tests/test_compose_on_rocm_vllm.sh +++ /dev/null @@ -1,249 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Advanced Micro Devices, Inc. -# SPDX-License-Identifier: Apache-2.0 - -set -xe -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') -export MAX_INPUT_TOKENS=1024 -export MAX_TOTAL_TOKENS=2048 -export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" -export HOST_IP=${ip_address} -export DOCSUM_VLLM_SERVICE_PORT="8008" -export DOCSUM_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -export DOCSUM_LLM_SERVER_PORT="9000" -export DOCSUM_WHISPER_PORT="7066" -export DOCSUM_BACKEND_SERVER_PORT="8888" -export DOCSUM_FRONTEND_PORT="5173" -export MEGA_SERVICE_HOST_IP=${HOST_IP} -export LLM_SERVICE_HOST_IP=${HOST_IP} -export ASR_SERVICE_HOST_IP=${HOST_IP} -export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" - -function build_docker_images() { - opea_branch=${opea_branch:-"main"} - # If the opea_branch isn't main, replace the git clone branch in Dockerfile. - if [[ "${opea_branch}" != "main" ]]; then - cd $WORKPATH - OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" - NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" - find . -type f -name "Dockerfile*" | while read -r file; do - echo "Processing file: $file" - sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file" - done - fi - - cd $WORKPATH/docker_image_build - git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git - - echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="vllm_rocm llm-docsum docsum docsum-gradio-ui whisper" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker images && sleep 1s -} - -function start_services() { - cd "$WORKPATH"/docker_compose/amd/gpu/rocm-vllm - sed -i "s/backend_address/$ip_address/g" "$WORKPATH"/ui/svelte/.env - # Start Docker Containers - docker compose up -d > "${LOG_PATH}"/start_services_with_compose.log - sleep 1m -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - - echo "===========================================" - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "EXPECTED_RESULT==> $EXPECTED_RESULT" - echo "CONTENT==> $CONTENT" - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -get_base64_str() { - local file_name=$1 - base64 -w 0 "$file_name" -} - -# Function to generate input data for testing based on the document type -input_data_for_test() { - local document_type=$1 - case $document_type in - ("text") - echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are." - ;; - ("audio") - get_base64_str "$WORKPATH/tests/data/test.wav" - ;; - ("video") - get_base64_str "$WORKPATH/tests/data/test.mp4" - ;; - (*) - echo "Invalid document type" >&2 - exit 1 - ;; - esac -} - -function validate_microservices() { - # Check if the microservices are running correctly. 
- - # whisper microservice - ulimit -s 65536 - validate_services \ - "${HOST_IP}:${DOCSUM_WHISPER_PORT}/v1/asr" \ - '{"asr_result":"well"}' \ - "whisper-service" \ - "whisper-service" \ - "{\"audio\": \"$(input_data_for_test "audio")\"}" - - # vLLM service - validate_services \ - "${HOST_IP}:${DOCSUM_VLLM_SERVICE_PORT}/v1/chat/completions" \ - "generated_text" \ - "docsum-vllm-service" \ - "docsum-vllm-service" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' - - # llm microservice - validate_services \ - "${HOST_IP}:${DOCSUM_LLM_SERVER_PORT}/v1/docsum" \ - "text" \ - "docsum-llm-server" \ - "docsum-llm-server" \ - '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' - -} - -function validate_megaservice() { - local SERVICE_NAME="docsum-backend-server" - local DOCKER_NAME="docsum-backend-server" - local EXPECTED_RESULT="[DONE]" - local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." - local URL="${host_ip}:8888/v1/docsum" - local DATA_TYPE="type=text" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") - - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_megaservice_json() { - # Curl the Mega Service - echo "" - echo ">>> Checking text data with Content-Type: application/json" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. 
TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' - - echo ">>> Checking audio data" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}" - - echo ">>> Checking video data" - validate_services \ - "${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum" \ - "[DONE]" \ - "docsum-backend-server" \ - "docsum-backend-server" \ - "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}" - -} - -function stop_docker() { - cd $WORKPATH/docker_compose/amd/gpu/rocm-vllm/ - docker compose stop && docker compose rm -f -} - -function main() { - echo "===========================================" - echo ">>>> Stopping any running Docker containers..." - stop_docker - - echo "===========================================" - if [[ "$IMAGE_REPO" == "opea" ]]; then - echo ">>>> Building Docker images..." - build_docker_images - fi - - echo "===========================================" - echo ">>>> Starting Docker services..." - start_services - - echo "===========================================" - echo ">>>> Validating microservices..." - validate_microservices - - echo "===========================================" - echo ">>>> Validating megaservice..." - validate_megaservice - echo ">>>> Validating validate_megaservice_json..." - validate_megaservice_json - - echo "===========================================" - echo ">>>> Stopping Docker containers..." - stop_docker - - echo "===========================================" - echo ">>>> Pruning Docker system..." - echo y | docker system prune - echo ">>>> Docker system pruned successfully." 
- echo "===========================================" -} - -main From 198d50e6e9523a6344d1c83bf542aca9b804cd42 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Fri, 11 Apr 2025 11:43:00 +0700 Subject: [PATCH 09/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/README.md | 14 +++++ AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 56 +++++++++++++++++++ AgentQnA/kubernetes/helm/rocm-values.yaml | 52 +++++++++++++++++ 3 files changed, 122 insertions(+) create mode 100644 AgentQnA/kubernetes/helm/rocm-tgi-values.yaml create mode 100644 AgentQnA/kubernetes/helm/rocm-values.yaml diff --git a/AgentQnA/kubernetes/helm/README.md b/AgentQnA/kubernetes/helm/README.md index 8d0cbc61e4..f5b126fdd3 100644 --- a/AgentQnA/kubernetes/helm/README.md +++ b/AgentQnA/kubernetes/helm/README.md @@ -9,3 +9,17 @@ export HFTOKEN="insert-your-huggingface-token-here" helm install agentqna oci://ghcr.io/opea-project/charts/agentqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml ``` + +## Deploy on ROCm with vLLM + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm upgrade --install agentqna oci://ghcr.io/opea-project/charts/agentqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f rocm-values.yaml +``` + +## Deploy on ROCm with TGI + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm upgrade --install agentqna oci://ghcr.io/opea-project/charts/agentqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f rocm-tgi-values.yaml +``` diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml new file mode 100644 index 0000000000..4da9284d5e --- /dev/null +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -0,0 +1,56 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Accelerate inferencing in heaviest components to improve performance +# by overriding their subchart values +vllm: + enabled: false +tgi: + enabled: true + accelDevice: "rocm" + image: + repository: ghcr.io/huggingface/text-generation-inference + tag: "3.0.0-rocm" + LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" + USE_FLASH_ATTENTION: "false" + FLASH_ATTENTION_RECOMPUTE: "false" + HIP_VISIBLE_DEVICES: "0,1" + MAX_BATCH_SIZE: "4" + extraCmdArgs: [ "--num-shard","2" ] + resources: + limits: + amd.com/gpu: "2" + requests: + cpu: 1 + memory: 16Gi + securityContext: + readOnlyRootFilesystem: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + add: + - SYS_PTRACE + readinessProbe: + initialDelaySeconds: 60 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 + startupProbe: + initialDelaySeconds: 60 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 +supervisor: + llm_endpoint_url: http://{{ .Release.Name }}-tgi + llm_engine: tgi + model: "meta-llama/Llama-3.3-70B-Instruct" +ragagent: + llm_endpoint_url: http://{{ .Release.Name }}-tgi + llm_engine: tgi + model: "meta-llama/Llama-3.3-70B-Instruct" +sqlagent: + llm_endpoint_url: http://{{ .Release.Name }}-tgi + llm_engine: tgi + model: "meta-llama/Llama-3.3-70B-Instruct" diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml new file mode 100644 index 0000000000..840852d73c --- /dev/null +++ b/AgentQnA/kubernetes/helm/rocm-values.yaml @@ -0,0 +1,52 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Accelerate 
inferencing in heaviest components to improve performance +# by overriding their subchart values + +tgi: + enabled: false +vllm: + enabled: true + accelDevice: "rocm" + image: + repository: opea/vllm-rocm + tag: latest + LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct" + env: + HIP_VISIBLE_DEVICES: "0,1" + TENSOR_PARALLEL_SIZE: "2" + HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_HUB_ENABLE_HF_TRANSFER: "0" + VLLM_USE_TRITON_FLASH_ATTN: "0" + VLLM_WORKER_MULTIPROC_METHOD: "spawn" + PYTORCH_JIT: "0" + HF_HOME: "/data" + extraCmd: + command: [ "python3", "/workspace/api_server.py" ] + extraCmdArgs: [ "--swap-space", "16", + "--disable-log-requests", + "--dtype", "float16", + "--num-scheduler-steps", "1", + "--distributed-executor-backend", "mp" ] + resources: + limits: + amd.com/gpu: "2" + startupProbe: + failureThreshold: 180 + securityContext: + readOnlyRootFilesystem: false + runAsNonRoot: false + runAsUser: 0 +supervisor: + llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Llama-3.3-70B-Instruct" +ragagent: + llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Llama-3.3-70B-Instruct" +sqlagent: + llm_endpoint_url: http://{{ .Release.Name }}-vllm + llm_engine: vllm + model: "meta-llama/Llama-3.3-70B-Instruct" From 9bc4a3728ee339955b6e1068a29d7b3ca3a3c2fc Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Tue, 22 Apr 2025 23:19:10 +0700 Subject: [PATCH 10/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 4 ++-- AgentQnA/kubernetes/helm/rocm-values.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 4da9284d5e..4295a68f27 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -1,5 +1,4 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2025 Advanced Micro Devices, Inc. # Accelerate inferencing in heaviest components to improve performance # by overriding their subchart values @@ -54,3 +53,4 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi model: "meta-llama/Llama-3.3-70B-Instruct" + diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml index 840852d73c..15ef46ccd1 100644 --- a/AgentQnA/kubernetes/helm/rocm-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-values.yaml @@ -1,5 +1,4 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2025 Advanced Micro Devices, Inc. 
# Accelerate inferencing in heaviest components to improve performance # by overriding their subchart values @@ -50,3 +49,4 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm model: "meta-llama/Llama-3.3-70B-Instruct" + From 015faa6b69b85653201d48775e5cf97a005d48a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Apr 2025 16:19:46 +0000 Subject: [PATCH 11/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 1 - AgentQnA/kubernetes/helm/rocm-values.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 4295a68f27..352fe746f9 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -53,4 +53,3 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi model: "meta-llama/Llama-3.3-70B-Instruct" - diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml index 15ef46ccd1..7702596c0b 100644 --- a/AgentQnA/kubernetes/helm/rocm-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-values.yaml @@ -49,4 +49,3 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm model: "meta-llama/Llama-3.3-70B-Instruct" - From 62c1c5f68b5b5500c3d310e3da83428dc03c736f Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 24 Apr 2025 20:33:55 +0700 Subject: [PATCH 12/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 1 + AgentQnA/kubernetes/helm/rocm-values.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 352fe746f9..4295a68f27 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -53,3 +53,4 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi model: "meta-llama/Llama-3.3-70B-Instruct" + diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml index 7702596c0b..15ef46ccd1 100644 --- a/AgentQnA/kubernetes/helm/rocm-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-values.yaml @@ -49,3 +49,4 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm model: "meta-llama/Llama-3.3-70B-Instruct" + From 834cf043ba702493b1ef9fb687a325d9bfbbb66c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Apr 2025 13:34:37 +0000 Subject: [PATCH 13/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 1 - AgentQnA/kubernetes/helm/rocm-values.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 4295a68f27..352fe746f9 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -53,4 +53,3 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi model: "meta-llama/Llama-3.3-70B-Instruct" - diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml 
index 15ef46ccd1..7702596c0b 100644 --- a/AgentQnA/kubernetes/helm/rocm-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-values.yaml @@ -49,4 +49,3 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm model: "meta-llama/Llama-3.3-70B-Instruct" - From 2dba72a25d72a31d6693b8f82ae5e1dfa63ac427 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 24 Apr 2025 20:45:40 +0700 Subject: [PATCH 14/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 9 +++++---- AgentQnA/kubernetes/helm/rocm-values.yaml | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 352fe746f9..cab995d72d 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -10,7 +10,7 @@ tgi: image: repository: ghcr.io/huggingface/text-generation-inference tag: "3.0.0-rocm" - LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct" + LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" MAX_INPUT_LENGTH: "1024" MAX_TOTAL_TOKENS: "2048" USE_FLASH_ATTENTION: "false" @@ -44,12 +44,13 @@ tgi: supervisor: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi - model: "meta-llama/Llama-3.3-70B-Instruct" + model: "Intel/neural-chat-7b-v3-3" ragagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi - model: "meta-llama/Llama-3.3-70B-Instruct" + model: "Intel/neural-chat-7b-v3-3" sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi - model: "meta-llama/Llama-3.3-70B-Instruct" + model: "Intel/neural-chat-7b-v3-3" + diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml index 7702596c0b..038a93fbb0 100644 --- a/AgentQnA/kubernetes/helm/rocm-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-values.yaml @@ -11,7 +11,7 @@ vllm: image: repository: opea/vllm-rocm tag: latest - LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct" + LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" env: HIP_VISIBLE_DEVICES: "0,1" TENSOR_PARALLEL_SIZE: "2" @@ -40,12 +40,13 @@ vllm: supervisor: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm - model: "meta-llama/Llama-3.3-70B-Instruct" + model: "Intel/neural-chat-7b-v3-3" ragagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm - model: "meta-llama/Llama-3.3-70B-Instruct" + model: "Intel/neural-chat-7b-v3-3" sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm - model: "meta-llama/Llama-3.3-70B-Instruct" + model: "Intel/neural-chat-7b-v3-3" + From e68e8692dfc49172c93c6de3624e1c5bc35d9bdc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Apr 2025 13:46:40 +0000 Subject: [PATCH 15/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 1 - AgentQnA/kubernetes/helm/rocm-values.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index cab995d72d..5203554871 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -53,4 +53,3 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi model: "Intel/neural-chat-7b-v3-3" - diff --git 
a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml index 038a93fbb0..34ade2e7fa 100644 --- a/AgentQnA/kubernetes/helm/rocm-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-values.yaml @@ -49,4 +49,3 @@ sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm model: "Intel/neural-chat-7b-v3-3" - From 42e2b0ca37efd631316f3d0ef79313c04fd7c73e Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 24 Apr 2025 21:01:02 +0700 Subject: [PATCH 16/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index cab995d72d..85299bf0dc 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -13,7 +13,7 @@ tgi: LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" MAX_INPUT_LENGTH: "1024" MAX_TOTAL_TOKENS: "2048" - USE_FLASH_ATTENTION: "false" + USE_FLASH_ATTENTION: "true" FLASH_ATTENTION_RECOMPUTE: "false" HIP_VISIBLE_DEVICES: "0,1" MAX_BATCH_SIZE: "4" From 9e420054cdc82d4b21fc076bc5a598d6f07bf760 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 24 Apr 2025 21:16:13 +0700 Subject: [PATCH 17/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 814c104381..969ab0f1d6 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -15,12 +15,12 @@ tgi: MAX_TOTAL_TOKENS: "2048" USE_FLASH_ATTENTION: "true" FLASH_ATTENTION_RECOMPUTE: "false" - HIP_VISIBLE_DEVICES: "0,1" - MAX_BATCH_SIZE: "4" - extraCmdArgs: [ "--num-shard","2" ] + HIP_VISIBLE_DEVICES: "0" + MAX_BATCH_SIZE: "1" + extraCmdArgs: [ "--num-shard","1" ] resources: limits: - amd.com/gpu: "2" + amd.com/gpu: "1" requests: cpu: 1 memory: 16Gi From de2fbb583237535d9bab8414173d2cfbec7ad6bf Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 24 Apr 2025 21:28:34 +0700 Subject: [PATCH 18/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 969ab0f1d6..0933227f12 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -9,14 +9,14 @@ tgi: accelDevice: "rocm" image: repository: ghcr.io/huggingface/text-generation-inference - tag: "3.0.0-rocm" + tag: "2.4.1-rocm" LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" MAX_INPUT_LENGTH: "1024" MAX_TOTAL_TOKENS: "2048" USE_FLASH_ATTENTION: "true" FLASH_ATTENTION_RECOMPUTE: "false" HIP_VISIBLE_DEVICES: "0" - MAX_BATCH_SIZE: "1" + MAX_BATCH_SIZE: "4" extraCmdArgs: [ "--num-shard","1" ] resources: limits: From 5479cce8810a03c195365ad30bb95fd8dab30268 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Thu, 24 Apr 2025 21:54:37 +0700 Subject: [PATCH 19/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov 
--- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 0933227f12..bf761f6512 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -9,12 +9,12 @@ tgi: accelDevice: "rocm" image: repository: ghcr.io/huggingface/text-generation-inference - tag: "2.4.1-rocm" + tag: "3.0.0-rocm" LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" - MAX_INPUT_LENGTH: "1024" - MAX_TOTAL_TOKENS: "2048" + MAX_INPUT_LENGTH: "2048" + MAX_TOTAL_TOKENS: "4096" USE_FLASH_ATTENTION: "true" - FLASH_ATTENTION_RECOMPUTE: "false" + FLASH_ATTENTION_RECOMPUTE: "true" HIP_VISIBLE_DEVICES: "0" MAX_BATCH_SIZE: "4" extraCmdArgs: [ "--num-shard","1" ] From f55d7f97a270e84977117220cac746e530bcb385 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Fri, 25 Apr 2025 00:52:59 +0700 Subject: [PATCH 20/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 8 ++++---- AgentQnA/kubernetes/helm/rocm-values.yaml | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index bf761f6512..8148ec90f1 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -10,7 +10,7 @@ tgi: image: repository: ghcr.io/huggingface/text-generation-inference tag: "3.0.0-rocm" - LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" + LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" MAX_INPUT_LENGTH: "2048" MAX_TOTAL_TOKENS: "4096" USE_FLASH_ATTENTION: "true" @@ -44,12 +44,12 @@ tgi: supervisor: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi - model: "Intel/neural-chat-7b-v3-3" + model: "meta-llama/Meta-Llama-3-8B-Instruct" ragagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi - model: "Intel/neural-chat-7b-v3-3" + model: "meta-llama/Meta-Llama-3-8B-Instruct" sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-tgi llm_engine: tgi - model: "Intel/neural-chat-7b-v3-3" + model: "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml index 34ade2e7fa..5de0cb0a3c 100644 --- a/AgentQnA/kubernetes/helm/rocm-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-values.yaml @@ -11,10 +11,10 @@ vllm: image: repository: opea/vllm-rocm tag: latest - LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" + LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" env: - HIP_VISIBLE_DEVICES: "0,1" - TENSOR_PARALLEL_SIZE: "2" + HIP_VISIBLE_DEVICES: "0" + TENSOR_PARALLEL_SIZE: "1" HF_HUB_DISABLE_PROGRESS_BARS: "1" HF_HUB_ENABLE_HF_TRANSFER: "0" VLLM_USE_TRITON_FLASH_ATTN: "0" @@ -30,7 +30,7 @@ vllm: "--distributed-executor-backend", "mp" ] resources: limits: - amd.com/gpu: "2" + amd.com/gpu: "1" startupProbe: failureThreshold: 180 securityContext: @@ -40,12 +40,12 @@ vllm: supervisor: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm - model: "Intel/neural-chat-7b-v3-3" + model: "meta-llama/Meta-Llama-3-8B-Instruct" ragagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm - model: "Intel/neural-chat-7b-v3-3" + model: "meta-llama/Meta-Llama-3-8B-Instruct" sqlagent: llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_engine: vllm - model: 
"Intel/neural-chat-7b-v3-3" + model: "meta-llama/Meta-Llama-3-8B-Instruct" From 3765a6258faaa8dd1a4922a14941ae240fcae976 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Fri, 25 Apr 2025 01:02:17 +0700 Subject: [PATCH 21/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index 8148ec90f1..d7ed9771a4 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -9,12 +9,12 @@ tgi: accelDevice: "rocm" image: repository: ghcr.io/huggingface/text-generation-inference - tag: "3.0.0-rocm" + tag: "2.4.1-rocm" LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - MAX_INPUT_LENGTH: "2048" - MAX_TOTAL_TOKENS: "4096" - USE_FLASH_ATTENTION: "true" - FLASH_ATTENTION_RECOMPUTE: "true" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" + USE_FLASH_ATTENTION: "false" + FLASH_ATTENTION_RECOMPUTE: "false" HIP_VISIBLE_DEVICES: "0" MAX_BATCH_SIZE: "4" extraCmdArgs: [ "--num-shard","1" ] From 5e55c8109a82af62f2cd90c4378a95e427183643 Mon Sep 17 00:00:00 2001 From: Chingis Yundunov Date: Tue, 27 May 2025 12:18:38 +0700 Subject: [PATCH 22/22] AgentQnA - Adding files to deploy an application in the K8S environment using Helm Signed-off-by: Chingis Yundunov --- AgentQnA/kubernetes/helm/rocm-tgi-values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml index d7ed9771a4..997d18c402 100644 --- a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml +++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -11,8 +11,8 @@ tgi: repository: ghcr.io/huggingface/text-generation-inference tag: "2.4.1-rocm" LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - MAX_INPUT_LENGTH: "1024" - MAX_TOTAL_TOKENS: "2048" + MAX_INPUT_LENGTH: "2048" + MAX_TOTAL_TOKENS: "4096" USE_FLASH_ATTENTION: "false" FLASH_ATTENTION_RECOMPUTE: "false" HIP_VISIBLE_DEVICES: "0"