diff --git a/AgentQnA/kubernetes/helm/README.md b/AgentQnA/kubernetes/helm/README.md
index 8d0cbc61e4..f5b126fdd3 100644
--- a/AgentQnA/kubernetes/helm/README.md
+++ b/AgentQnA/kubernetes/helm/README.md
@@ -9,3 +9,26 @@
 export HFTOKEN="insert-your-huggingface-token-here"
 helm install agentqna oci://ghcr.io/opea-project/charts/agentqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
 ```
+
+## Deploy on ROCm with vLLM
+
+```
+export HFTOKEN="insert-your-huggingface-token-here"
+helm upgrade --install agentqna oci://ghcr.io/opea-project/charts/agentqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f rocm-values.yaml
+```
+
+## Deploy on ROCm with TGI
+
+```
+export HFTOKEN="insert-your-huggingface-token-here"
+helm upgrade --install agentqna oci://ghcr.io/opea-project/charts/agentqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f rocm-tgi-values.yaml
+```
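+
+## Verify the deployment
+
+As a minimal sanity check after either install (assuming the default release name `agentqna` used above), confirm that all pods reach the `Running` state and that the release reports `deployed`:
+
+```
+kubectl get pods
+helm status agentqna
+```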
diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml
new file mode 100644
index 0000000000..997d18c402
--- /dev/null
+++ b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml
@@ -0,0 +1,55 @@
+# Copyright (C) 2025 Advanced Micro Devices, Inc.
+
+# Accelerate inference in the heaviest components to improve performance
+# by overriding their subchart values
+vllm:
+  enabled: false
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "2.4.1-rocm"
+  LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  USE_FLASH_ATTENTION: "false"
+  FLASH_ATTENTION_RECOMPUTE: "false"
+  HIP_VISIBLE_DEVICES: "0"
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard", "1" ]
+  resources:
+    limits:
+      amd.com/gpu: "1"
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Meta-Llama-3-8B-Instruct"
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Meta-Llama-3-8B-Instruct"
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Meta-Llama-3-8B-Instruct"
diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml
new file mode 100644
index 0000000000..5de0cb0a3c
--- /dev/null
+++ b/AgentQnA/kubernetes/helm/rocm-values.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2025 Advanced Micro Devices, Inc.
+
+# Accelerate inference in the heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
+  env:
+    HIP_VISIBLE_DEVICES: "0"
+    TENSOR_PARALLEL_SIZE: "1"
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "1"
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Meta-Llama-3-8B-Instruct"
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Meta-Llama-3-8B-Instruct"
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Meta-Llama-3-8B-Instruct"