
Commit 031cf6e

ChatQnA: Update kubernetes xeon chatqna remote inference and svelte UI (#1215)

Signed-off-by: sgurunat <gurunath.s@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

1 parent 3299e5c · commit 031cf6e

File tree: 2 files changed, +21 −9 lines

ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-remote-inference.yaml

Lines changed: 14 additions & 8 deletions

@@ -70,9 +70,8 @@ data:
   no_proxy: ""
   LOGFLAG: ""
   vLLM_ENDPOINT: "insert-your-remote-inference-endpoint"
-  LLM_MODEL: "meta-llama/Meta-Llama-3.1-8B-Instruct"
-  LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-8B-Instruct"
-  MODEL_ID: "meta-llama/Meta-Llama-3.1-8B-Instruct"
+  LLM_MODEL: "meta-llama/Meta-Llama-3.1-70B-Instruct"
+  MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
   CLIENTID: ""
   CLIENT_SECRET: ""
   TOKEN_URL: ""
@@ -216,6 +215,10 @@ data:
       proxy_set_header X-Real-IP $remote_addr;
       proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
       proxy_set_header X-Forwarded-Proto $scheme;
+      proxy_buffering off;
+      proxy_cache off;
+      proxy_request_buffering off;
+      gzip off;
     }

     location /v1/dataprep {
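The four nginx directives added here disable response buffering, request buffering, caching, and compression, so the gateway forwards the backend's server-sent token stream as it is produced instead of holding the whole answer; with buffering on, the UI would see one large chunk at the end. A minimal TypeScript sketch of a client reading such a stream, assuming the gateway URL and the JSON payload shape shown elsewhere in this commit (the function name and logging are illustrative, not from the repo):

// Hypothetical reader for the token stream that the buffering changes enable.
async function readTokenStream(gatewayUrl: string, question: string): Promise<void> {
  const res = await fetch(gatewayUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "meta-llama/Meta-Llama-3.1-70B-Instruct", // value from the ConfigMap above
      messages: question,
    }),
  });
  if (!res.body) throw new Error("response has no body to stream");

  const reader = res.body.getReader();
  const decoder = new TextDecoder();
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    // With proxy_buffering off, each chunk arrives as soon as the model emits it.
    console.log(decoder.decode(value, { stream: true }));
  }
}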
@@ -552,6 +555,9 @@ spec:
         {}
       containers:
         - name: chatqna-ui
+          env:
+            - name: MODEL_ID
+              value: "meta-llama/Meta-Llama-3.1-70B-Instruct"
           securityContext:
             {}
           image: "opea/chatqna-ui:latest"
@@ -691,7 +697,7 @@ spec:
             seccompProfile:
               type: RuntimeDefault
           image: "opea/embedding-tei:latest"
-          imagePullPolicy: IfNotPresent
+          imagePullPolicy: Always
           ports:
             - name: embedding-usvc
               containerPort: 6000
@@ -769,7 +775,7 @@ spec:
             seccompProfile:
               type: RuntimeDefault
           image: "opea/llm-vllm:latest"
-          imagePullPolicy: IfNotPresent
+          imagePullPolicy: Always
           ports:
             - name: llm-uservice
               containerPort: 9000
@@ -919,7 +925,7 @@ spec:
             seccompProfile:
               type: RuntimeDefault
           image: "opea/reranking-tei:latest"
-          imagePullPolicy: IfNotPresent
+          imagePullPolicy: Always
           ports:
             - name: reranking-usvc
               containerPort: 8000
@@ -1257,7 +1263,7 @@ spec:
             - name: EMBEDDING_SERVICE_HOST_IP
               value: chatqna-embedding-usvc
             - name: MODEL_ID
-              value: "meta-llama/Meta-Llama-3.1-8B-Instruct"
+              value: "meta-llama/Meta-Llama-3.1-70B-Instruct"
           securityContext:
             allowPrivilegeEscalation: false
             capabilities:
@@ -1269,7 +1275,7 @@ spec:
             seccompProfile:
               type: RuntimeDefault
           image: "opea/chatqna-wrapper:latest"
-          imagePullPolicy: IfNotPresent
+          imagePullPolicy: Always
           volumeMounts:
             - mountPath: /tmp
               name: tmp
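Taken together, the ConfigMap and container changes plumb the model name from the manifest into the UI at runtime, so switching models requires no image rebuild. A sketch of the reading side, assuming unprefixed variables are exposed through SvelteKit's $env/dynamic/public as the existing env.CHAT_BASE_URL usage in this repo suggests (currentModel is a hypothetical helper, not from the repo):

import { env } from "$env/dynamic/public";

// Hypothetical helper: resolve the model advertised by the deployment,
// falling back to the UI's previous hard-coded default when MODEL_ID
// is unset or empty, mirroring the commit's fallback logic.
export function currentModel(): string {
  return env.MODEL_ID || "Intel/neural-chat-7b-v3-3";
}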

ChatQnA/ui/svelte/src/lib/network/chat/Network.ts

Lines changed: 7 additions & 1 deletion

@@ -16,13 +16,19 @@ import { env } from "$env/dynamic/public";
 import { SSE } from "sse.js";

 const CHAT_BASE_URL = env.CHAT_BASE_URL;
+const MODEL_ID = env.MODEL_ID;

 export async function fetchTextStream(query: string) {
   let payload = {};
   let url = "";
+  let modelId = "Intel/neural-chat-7b-v3-3";
+
+  if (MODEL_ID) {
+    modelId = MODEL_ID;
+  }

   payload = {
-    model: "Intel/neural-chat-7b-v3-3",
+    model: `${modelId}`,
     messages: query,
   };
   url = `${CHAT_BASE_URL}`;
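One small observation on the new code: since modelId is already a string, the template literal `${modelId}` is equivalent to passing modelId directly; the wrapping is harmless but unnecessary. A quick standalone check of the fallback behavior the hunk introduces (pick is an illustrative name, not from the repo):

// Reproduces the fallback above: use the env value when set,
// otherwise keep the previous hard-coded default model.
function pick(envValue: string | undefined): string {
  let modelId = "Intel/neural-chat-7b-v3-3";
  if (envValue) {
    modelId = envValue;
  }
  return modelId;
}

console.log(pick(undefined)); // "Intel/neural-chat-7b-v3-3"
console.log(pick("meta-llama/Meta-Llama-3.1-70B-Instruct")); // the deployed model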
