Skip to content

Commit 671dff7

Browse files
authored
[ChatQnA] Enable Prometheus and Grafana with telemetry docker compose file. (#1623)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
1 parent 8fe1929 commit 671dff7

File tree

17 files changed

+488
-13
lines changed

17 files changed

+488
-13
lines changed

ChatQnA/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,11 @@ To set up environment variables for deploying ChatQnA services, follow these ste
7070
# on Gaudi
7171
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
7272
source ./set_env.sh
73-
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
73+
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,jaeger,prometheus,grafana,gaudi-node-exporter-1
7474
# on Xeon
7575
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
7676
source ./set_env.sh
77-
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
77+
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,xeon-node-exporter-1
7878
# on Nvidia GPU
7979
cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu
8080
source ./set_env.sh

ChatQnA/docker_compose/intel/cpu/xeon/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,10 @@ docker compose up -d
5959
To enable OpenTelemetry tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
6060
CPU example with Open Telemetry feature:
6161

62+
> NOTE: To get the supported Grafana dashboards, run download_opea_dashboard.sh as shown in the commands below.
63+
6264
```bash
63-
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
65+
./grafana/dashboards/download_opea_dashboard.sh
6466
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
6567
```
6668

ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,19 @@
44
services:
55
tei-embedding-service:
66
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
7+
environment:
8+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
79
tei-reranking-service:
810
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
11+
environment:
12+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
13+
# vllm-service:
14+
# command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --otlp-traces-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
15+
chatqna-xeon-backend-server:
16+
environment:
17+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
918
jaeger:
10-
image: jaegertracing/all-in-one:latest
19+
image: jaegertracing/all-in-one:1.67.0
1120
container_name: jaeger
1221
ports:
1322
- "16686:16686"
@@ -21,7 +30,51 @@ services:
2130
https_proxy: ${https_proxy}
2231
COLLECTOR_ZIPKIN_HOST_PORT: 9411
2332
restart: unless-stopped
24-
chatqna-xeon-backend-server:
33+
prometheus:
34+
image: prom/prometheus:v2.52.0
35+
container_name: prometheus
36+
user: root
37+
volumes:
38+
- ./prometheus.yaml:/etc/prometheus/prometheus.yaml
39+
- ./prometheus_data:/prometheus
40+
command:
41+
- '--config.file=/etc/prometheus/prometheus.yaml'
42+
ports:
43+
- '9090:9090'
44+
ipc: host
45+
restart: unless-stopped
46+
grafana:
47+
image: grafana/grafana:11.0.0
48+
container_name: grafana
49+
volumes:
50+
- ./grafana_data:/var/lib/grafana
51+
- ./grafana/dashboards:/var/lib/grafana/dashboards
52+
- ./grafana/provisioning:/etc/grafana/provisioning
53+
user: root
2554
environment:
26-
- ENABLE_OPEA_TELEMETRY=true
27-
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
55+
GF_SECURITY_ADMIN_PASSWORD: admin
56+
GF_RENDERING_CALLBACK_URL: http://grafana:3000/
57+
GF_LOG_FILTERS: rendering:debug
58+
depends_on:
59+
- prometheus
60+
ports:
61+
- '3000:3000'
62+
ipc: host
63+
restart: unless-stopped
64+
node-exporter:
65+
image: prom/node-exporter
66+
container_name: node-exporter
67+
volumes:
68+
- /proc:/host/proc:ro
69+
- /sys:/host/sys:ro
70+
- /:/rootfs:ro
71+
command:
72+
- '--path.procfs=/host/proc'
73+
- '--path.sysfs=/host/sys'
74+
- --collector.filesystem.ignored-mount-points
75+
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
76+
ports:
77+
- 9100:9100
78+
restart: always
79+
deploy:
80+
mode: global

ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,21 @@
44
services:
55
tei-embedding-service:
66
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
7+
environment:
8+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
79
tei-reranking-service:
810
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
11+
environment:
12+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
913
tgi-service:
1014
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
15+
environment:
16+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
17+
chatqna-xeon-backend-server:
18+
environment:
19+
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
1120
jaeger:
12-
image: jaegertracing/all-in-one:latest
21+
image: jaegertracing/all-in-one:1.67.0
1322
container_name: jaeger
1423
ports:
1524
- "16686:16686"
@@ -23,7 +32,51 @@ services:
2332
https_proxy: ${https_proxy}
2433
COLLECTOR_ZIPKIN_HOST_PORT: 9411
2534
restart: unless-stopped
26-
chatqna-xeon-backend-server:
35+
prometheus:
36+
image: prom/prometheus:v2.52.0
37+
container_name: prometheus
38+
user: root
39+
volumes:
40+
- ./prometheus.yaml:/etc/prometheus/prometheus.yaml
41+
- ./prometheus_data:/prometheus
42+
command:
43+
- '--config.file=/etc/prometheus/prometheus.yaml'
44+
ports:
45+
- '9090:9090'
46+
ipc: host
47+
restart: unless-stopped
48+
grafana:
49+
image: grafana/grafana:11.0.0
50+
container_name: grafana
51+
volumes:
52+
- ./grafana_data:/var/lib/grafana
53+
- ./grafana/dashboards:/var/lib/grafana/dashboards
54+
- ./grafana/provisioning:/etc/grafana/provisioning
55+
user: root
2756
environment:
28-
- ENABLE_OPEA_TELEMETRY=true
29-
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
57+
GF_SECURITY_ADMIN_PASSWORD: admin
58+
GF_RENDERING_CALLBACK_URL: http://grafana:3000/
59+
GF_LOG_FILTERS: rendering:debug
60+
depends_on:
61+
- prometheus
62+
ports:
63+
- '3000:3000'
64+
ipc: host
65+
restart: unless-stopped
66+
node-exporter:
67+
image: prom/node-exporter
68+
container_name: node-exporter
69+
volumes:
70+
- /proc:/host/proc:ro
71+
- /sys:/host/sys:ro
72+
- /:/rootfs:ro
73+
command:
74+
- '--path.procfs=/host/proc'
75+
- '--path.sysfs=/host/sys'
76+
- --collector.filesystem.ignored-mount-points
77+
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
78+
ports:
79+
- 9100:9100
80+
restart: always
81+
deploy:
82+
mode: global
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
5+
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
6+
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: 1
5+
6+
providers:
7+
- name: 'default'
8+
orgId: 1
9+
folder: ''
10+
type: file
11+
disableDeletion: false
12+
updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards
13+
options:
14+
path: /var/lib/grafana/dashboards
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# config file version
5+
apiVersion: 1
6+
7+
# list of datasources that should be deleted from the database
8+
deleteDatasources:
9+
- name: Prometheus
10+
orgId: 1
11+
12+
# list of datasources to insert/update depending
13+
# what's available in the database
14+
datasources:
15+
# <string, required> name of the datasource. Required
16+
- name: Prometheus
17+
# <string, required> datasource type. Required
18+
type: prometheus
19+
# <string, required> access mode. direct or proxy. Required
20+
access: proxy
21+
# <int> org id. will default to orgId 1 if not specified
22+
orgId: 1
23+
# <string> url
24+
url: http://prometheus:9090
25+
# <string> database password, if used
26+
password:
27+
# <string> database user, if used
28+
user:
29+
# <string> database name, if used
30+
database:
31+
# <bool> enable/disable basic auth
32+
basicAuth: false
33+
# <string> basic auth username, if used
34+
basicAuthUser:
35+
# <string> basic auth password, if used
36+
basicAuthPassword:
37+
# <bool> enable/disable with credentials headers
38+
withCredentials:
39+
# <bool> mark as default datasource. Max one per org
40+
isDefault: true
41+
# <map> fields that will be converted to json and stored in json_data
42+
jsonData:
43+
httpMethod: GET
44+
graphiteVersion: "1.1"
45+
tlsAuth: false
46+
tlsAuthWithCACert: false
47+
# <string> json object of data that will be encrypted.
48+
secureJsonData:
49+
tlsCACert: "..."
50+
tlsClientCert: "..."
51+
tlsClientKey: "..."
52+
version: 1
53+
# <bool> allow users to edit datasources from the UI.
54+
editable: true
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright (C) 2025 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL}
4+
global:
5+
scrape_interval: 5s
6+
external_labels:
7+
monitor: "my-monitor"
8+
scrape_configs:
9+
- job_name: "prometheus"
10+
static_configs:
11+
- targets: ["prometheus:9090"]
12+
- job_name: "vllm"
13+
metrics_path: /metrics
14+
static_configs:
15+
- targets: ["vllm-service:80"]
16+
- job_name: "tgi"
17+
metrics_path: /metrics
18+
static_configs:
19+
- targets: ["tgi-service:80"]
20+
- job_name: "tei-embedding"
21+
metrics_path: /metrics
22+
static_configs:
23+
- targets: ["tei-embedding-server:80"]
24+
- job_name: "tei-reranking"
25+
metrics_path: /metrics
26+
static_configs:
27+
- targets: ["tei-reranking-server:80"]
28+
- job_name: "retriever"
29+
metrics_path: /metrics
30+
static_configs:
31+
- targets: ["retriever-redis-server:7000"]
32+
- job_name: "dataprep-redis-service"
33+
metrics_path: /metrics
34+
static_configs:
35+
- targets: ["dataprep-redis-server:5000"]
36+
- job_name: "chatqna-backend-server"
37+
metrics_path: /metrics
38+
static_configs:
39+
- targets: ["chatqna-xeon-backend-server:8888"]
40+
- job_name: "prometheus-node-exporter"
41+
metrics_path: /metrics
42+
static_configs:
43+
- targets: ["node-exporter:9100"]

ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,5 @@ export LOGFLAG=""
1818
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
1919
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
2020
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
21+
# Set no proxy
22+
export no_proxy="$no_proxy,chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,node-exporter,$JAEGER_IP"

ChatQnA/docker_compose/intel/hpu/gaudi/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,10 @@ docker compose up -d
6666

6767
To enable OpenTelemetry tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
6868

69+
> NOTE: To get the supported Grafana dashboards, run download_opea_dashboard.sh as shown in the commands below.
70+
6971
```bash
72+
./grafana/dashboards/download_opea_dashboard.sh
7073
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
7174
```
7275

ChatQnA/docker_compose/intel/hpu/gaudi/compose.telemetry.yaml

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ services:
77
tei-reranking-service:
88
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
99
jaeger:
10-
image: jaegertracing/all-in-one:latest
10+
image: jaegertracing/all-in-one:1.67.0
1111
container_name: jaeger
1212
ports:
1313
- "16686:16686"
@@ -21,6 +21,67 @@ services:
2121
https_proxy: ${https_proxy}
2222
COLLECTOR_ZIPKIN_HOST_PORT: 9411
2323
restart: unless-stopped
24+
prometheus:
25+
image: prom/prometheus:v2.52.0
26+
container_name: prometheus
27+
user: root
28+
volumes:
29+
- ./prometheus.yaml:/etc/prometheus/prometheus.yaml
30+
- ./prometheus_data:/prometheus
31+
command:
32+
- '--config.file=/etc/prometheus/prometheus.yaml'
33+
ports:
34+
- '9090:9090'
35+
ipc: host
36+
restart: unless-stopped
37+
grafana:
38+
image: grafana/grafana:11.0.0
39+
container_name: grafana
40+
volumes:
41+
- ./grafana_data:/var/lib/grafana
42+
- ./grafana/dashboards:/var/lib/grafana/dashboards
43+
- ./grafana/provisioning:/etc/grafana/provisioning
44+
user: root
45+
environment:
46+
GF_SECURITY_ADMIN_PASSWORD: admin
47+
GF_RENDERING_CALLBACK_URL: http://grafana:3000/
48+
GF_LOG_FILTERS: rendering:debug
49+
depends_on:
50+
- prometheus
51+
ports:
52+
- '3000:3000'
53+
ipc: host
54+
restart: unless-stopped
55+
node-exporter:
56+
image: prom/node-exporter
57+
container_name: node-exporter
58+
volumes:
59+
- /proc:/host/proc:ro
60+
- /sys:/host/sys:ro
61+
- /:/rootfs:ro
62+
command:
63+
- '--path.procfs=/host/proc'
64+
- '--path.sysfs=/host/sys'
65+
- --collector.filesystem.ignored-mount-points
66+
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
67+
ports:
68+
- 9100:9100
69+
restart: always
70+
deploy:
71+
mode: global
72+
gaudi-exporter:
73+
image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.19.2-32
74+
container_name: gaudi-exporter
75+
volumes:
76+
- /proc:/host/proc:ro
77+
- /sys:/host/sys:ro
78+
- /:/rootfs:ro
79+
- /dev:/dev
80+
ports:
81+
- 41611:41611
82+
restart: always
83+
deploy:
84+
mode: global
2485
chatqna-gaudi-backend-server:
2586
environment:
2687
- ENABLE_OPEA_TELEMETRY=true

0 commit comments

Comments
 (0)