Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add prometheus #148

Merged
merged 1 commit into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cluster/terraform_kubernetes/config/development.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
"welcome_app_hostnames": [
"www.cluster1.development.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3"
"ingress_nginx_version": "4.8.3",
"prometheus_tsdb_retention_time": "12h"
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@
"welcome_app_hostnames": [
"www.platform-test.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3"
"ingress_nginx_version": "4.8.3",
"prometheus_tsdb_retention_time": "12h"
}
3 changes: 2 additions & 1 deletion cluster/terraform_kubernetes/config/production.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@
"development.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3",
"enable_lowpriority_app": true
"enable_lowpriority_app": true,
"prometheus_app_mem": "2Gi"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
#
# Example rule below, which should be removed once real rules are added
#
groups:
- name: container restarts
rules:
- alert: High number of restarted containers
expr: sum(kube_pod_container_status_restarts_total) > 1000
for: 5m
labels:
severity: slack
annotations:
summary: High number of restarted containers
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/configuration
# and
# https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
#
# Initial scrape configs; these will be updated later as required
#
global:
scrape_interval: 1m
evaluation_interval: 1m
# Keep at most 100 sets of details of targets dropped by relabeling.
# This information is used to display in the UI for troubleshooting.
keep_dropped_targets: 100
rule_files:
- /etc/prometheus/prometheus.rules
alerting:
alertmanagers:
- scheme: http
static_configs:
- targets:
- "alertmanager.monitoring.svc:9093"
scrape_configs:
# Scrape config for node-exporter.
- job_name: 'node-exporter'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_endpoints_name]
regex: 'node-exporter'
action: keep
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints to the default/kubernetes
# service so this uses `endpoints` role and uses relabelling to only keep
# the endpoints associated with the default/kubernetes service using the
# default named port `https`. This works for single API server deployments as
# well as HA API server deployments.
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# disabled certificate verification otherwise it fails with incorrect IP
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# Keep only the default/kubernetes service endpoints for the https port.
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
# Scrape config for nodes (kubelet).
- job_name: 'kubernetes-nodes'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape to be configured
# for all the declared ports (or port-free target if none is declared)
# or only some ports.
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
# Only scrape pods that have
# "prometheus_io_scrape = true" annotation.
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
# Scrape config for kube-state-metrics.
- job_name: 'kube-state-metrics'
static_configs:
- targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080']
# Scrape config for Kubelet cAdvisor.
#
# This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
# (those whose names begin with 'container_') have been removed from the
# Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
# retrieve those metrics.
- job_name: 'kubernetes-cadvisor'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
# Example scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# for all or only some endpoints.
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
# Only scrape endpoints that have
# "prometheus_io_scrape = true" annotation.
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
#
# Below were used in the spike and left here for info
# They were a quick fix, so they can very likely be improved
#
# - job_name: 'nginx-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '8443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
# regex: 'ingress-nginx'
# action: keep
# - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
# action: replace
# regex: ([^:]+)(?::\d+)?;(\d+)
# replacement: $1:10254
# target_label: __address__
# - job_name: 'apply-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-review-rm1'
# action: keep
# - job_name: 'apply-sidekiq-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
# action: keep
# - source_labels: [__address__]
# action: replace
# regex: ([^:]+)
# replacement: "$${1}:9394"
# target_label: __address__
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
#
# Example rule below, which should be removed once real rules are added
#
groups:
- name: container restarts
rules:
- alert: High number of restarted containers
expr: sum(kube_pod_container_status_restarts_total) > 1000
for: 5m
labels:
severity: slack
annotations:
summary: High number of restarted containers
Loading
Loading