Skip to content

Commit

Permalink
Add prometheus
Browse files Browse the repository at this point in the history
  • Loading branch information
RMcVelia committed Feb 5, 2024
1 parent 7165774 commit e333753
Show file tree
Hide file tree
Showing 16 changed files with 1,078 additions and 4 deletions.
3 changes: 2 additions & 1 deletion cluster/terraform_kubernetes/config/development.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
"welcome_app_hostnames": [
"www.cluster1.development.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3"
"ingress_nginx_version": "4.8.3",
"prometheus_tsdb_retention_time": "12h"
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@
"welcome_app_hostnames": [
"www.platform-test.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3"
"ingress_nginx_version": "4.8.3",
"prometheus_tsdb_retention_time": "12h"
}
3 changes: 2 additions & 1 deletion cluster/terraform_kubernetes/config/production.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@
"development.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3",
"enable_lowpriority_app": true
"enable_lowpriority_app": true,
"prometheus_app_mem": "2Gi"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
#
# Example rule below, which should be removed once real rules are added
#
# Alerting rule groups. Nesting restored: each group owns a `rules` list and
# each rule owns its `labels` / `annotations` mappings.
groups:
- name: container restarts
  rules:
  # Fires when the summed container restart counter has stayed above 1000
  # for 5 minutes.
  - alert: High number of restarted containers
    expr: sum(kube_pod_container_status_restarts_total) > 1000
    for: 5m
    labels:
      # NOTE(review): presumably used by Alertmanager routing to pick the
      # Slack receiver - confirm against the Alertmanager config.
      severity: slack
    annotations:
      summary: High number of restarted containers
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/configuration
# and
# https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
#
# initial scrape configs added, these will be updated later as required
#
# Global defaults: scrape targets and evaluate rules once a minute.
global:
  scrape_interval: 1m
  evaluation_interval: 1m
  # Keep at most 100 sets of details of targets dropped by relabeling.
  # This information is used to display in the UI for troubleshooting.
  keep_dropped_targets: 100
# Rule files for Prometheus to load.
# NOTE(review): presumably the alert rules file added alongside this config is
# mounted at this path - confirm against the deployment.
rule_files:
- /etc/prometheus/prometheus.rules
# Alert delivery: one statically configured Alertmanager target, reached over
# plain HTTP.
alerting:
  alertmanagers:
  - scheme: http
    static_configs:
    - targets:
      - "alertmanager.monitoring.svc:9093"
scrape_configs:
# Scrape config for node-exporter.
#
# Discovers endpoints and keeps only those whose Endpoints object is named
# `node-exporter`.
- job_name: 'node-exporter'
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  - source_labels: [__meta_kubernetes_endpoints_name]
    regex: 'node-exporter'
    action: keep
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints to the default/kubernetes
# service so this uses `endpoints` role and uses relabelling to only keep
# the endpoints associated with the default/kubernetes service using the
# default named port `https`. This works for single API server deployments as
# well as HA API server deployments.
- job_name: 'kubernetes-apiservers'
  kubernetes_sd_configs:
  - role: endpoints
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # disabled certificate verification otherwise it fails with incorrect IP
    insecure_skip_verify: true
  # Authenticate with the pod's service account token.
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  # Keep only the default/kubernetes service endpoints for the https port.
  relabel_configs:
  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
    action: keep
    regex: default;kubernetes;https
# Scrape config for nodes (kubelet).
#
# Every discovered node is scraped through the API server proxy rather than
# directly: the address is rewritten to the in-cluster API server and the
# metrics path to that node's proxy endpoint.
- job_name: 'kubernetes-nodes'
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  # Copy node labels onto the scraped series.
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
  - target_label: __address__
    replacement: kubernetes.default.svc:443
  # NOTE(review): `${1}` is the relabel capture-group reference. If this file
  # is ever rendered through a templating step that interpolates `${...}`,
  # it would need escaping - confirm how the file is loaded.
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __metrics_path__
    replacement: /api/v1/nodes/${1}/proxy/metrics
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape to be configured
# for all the declared ports (or port-free target if none is declared)
# or only some ports.
- job_name: 'kubernetes-pods'
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  # Only scrape pods that have
  # "prometheus_io_scrape = true" annotation.
  # The regex is quoted so YAML keeps it a string rather than a boolean.
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    action: keep
    regex: 'true'
  # Use the annotated metrics path when one is set.
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  # Replace the port of the discovered address with the annotated port.
  - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
    target_label: __address__
  # Copy pod labels onto the scraped series.
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
# Scrape config for kube-state-metrics.
#
# Single static target: the kube-state-metrics service in kube-system.
- job_name: 'kube-state-metrics'
  static_configs:
  - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080']
# Scrape config for Kubelet cAdvisor.
#
# This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
# (those whose names begin with 'container_') have been removed from the
# Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
# retrieve those metrics.
- job_name: 'kubernetes-cadvisor'
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  # Copy node labels onto the scraped series.
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
  - target_label: __address__
    replacement: kubernetes.default.svc:443
  # NOTE(review): `${1}` is the relabel capture-group reference. If this file
  # is ever rendered through a templating step that interpolates `${...}`,
  # it would need escaping - confirm how the file is loaded.
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __metrics_path__
    replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
# Example scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# for all or only some endpoints.
- job_name: 'kubernetes-service-endpoints'
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  # Only scrape endpoints that have
  # "prometheus_io_scrape = true" annotation.
  # The regex is quoted so YAML keeps it a string rather than a boolean.
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    action: keep
    regex: 'true'
  # Use the annotated scheme when it is http or https.
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
    action: replace
    target_label: __scheme__
    regex: (https?)
  # Use the annotated metrics path when one is set.
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  # Replace the port of the discovered address with the annotated port.
  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    target_label: __address__
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
  # Copy service labels onto the scraped series.
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    target_label: kubernetes_name
#
# Below were used in the spike and left here for info
# They were a quick fix, so very likely can be improved
#
# - job_name: 'nginx-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '8443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
# regex: 'ingress-nginx'
# action: keep
# - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
# action: replace
# regex: ([^:]+)(?::\d+)?;(\d+)
# replacement: $1:10254
# target_label: __address__
# - job_name: 'apply-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-review-rm1'
# action: keep
# - job_name: 'apply-sidekiq-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
# action: keep
# - source_labels: [__address__]
# action: replace
# regex: ([^:]+)
# replacement: "$${1}:9394"
# target_label: __address__
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
#
# Example rule below, which should be removed once real rules are added
#
# Alerting rule groups. Nesting restored: each group owns a `rules` list and
# each rule owns its `labels` / `annotations` mappings.
groups:
- name: container restarts
  rules:
  # Fires when the summed container restart counter has stayed above 1000
  # for 5 minutes.
  - alert: High number of restarted containers
    expr: sum(kube_pod_container_status_restarts_total) > 1000
    for: 5m
    labels:
      # NOTE(review): presumably used by Alertmanager routing to pick the
      # Slack receiver - confirm against the Alertmanager config.
      severity: slack
    annotations:
      summary: High number of restarted containers
Loading

0 comments on commit e333753

Please sign in to comment.