Skip to content

Commit

Permalink
Add prometheus
Browse files Browse the repository at this point in the history
  • Loading branch information
RMcVelia committed Jan 23, 2024
1 parent 6af27ec commit b6bc56c
Show file tree
Hide file tree
Showing 7 changed files with 379 additions and 1 deletion.
3 changes: 2 additions & 1 deletion cluster/terraform_kubernetes/config/development.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
"welcome_app_hostnames": [
"www.cluster1.development.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3"
"ingress_nginx_version": "4.8.3",
"enable_prometheus": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
#
# Example rule below, which should be removed once real rules added
#
groups:
  - name: container restarts
    rules:
      # Fires when the cluster-wide running total of container restarts
      # (summed over every pod/container) exceeds 1000 for 5 minutes.
      # NOTE(review): kube_pod_container_status_restarts_total is a
      # counter that only grows, so this example will eventually fire
      # permanently — another reason to replace it with real rules.
      - alert: High number of restarted containers
        expr: sum(kube_pod_container_status_restarts_total) > 1000
        for: 5m
        labels:
          severity: slack   # presumably matched by an Alertmanager route — confirm receiver config
        annotations:
          summary: High number of restarted containers
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/configuration
#
# initial scrape configs added, these will be updated later as required
#
# NOTE: this file is loaded by Terraform's file() function, which does NOT
# render templates, so regex capture groups are written as $1/$2 (matching
# the kubernetes-pods job below). The Terraform-escaped form $${1} would
# reach prometheus literally and expand to the broken text "${1}", because
# Go regexp expansion treats "$$" as a literal "$".
#
global:
  scrape_interval: 1m       # default interval between scrapes
  evaluation_interval: 1m   # interval between alerting/recording rule evaluations

# Alerting rules; mounted from the prometheus-server-conf ConfigMap
rule_files:
  - /etc/prometheus/prometheus.rules

# Deliver fired alerts to the in-cluster alertmanager service
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - "alertmanager.monitoring.svc:9093"

scrape_configs:
  # node-exporter pods, matched by their endpoints object name
  - job_name: 'node-exporter'
    kubernetes_sd_configs:
      - role: endpoints
    relabel_configs:
      - source_labels: [__meta_kubernetes_endpoints_name]
        regex: 'node-exporter'
        action: keep

  # - job_name: 'nginx-pods'
  #   kubernetes_sd_configs:
  #     - role: pod
  #   relabel_configs:
  #     - source_labels: [__meta_kubernetes_pod_container_port_number]
  #       regex: '443'
  #       action: drop
  #     - source_labels: [__meta_kubernetes_pod_container_port_number]
  #       regex: '8443'
  #       action: drop
  #     - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
  #       regex: 'ingress-nginx'
  #       action: keep
  #     - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
  #       action: replace
  #       regex: ([^:]+)(?::\d+)?;(\d+)
  #       replacement: $1:10254
  #       target_label: __address__

  # - job_name: 'apply-pods'
  #   kubernetes_sd_configs:
  #     - role: pod
  #   relabel_configs:
  #     - source_labels: [__meta_kubernetes_pod_label_app]
  #       regex: 'apply-review-rm1'
  #       action: keep

  # - job_name: 'apply-sidekiq-pods'
  #   kubernetes_sd_configs:
  #     - role: pod
  #   relabel_configs:
  #     - source_labels: [__meta_kubernetes_pod_label_app]
  #       regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
  #       action: keep
  #     - source_labels: [__address__]
  #       action: replace
  #       regex: ([^:]+)
  #       replacement: "$1:9394"
  #       target_label: __address__

  # The API server itself, via the default/kubernetes endpoints
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

  # kubelet metrics, proxied through the API server
  - job_name: 'kubernetes-nodes'
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        # was $${1}: file() passes that through verbatim and prometheus
        # expands it to the literal, broken path /api/v1/nodes/${1}/proxy/metrics
        replacement: /api/v1/nodes/$1/proxy/metrics

  # Any pod annotated prometheus.io/scrape: "true"; honours the optional
  # prometheus.io/path and prometheus.io/port annotations
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: 'true'
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

  - job_name: 'kube-state-metrics'
    static_configs:
      - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080']

  # Per-container resource metrics from the kubelet's cadvisor endpoint
  - job_name: 'kubernetes-cadvisor'
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        # was $${1} — see the note at the top of this file
        replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor

  # Any service annotated prometheus.io/scrape: "true"
  - job_name: 'kubernetes-service-endpoints'
    kubernetes_sd_configs:
      - role: endpoints
    relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: 'true'
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
164 changes: 164 additions & 0 deletions cluster/terraform_kubernetes/prometheus.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Cluster-wide read-only access for the prometheus service discovery
# (kubernetes_sd_configs roles: node, endpoints, pod, service).
resource "kubernetes_cluster_role" "prometheus" {
  count = var.enable_prometheus ? 1 : 0

  metadata {
    name = "prometheus"
  }

  # Core objects discovered/scraped by the jobs in prometheus.yml
  rule {
    api_groups = [""]
    resources  = ["nodes", "nodes/proxy", "services", "endpoints", "pods"]
    verbs      = ["get", "list", "watch"]
  }

  # "networking.k8s.io" is the current Ingress API group; "extensions" is
  # kept for compatibility with clusters still serving the legacy group
  # (removed from the API server in Kubernetes 1.22).
  rule {
    api_groups = ["extensions", "networking.k8s.io"]
    resources  = ["ingresses"]
    verbs      = ["get", "list", "watch"]
  }

  # Allows scraping bare non-resource endpoints such as the API server /metrics
  rule {
    non_resource_urls = ["/metrics"]
    verbs             = ["get"]
  }
}

# Binds the prometheus ClusterRole to the default ServiceAccount of the
# "monitoring" namespace (the account the prometheus pod runs under).
resource "kubernetes_cluster_role_binding" "prometheus" {
  count = var.enable_prometheus ? 1 : 0

  metadata {
    name = "prometheus"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    # Reference the managed resource (rather than repeating the literal
    # string) so Terraform creates the role before the binding.
    name = kubernetes_cluster_role.prometheus[0].metadata[0].name
  }

  subject {
    kind      = "ServiceAccount"
    name      = "default"
    namespace = "monitoring"
  }
}

# Per-environment prometheus configuration, mounted into the deployment at
# /etc/prometheus/. Files are loaded verbatim with file() — no template
# interpolation is applied to their contents.
resource "kubernetes_config_map" "prometheus" {
  count = var.enable_prometheus ? 1 : 0

  metadata {
    name      = "prometheus-server-conf"
    namespace = "monitoring"
  }

  # Terraform 0.12+ syntax: a bare function call — the old "${file(...)}"
  # wrapper was a redundant interpolation around a single expression.
  data = {
    "prometheus.rules" = file("${path.module}/config/prometheus/${var.config}.prometheus.rules")
    "prometheus.yml"   = file("${path.module}/config/prometheus/${var.config}.prometheus.yml")
  }
}

# Single-replica prometheus server. Storage is an emptyDir, so the TSDB
# (12h retention) is lost whenever the pod is rescheduled.
resource "kubernetes_deployment" "prometheus" {
  count = var.enable_prometheus ? 1 : 0

  metadata {
    name      = "prometheus"
    namespace = "monitoring"
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "prometheus-server"
      }
    }

    template {
      metadata {
        labels = {
          app = "prometheus-server"
        }
      }

      spec {
        container {
          image = "prom/prometheus:${var.prometheus_version}"
          name  = "prometheus"

          args = [
            "--storage.tsdb.retention.time=12h",
            "--config.file=/etc/prometheus/prometheus.yml",
            "--storage.tsdb.path=/prometheus/",
          ]

          port {
            container_port = 9090
          }

          resources {
            limits = {
              cpu    = 1
              memory = "1Gi"
            }
            requests = {
              cpu = "500m"
              # NOTE(review): "500M" is the decimal unit (500 MB) while the
              # limit uses the binary unit "1Gi" — confirm the mix is intended.
              memory = "500M"
            }
          }

          # prometheus.yml and prometheus.rules from the ConfigMap below
          volume_mount {
            mount_path = "/etc/prometheus/"
            name       = "prometheus-config-volume"
          }

          volume_mount {
            mount_path = "/prometheus/"
            name       = "prometheus-storage-volume"
          }
        }

        volume {
          name = "prometheus-config-volume"
          config_map {
            # default_mode = "0420"
            # Reference the managed ConfigMap (instead of repeating the
            # literal name) so Terraform orders creation correctly.
            name = kubernetes_config_map.prometheus[0].metadata[0].name
          }
        }

        # Ephemeral TSDB storage — cleared on pod restart/reschedule
        volume {
          name = "prometheus-storage-volume"
          empty_dir {}
        }
      }
    }
  }
}

# Exposes the prometheus server inside the cluster on port 8080 and on
# every node on NodePort 30000 (container listens on 9090).
resource "kubernetes_service" "prometheus" {
  count = var.enable_prometheus ? 1 : 0

  metadata {
    name      = "prometheus"
    namespace = "monitoring"
    # These annotations let prometheus scrape its own /metrics endpoint
    # via the kubernetes-service-endpoints job in prometheus.yml.
    annotations = {
      "prometheus.io/port"   = "9090"
      "prometheus.io/scrape" = "true"
    }
  }

  spec {
    port {
      node_port   = 30000 # fixed port on each node's IP
      port        = 8080  # in-cluster service port
      target_port = 9090  # prometheus container port
    }
    selector = {
      app = "prometheus-server"
    }
    type = "NodePort"
  }
}
8 changes: 8 additions & 0 deletions cluster/terraform_kubernetes/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ variable "lowpriority_app_replicas" {
default = 3
}

variable "enable_prometheus" {
  description = "Deploy the prometheus monitoring stack into the cluster"
  type        = bool
  default     = false
}

variable "prometheus_version" {
  description = "Tag of the prom/prometheus container image to deploy"
  type        = string
  default     = "v2.49.1"
}

locals {
cluster_name = (
Expand Down
10 changes: 10 additions & 0 deletions documentation/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,13 @@ Terraform created statuscake monitoring for the permanent clusters.

These monitor https://status.${cluster}/healthz for each cluster,
and will email and page the TS infra team on failure.

## Prometheus

Prometheus monitoring can be enabled for a cluster by adding `"enable_prometheus": true` in the kubernetes terraform env.tfvars.json file

The default prometheus version is hardcoded in the kubernetes variables.tf. It can be overridden for a cluster by adding prometheus_version to the env.tfvars.json file.

Prometheus rules and yml config files are loaded from the terraform_kubernetes/config/prometheus directory. Each file is prefixed with the cluster env.
e.g. development.prometheus.rules and development.prometheus.yml
Currently a restart/reload of the prometheus process is required if changes are made to these files.
20 changes: 20 additions & 0 deletions scripts/pfwd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
#
# Script to open a connection to prometheus
# Requires kubectl config to be already pointing to the target cluster
#
set -euo pipefail

# Pod name (pod/prometheus-xxxx) of the single prometheus-server pod.
# $(...) replaces the deprecated backtick form; expansions are quoted so
# an unexpected multi-word result cannot be word-split.
PROM=$(kubectl get pods -n monitoring -l app=prometheus-server --no-headers=true -o name)
# AMAN=$(kubectl get pods -n monitoring -l app=alertmanager --no-headers=true -o name)
# GRAF=$(kubectl get pods -n monitoring -l app=grafana --no-headers=true -o name)
echo "$PROM"
# echo "$AMAN"
# echo "$GRAF"

# Forward local port 8080 to the prometheus container port 9090, backgrounded
kubectl port-forward -n monitoring "$PROM" 8080:9090 &
# kubectl port-forward -n monitoring "$AMAN" 8081:9093 &
# kubectl port-forward -n monitoring "$GRAF" 3000 &
echo
echo "Prometheus at http://localhost:8080"
# echo "Alertmanager at http://localhost:8081"
# echo "Grafana at http://localhost:3000"
echo
echo "kill with pkill kubectl"
echo

0 comments on commit b6bc56c

Please sign in to comment.