From 3d34871399c726f80f018d4766e13f9f33f92e49 Mon Sep 17 00:00:00 2001 From: RMcVelia Date: Mon, 22 Jan 2024 12:46:42 +0000 Subject: [PATCH] Add prometheus --- .../config/development.tfvars.json | 3 +- .../prometheus/development.prometheus.rules | 15 ++ .../prometheus/development.prometheus.yml | 160 +++++++++++++++++ cluster/terraform_kubernetes/prometheus.tf | 164 ++++++++++++++++++ cluster/terraform_kubernetes/variables.tf | 8 + documentation/monitoring.md | 10 ++ scripts/pfwd.sh | 20 +++ 7 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 cluster/terraform_kubernetes/config/prometheus/development.prometheus.rules create mode 100644 cluster/terraform_kubernetes/config/prometheus/development.prometheus.yml create mode 100644 cluster/terraform_kubernetes/prometheus.tf create mode 100755 scripts/pfwd.sh diff --git a/cluster/terraform_kubernetes/config/development.tfvars.json b/cluster/terraform_kubernetes/config/development.tfvars.json index 853cd44e..0b56414f 100644 --- a/cluster/terraform_kubernetes/config/development.tfvars.json +++ b/cluster/terraform_kubernetes/config/development.tfvars.json @@ -14,5 +14,6 @@ "welcome_app_hostnames": [ "www.cluster1.development.teacherservices.cloud" ], - "ingress_nginx_version": "4.8.3" + "ingress_nginx_version": "4.8.3", + "enable_prometheus": true } diff --git a/cluster/terraform_kubernetes/config/prometheus/development.prometheus.rules b/cluster/terraform_kubernetes/config/prometheus/development.prometheus.rules new file mode 100644 index 00000000..e97b6c8f --- /dev/null +++ b/cluster/terraform_kubernetes/config/prometheus/development.prometheus.rules @@ -0,0 +1,15 @@ +# +# see https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ +# +# Example rule below, which should be removed once real rules added +# +groups: +- name: container restarts + rules: + - alert: High number of restarted containers + expr: sum(kube_pod_container_status_restarts_total) > 1000 + for: 5m + labels: + 
severity: slack + annotations: + summary: High number of restarted containers diff --git a/cluster/terraform_kubernetes/config/prometheus/development.prometheus.yml b/cluster/terraform_kubernetes/config/prometheus/development.prometheus.yml new file mode 100644 index 00000000..0440796c --- /dev/null +++ b/cluster/terraform_kubernetes/config/prometheus/development.prometheus.yml @@ -0,0 +1,160 @@ +# +# see https://prometheus.io/docs/prometheus/latest/configuration/configuration +# +# initial scrape configs added, these will be updated later as required +# +global: + scrape_interval: 1m + evaluation_interval: 1m +rule_files: + - /etc/prometheus/prometheus.rules +alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: + - "alertmanager.monitoring.svc:9093" +scrape_configs: + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: 'node-exporter' + action: keep + # - job_name: 'nginx-pods' + # kubernetes_sd_configs: + # - role: pod + # relabel_configs: + # - source_labels: [__meta_kubernetes_pod_container_port_number] + # regex: '443' + # action: drop + # - source_labels: [__meta_kubernetes_pod_container_port_number] + # regex: '8443' + # action: drop + # - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] + # regex: 'ingress-nginx' + # action: keep + # - source_labels: [__address__, __meta_kubernetes_pod_container_port_number] + # action: replace + # regex: ([^:]+)(?::\d+)?;(\d+) + # replacement: $1:10254 + # target_label: __address__ + # - job_name: 'apply-pods' + # kubernetes_sd_configs: + # - role: pod + # relabel_configs: + # - source_labels: [__meta_kubernetes_pod_label_app] + # regex: 'apply-review-rm1' + # action: keep + # - job_name: 'apply-sidekiq-pods' + # kubernetes_sd_configs: + # - role: pod + # relabel_configs: + # - source_labels: [__meta_kubernetes_pod_label_app] + # regex: 
'apply-secondary-worker-review-rm1|apply-worker-review-rm1' + # action: keep + # - source_labels: [__address__] + # action: replace + # regex: ([^:]+) + # replacement: "$${1}:9394" + # target_label: __address__ + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - job_name: 'kubernetes-nodes' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: 
kubernetes_pod_name + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080'] + - job_name: 'kubernetes-cadvisor' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name diff --git a/cluster/terraform_kubernetes/prometheus.tf b/cluster/terraform_kubernetes/prometheus.tf new file mode 100644 index 00000000..a1fc6125 --- /dev/null +++ b/cluster/terraform_kubernetes/prometheus.tf @@ -0,0 +1,164 @@ +resource "kubernetes_cluster_role" "prometheus" { + count = var.enable_prometheus ? 
1 : 0 + + metadata { + name = "prometheus" + } + + rule { + api_groups = [""] + resources = ["nodes", "nodes/proxy", "services", "endpoints", "pods", ] + verbs = ["get", "list", "watch", ] + } + + rule { + api_groups = ["extensions", "networking.k8s.io", ] + resources = ["ingresses", ] + verbs = ["get", "list", "watch", ] + } + + rule { + non_resource_urls = ["/metrics", ] + verbs = ["get", ] + } +} + +resource "kubernetes_cluster_role_binding" "prometheus" { + count = var.enable_prometheus ? 1 : 0 + + metadata { + name = "prometheus" + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "prometheus" + } + + subject { + kind = "ServiceAccount" + name = "default" + namespace = "monitoring" + } + +} + +resource "kubernetes_config_map" "prometheus" { + count = var.enable_prometheus ? 1 : 0 + + metadata { + name = "prometheus-server-conf" + namespace = "monitoring" + } + + data = { + "prometheus.rules" = file("${path.module}/config/prometheus/${var.config}.prometheus.rules") + "prometheus.yml" = file("${path.module}/config/prometheus/${var.config}.prometheus.yml") + } + +} + +resource "kubernetes_deployment" "prometheus" { + count = var.enable_prometheus ? 
1 : 0 + + metadata { + name = "prometheus" + namespace = "monitoring" + } + + spec { + replicas = 1 + + selector { + match_labels = { + app = "prometheus-server" + } + } + + template { + metadata { + labels = { + app = "prometheus-server" + } + } + + spec { + container { + image = "prom/prometheus:${var.prometheus_version}" + name = "prometheus" + + args = [ + "--storage.tsdb.retention.time=12h", + "--config.file=/etc/prometheus/prometheus.yml", + "--storage.tsdb.path=/prometheus/", + ] + + port { + container_port = 9090 + } + + resources { + limits = { + cpu = 1 + memory = "1Gi" + } + requests = { + cpu = "500m" + memory = "500M" + } + } + + volume_mount { + mount_path = "/etc/prometheus/" + name = "prometheus-config-volume" + } + + volume_mount { + mount_path = "/prometheus/" + name = "prometheus-storage-volume" + } + } + + volume { + name = "prometheus-config-volume" + config_map { + # default_mode = "0420" + name = "prometheus-server-conf" + } + } + + volume { + name = "prometheus-storage-volume" + empty_dir {} + } + + } + } + } +} + +resource "kubernetes_service" "prometheus" { + count = var.enable_prometheus ? 
1 : 0 + + metadata { + name = "prometheus" + namespace = "monitoring" + annotations = { + "prometheus.io/port" = "9090" + "prometheus.io/scrape" = "true" + } + } + + spec { + port { + node_port = 30000 + port = 8080 + target_port = 9090 + } + selector = { + app = "prometheus-server" + } + type = "NodePort" + } +} diff --git a/cluster/terraform_kubernetes/variables.tf b/cluster/terraform_kubernetes/variables.tf index 4055cf79..a910c863 100644 --- a/cluster/terraform_kubernetes/variables.tf +++ b/cluster/terraform_kubernetes/variables.tf @@ -64,6 +64,14 @@ variable "lowpriority_app_replicas" { default = 3 } +variable "enable_prometheus" { + type = bool + default = false +} + +variable "prometheus_version" { + default = "v2.49.1" +} locals { cluster_name = ( diff --git a/documentation/monitoring.md b/documentation/monitoring.md index 284b84b9..49592a29 100644 --- a/documentation/monitoring.md +++ b/documentation/monitoring.md @@ -20,3 +20,13 @@ Terraform created statuscake monitoring for the permanent clusters. These monitor https://status.${cluster}/healthz for each cluster, and will email and page the TS infra team on failure. + +## Prometheus + +Prometheus monitoring can be enabled for a cluster by adding "enable_prometheus":true in the kubernetes terraform env.tfvars.json file + +The default prometheus version is hardcoded in the kubernetes variables.tf. It can be overridden for a cluster by adding prometheus_version to the env.tfvars.json file. + +Prometheus rules and yml config files are loaded from the terraform_kubernetes/config/prometheus directory. Each file is prefixed with the cluster env. +e.g. development.prometheus.rules and development.prometheus.yml +Currently a restart/reload of the prometheus process is required if changes are made to these files. 
diff --git a/scripts/pfwd.sh b/scripts/pfwd.sh
new file mode 100755
index 00000000..cfc1ec87
--- /dev/null
+++ b/scripts/pfwd.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Script to open a connection to prometheus (forwards localhost:8080 to port 9090 of the prometheus-server pod)
+# Requires kubectl config to be already pointing to the target cluster
+PROM=$(kubectl get pods -n monitoring -l app=prometheus-server --no-headers=true -o name | head -n 1)
+# AMAN=`kubectl get pods -n monitoring -l app=alertmanager --no-headers=true -o name`
+# GRAF=`kubectl get pods -n monitoring -l app=grafana --no-headers=true -o name`
+[ -n "$PROM" ] || { echo "No prometheus-server pod found in namespace monitoring" >&2; exit 1; }
+echo "$PROM"
+# echo $AMAN
+# echo $GRAF
+kubectl port-forward -n monitoring "$PROM" 8080:9090 &
+# kubectl port-forward -n monitoring $AMAN 8081:9093 &
+# kubectl port-forward -n monitoring $GRAF 3000 &
+echo
+echo Prometheus at http://localhost:8080
+# echo Alertmanager at http://localhost:8081
+# echo Grafana at http://localhost:3000
+echo
+echo kill with pkill kubectl
+echo