Skip to content

Commit

Permalink
Added high pod cpu usage alert
Browse files Browse the repository at this point in the history
WHY: A high pod CPU usage alert is one of the alerts required for monitoring.
HOW: By adding high-CPU alert templates and Prometheus rule templates, updating the ConfigMaps, and wiring everything together.
  • Loading branch information
Tope Emmanuel committed Mar 17, 2024
1 parent 2183265 commit f2f78eb
Show file tree
Hide file tree
Showing 11 changed files with 158 additions and 45 deletions.
8 changes: 7 additions & 1 deletion cluster/terraform_kubernetes/alertmanager.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,21 @@ data "azurerm_key_vault_secret" "slack_secret" {
key_vault_id = data.azurerm_key_vault.key_vault.id
name = "SLACK-SECRET"
}



# ConfigMap carrying the rendered Alertmanager configuration (config.yml).
#
# The content comes from local.alertmanager_config_content, which renders
# config/prometheus/alertmanager-config.tpl and honours the optional
# var.alertmanager_app_configs override before falling back to
# local.service_configs. Rendering the template a second time here would
# bypass that override and leave the local unused.
#
# NOTE(review): the rendered file embeds each service's Slack api_url
# (slack_secret) in a plain ConfigMap - consider a kubernetes_secret instead.
resource "kubernetes_config_map" "alertmanager_config" {
  metadata {
    name      = "alertmanager-config"
    namespace = kubernetes_namespace.default_list["monitoring"].metadata[0].name
  }

  data = {
    "config.yml" = local.alertmanager_config_content
  }
}

resource "kubernetes_config_map" "alertmanager_templates" {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Alertmanager configuration template (rendered with Terraform templatefile).
# Expects `service_configs`: a list of objects, each providing receiver_name,
# service, slack_secret, slack_channel, text_template and title_template.
global:
templates:
- '/etc/alertmanager-templates/*.tmpl'

# Routing: alerts carrying a `service` label equal to a configured service go
# to that service's receiver; everything else falls through to alert-default.
route:
receiver: alert-default
group_by: ['alertname', 'priority']
group_wait: 1m
repeat_interval: 30m
routes:
%{ for config in service_configs ~}
- receiver: ${config.receiver_name}
match:
service: '${config.service}'
group_wait: 1m
repeat_interval: 10m
%{ endfor ~}

# Receivers: alert-default is declared with a name only (no notifier is
# configured for it in this template); each service gets a Slack receiver
# whose title is emitted only when title_template is non-empty.
receivers:
- name: alert-default
%{ for config in service_configs ~}
- name: ${config.receiver_name}
slack_configs:
- api_url: '${config.slack_secret}'
channel: '${config.slack_channel}'
send_resolved: true
text: '{{ template "${config.text_template}" . }}'
%{ if config.title_template != "" ~}
title: '{{ template "${config.title_template}" . }}'
%{ endif ~}
%{ endfor ~}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,19 @@
{{range .Alerts}}{{.Annotations.DESCRIPTION}}
{{end}}
{{ end }}

{{/* slack.alert.text: message body listing every alert in the notification
     with its summary, optional severity, description and all labels. The
     severity suffix is skipped when an alert carries no severity label, so
     the message never shows an empty pair of backticks. */}}
{{ define "slack.alert.text" }}
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} - *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ end }}
{{ end }}

{{/* slack.alert.title: one-line title taken from the summary annotation of the
     first alert in the notification. The trim markers remove the newlines
     that would otherwise surround the rendered summary. */}}
{{ define "slack.alert.title" -}}
{{ with index .Alerts 0 -}}
{{ .Annotations.summary }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#
# Prometheus alerting rules, rendered with Terraform templatefile().
# see https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
#
# Input: service_configs - a list of objects providing `service` (pod name
# prefix, also emitted as the `service` label that Alertmanager routes on) and
# `threshold` (compared against rate(container_cpu_usage_seconds_total), i.e.
# CPU-seconds per second).
#
# The first group is the original example rule; the loop adds one
# HighPodCpuUsage group per configured service. Each generated alert carries a
# severity label because the Slack text template prints it.
#
groups:
- name: High Number Of DesiredPods
  rules:
  - alert: High Number Of DesiredPods
    expr: kubelet_active_pods > 1000
    for: 30m
    labels:
      severity: slack
    annotations:
      description: High Number Of DesiredPods
      summary: High Number Of DesiredPods

%{ for config in service_configs ~}
- name: pod-cpu-usage-${config.service}
  rules:
  - alert: HighPodCpuUsage-${config.service}
    expr: rate(container_cpu_usage_seconds_total{container!="",pod!="",pod=~"${config.service}.*"}[5m]) > ${config.threshold}
    for: 2m
    labels:
      service: ${config.service}
      severity: critical
    annotations:
      summary: High CPU usage detected in pod ({{ $labels.pod }})
      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using high CPU: {{ $value }}"
%{ endfor ~}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,14 @@ groups:
severity: slack
annotations:
summary: High number of restarted containers

# Fires when a container's CPU usage rate stays above the templated threshold
# for 5 minutes. rate(container_cpu_usage_seconds_total) is measured in CPU
# cores (CPU-seconds per second), so the value is reported in cores rather
# than with a percent sign.
# NOTE(review): the threshold placeholder must be supplied by whatever renders
# this file - confirm the caller passes it.
- name: pod-cpu-usage
  rules:
  - alert: HighPodCpuUsage
    expr: rate(container_cpu_usage_seconds_total{container!="",pod!=""}[5m]) > ${threshold}
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: High CPU usage detected in pod ({{ $labels.pod }})
      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using high CPU: {{ $value }} cores"
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,14 @@ groups:
severity: slack
annotations:
summary: High number of restarted containers

# Fires when a container's CPU usage rate stays above the templated threshold
# for 5 minutes. rate(container_cpu_usage_seconds_total) is measured in CPU
# cores (CPU-seconds per second), so the value is reported in cores rather
# than with a percent sign.
# NOTE(review): the threshold placeholder must be supplied by whatever renders
# this file - confirm the caller passes it.
- name: pod-cpu-usage
  rules:
  - alert: HighPodCpuUsage
    expr: rate(container_cpu_usage_seconds_total{container!="",pod!=""}[5m]) > ${threshold}
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: High CPU usage detected in pod ({{ $labels.pod }})
      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using high CPU: {{ $value }} cores"
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,14 @@ groups:
severity: slack
annotations:
summary: High number of restarted containers

# Fires when a container's CPU usage rate stays above the templated threshold
# for 5 minutes. rate(container_cpu_usage_seconds_total) is measured in CPU
# cores (CPU-seconds per second), so the value is reported in cores rather
# than with a percent sign.
# NOTE(review): the threshold placeholder must be supplied by whatever renders
# this file - confirm the caller passes it.
- name: pod-cpu-usage
  rules:
  - alert: HighPodCpuUsage
    expr: rate(container_cpu_usage_seconds_total{container!="",pod!=""}[5m]) > ${threshold}
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: High CPU usage detected in pod ({{ $labels.pod }})
      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is using high CPU: {{ $value }} cores"
6 changes: 4 additions & 2 deletions cluster/terraform_kubernetes/prometheus.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ resource "kubernetes_config_map" "prometheus" {
}

data = {
"prometheus.rules" = "${file("${path.module}/config/prometheus/${var.config}.prometheus.rules")}"
"prometheus.yml" = "${file("${path.module}/config/prometheus/${var.config}.prometheus.yml")}"
"prometheus.rules" = templatefile("${path.module}/config/prometheus/${var.config}.prometheus-rules.tpl", {
service_configs = local.service_configs
})
"prometheus.yml" = "${file("${path.module}/config/prometheus/${var.config}.prometheus.yml")}"
}

}
Expand Down
42 changes: 38 additions & 4 deletions cluster/terraform_kubernetes/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ variable "slack_channel" {
default = "#teacher-services-infra"
}

# Slack channel presumably meant for testing alert delivery.
# NOTE(review): the default looks like a personal test channel and, unlike
# var.slack_channel ("#teacher-services-infra"), has no leading "#" - confirm
# before relying on it.
variable "test_slack_channel" {
  description = "Slack channel used when testing alert notifications"
  type        = string
  default     = "test_monitoring_tope_emma"
}

# Alertmanager version tag (presumably the container image tag; its use is
# outside this view).
variable "alertmanager_image_version" {
default = "v0.19.0"
}
Expand All @@ -166,6 +170,23 @@ variable "node_exporter_version" {
default = "v1.7.0"
}

# Module-level CPU usage threshold for the high-CPU alert.
# NOTE(review): units are not stated here; the rule templates compare their
# threshold against rate(container_cpu_usage_seconds_total), which would make
# 0.5 mean half a CPU core - confirm.
variable "cpu_threshold" {
description = "The CPU usage threshold for triggering a high CPU usage alert"
type = number
default = 0.5
}

# Optional override for the per-service alerting configuration. When non-null
# it replaces local.service_configs as the input to the Alertmanager config
# template, so it needs the same shape: a list of objects with service,
# receiver_name, slack_secret, slack_channel, text_template and
# title_template.
variable "alertmanager_app_configs" {
description = "Overridable configuration for service configs"
type = any
default = null
}

# Key Vault secret holding the per-service alerting configuration as JSON.
# It is decoded in locals and must contain a "serviceconfig" object keyed by
# service name; the locals read threshold, slack_channel and slack_secret
# from each entry.
data "azurerm_key_vault_secret" "alertmanager_services" {
key_vault_id = data.azurerm_key_vault.key_vault.id
name = "alertmanager-configs"
}

locals {
cluster_name = (
var.cip_tenant ?
Expand Down Expand Up @@ -226,16 +247,29 @@ locals {
var.environment # cluster1, cluster2, etc
)

alertmanager_config_path = "${path.module}/config/prometheus/alertmanager-config.yaml"
alertmanager_config_content = templatefile(local.alertmanager_config_path, {
slack_secret = data.azurerm_key_vault_secret.slack_secret.value,
slack_channel = var.slack_channel
alertmanager_config_content = templatefile("${path.module}/config/prometheus/alertmanager-config.tpl", {
service_configs = var.alertmanager_app_configs != null ? var.alertmanager_app_configs : local.service_configs
})


# Alertmanager notification templates: map of template file name => path of
# the source file inside this module.
template_files = {
"slack.tmpl" = "${path.module}/config/prometheus/alertmanager-slack.yaml"
}

# Same keys as template_files, with each path replaced by that file's contents.
alertmanager_templates = { for k, v in local.template_files : k => file(v) }

# Per-service alerting settings, decoded from the "alertmanager-configs"
# Key Vault secret. The secret must be a JSON object with a "serviceconfig"
# key mapping each service name to an object holding slack_channel,
# slack_secret and (optionally) threshold.
alertmanager_config = jsondecode(data.azurerm_key_vault_secret.alertmanager_services.value)
service_config      = local.alertmanager_config["serviceconfig"]

# One entry per service, in the shape the Alertmanager and Prometheus rule
# templates iterate over.
service_configs = [
  for service_name, service_data in local.service_config : {
    service = service_name
    # Fall back to var.cpu_threshold when a service omits its own threshold,
    # instead of failing the whole plan on a missing attribute.
    threshold      = try(service_data["threshold"], var.cpu_threshold)
    slack_channel  = service_data["slack_channel"]
    slack_secret   = service_data["slack_secret"]
    receiver_name  = service_name
    text_template  = "slack.alert.text"
    title_template = "slack.alert.title"
  }
]

}

0 comments on commit f2f78eb

Please sign in to comment.