Skip to content

Commit

Permalink
Merge pull request #174 from DFE-Digital/feature/1661-add-pod-cpu-alert-rule
Browse files Browse the repository at this point in the history

Added high pod cpu usage alert
  • Loading branch information
temitope777 authored Mar 26, 2024
2 parents fdce2ef + 9e89ceb commit 1d37cab
Show file tree
Hide file tree
Showing 20 changed files with 184 additions and 126 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ templates/new_service/terraform/application/.terraform.lock.hcl
templates/new_service/terraform/application/vendor/

# terrafile
bin/terrafile
cluster/terraform_kubernetes/vendor/modules/aks

# macOS files
Expand Down
11 changes: 4 additions & 7 deletions cluster/terraform_kubernetes/alertmanager.tf
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
data "azurerm_key_vault_secret" "slack_secret" {
data "azurerm_key_vault_secret" "slack_webhooks" {
for_each = toset(local.slack_secret_names)
name = each.key
key_vault_id = data.azurerm_key_vault.key_vault.id
name = "SLACK-SECRET"
}

resource "kubernetes_config_map" "alertmanager_config" {
metadata {
name = "alertmanager-config"
namespace = kubernetes_namespace.default_list["monitoring"].metadata[0].name
}

data = {
"config.yml" = local.alertmanager_config_content
}
Expand All @@ -18,7 +19,6 @@ resource "kubernetes_config_map" "alertmanager_templates" {
name = "alertmanager-templates"
namespace = kubernetes_namespace.default_list["monitoring"].metadata[0].name
}

data = local.alertmanager_templates
}

Expand Down Expand Up @@ -52,12 +52,9 @@ resource "kubernetes_deployment" "alertmanager" {
"--config.file=/etc/alertmanager/config.yml",
"--storage.path=/alertmanager",
]


port {
container_port = 9093
}

resources {
limits = {
cpu = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,7 @@
"thanos_retention_raw": "3d",
"thanos_retention_5m": "3d",
"thanos_retention_1h": "3d",
"cluster_short": "pt"
"cluster_short": "pt",
"alertmanager_slack_receiver_list": [],
"alertable_apps": {}
}
15 changes: 14 additions & 1 deletion cluster/terraform_kubernetes/config/production.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,18 @@
"thanos_app_cpu": "0.5",
"thanos_compactor_mem": "3Gi",
"thanos_store_mem": "2Gi",
"cluster_short": "pd"
"cluster_short": "pd",
"alertmanager_slack_receiver_list": [
"SLACK_WEBHOOK_ATT"
],
"alertable_apps": {
"bat-production/apply-production": {
"receiver":"SLACK_WEBHOOK_ATT",
"max_cpu": 0.8
},
"bat-production/apply-sandbox": {
"receiver":"SLACK_WEBHOOK_ATT",
"max_cpu": 0.8
}
}
}

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{{/* Named templates for Slack notifications. Each one is evaluated against a notification payload exposing .Alerts. */}}
{{/* slack.devops.text: prints the DESCRIPTION annotation of every alert, one per line. */}}
{{/* NOTE(review): this reads the upper-case DESCRIPTION key, while slack.alert.text reads lower-case description; confirm the alerts routed here really set the upper-case annotation. */}}
{{ define "slack.devops.text" }}
{{range .Alerts}}{{.Annotations.DESCRIPTION}}
{{end}}
{{ end }}

{{/* slack.alert.text: for every alert, prints its summary annotation, severity label, description annotation, and then each label as a name/value bullet (iterating .Labels.SortedPairs). */}}
{{ define "slack.alert.text" }}
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
*Description:* {{ .Annotations.description }}
*Details:*
{{ range .Labels.SortedPairs }} - *{{ .Name }}:* `{{ .Value }}`
{{ end }}
{{ end }}
{{ end }}

{{/* slack.alert.title: uses the summary annotation of the first alert only (index 0), so other alerts in the same notification do not affect the title. */}}
{{ define "slack.alert.title" }}
{{ with index .Alerts 0 }}
{{ .Annotations.summary }}
{{ end }}
{{ end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Alertmanager configuration template.
# Template inputs read below:
#   slack_url       - string; when empty, no global slack_api_url is written and the
#                     default 'slack-notifications' receiver gets no slack_configs.
#   slack_receivers - map iterated as (receiver name, webhook); every entry produces
#                     one child route and one receiver further down.
# NOTE(review): presumably rendered through Terraform's templatefile(); the
# directive/strip-marker syntax used here matches that, but the caller is not in view.
global:
resolve_timeout: 1m
%{ if slack_url != "" ~}
slack_api_url: '${slack_url}'
%{ endif ~}

route:
# The default route, this will be used if the matcher(s) of any of the child routes are not satisfied
receiver: 'slack-notifications'
group_interval: 1m
repeat_interval: 1h
group_by: [period]
routes:
- receiver: 'slack-notifications'
group_interval: 24h
repeat_interval: 24h
match:
period: 'daily'
- receiver: 'slack-notifications'
group_interval: 1m
repeat_interval: 1h
match:
period: 'out-of-hours'
%{ for receiver, webhook in slack_receivers ~}
- receiver: ${ receiver }
group_interval: 1m
repeat_interval: 1h
match:
receiver: '${ receiver }'
%{ endfor ~}

templates:
- /etc/alertmanager-templates/*.tmpl

# Receivers: the default one relies on the global slack_api_url (only present when
# slack_url is non-empty); each generated receiver posts to its own webhook via api_url.
# All of them format messages with the slack.alert.text / slack.alert.title templates.
receivers:
- name: slack-notifications
%{ if slack_url != "" ~}
slack_configs:
- send_resolved: true
text: '{{ template "slack.alert.text" . }}'
title: '{{ template "slack.alert.title" . }}'
%{ endif ~}
%{ for receiver, webhook in slack_receivers ~}
- name: ${ receiver }
slack_configs:
- api_url: ${ webhook }
send_resolved: true
text: '{{ template "slack.alert.text" . }}'
title: '{{ template "slack.alert.title" . }}'
%{ endfor ~}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Prometheus alerting rules template.
# Template input read below:
#   apps - collection of objects exposing app_name, namespace, max_cpu and receiver
#          (receiver may be null); one "High CPU" rule is generated per entry.
groups:
- name: Container restarts
rules:
# Fires when the sum of kube_pod_container_status_restarts_total across all series
# stays above 1000 for 5 minutes.
- alert: High number of restarted containers
expr: sum(kube_pod_container_status_restarts_total) > 1000
for: 5m
labels:
severity: high
annotations:
summary: High number of restarted containers
# One rule per app: average 5m CPU usage rate of the app's pods (pod name matched as
# <app_name>-<suffix>-<suffix>), grouped by namespace/container/image, compared to max_cpu.
# The 'receiver' label is only emitted when the app defines a receiver.
# NOTE(review): the description pipes max_cpu and $value through humanizePercentage,
# i.e. it presents the CPU-seconds rate as a fraction of one core - confirm that is the
# intended unit rather than a fraction of the pod's CPU limit.
- name: High CPU
rules:
%{ for instance in apps ~}
- alert: High CPU Alert for ${instance.app_name}
expr: avg(rate(container_cpu_usage_seconds_total{namespace="${instance.namespace}",pod=~"${instance.app_name}-[a-z0-9]+-[a-z0-9]+",container!=""}[5m])) by (namespace,container,image) > ${instance.max_cpu}
for: 5m
annotations:
summary: High CPU Alert for ${instance.app_name}
description: "Average pod CPU usage has been above {{ ${instance.max_cpu} | humanizePercentage }} in the last 5 minutes (current value: {{ $value | humanizePercentage }})"
labels:
severity: high
app: ${instance.app_name}
%{ if instance.receiver != null }receiver: ${instance.receiver}%{ endif }
%{ endfor ~}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ global:
external_labels:
cluster: prometheus
rule_files:
- /etc/prometheus/prometheus.rules
- /etc/prometheus/app.alert.rules
alerting:
alertmanagers:
- scheme: http
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

15 changes: 14 additions & 1 deletion cluster/terraform_kubernetes/config/test.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,18 @@
"thanos_store_mem": "2Gi",
"thanos_compactor_mem": "3Gi",
"thanos_app_cpu": "0.5",
"cluster_short": "ts"
"cluster_short": "ts",
"alertable_apps": {
"bat-qa/apply-qa": {
"receiver": "SLACK_WEBHOOK_ATT",
"max_cpu": 0.8
},
"bat-staging/apply-staging": {
"receiver": "SLACK_WEBHOOK_ATT",
"max_cpu": 0.8
}
},
"alertmanager_slack_receiver_list": [
"SLACK_WEBHOOK_ATT"
]
}
8 changes: 8 additions & 0 deletions cluster/terraform_kubernetes/data.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Looks up the cluster Key Vault by name (var.cluster_kv) inside the resource
# group var.resource_group_name. Other data sources reference its id.
data "azurerm_key_vault" "key_vault" {
  resource_group_name = var.resource_group_name
  name                = var.cluster_kv
}

# Reads the secrets collection of the Key Vault resolved above.
# NOTE(review): no consumer of this data source is visible in this view - confirm it is used.
data "azurerm_key_vault_secrets" "main" {
  key_vault_id = data.azurerm_key_vault.key_vault.id
}
12 changes: 2 additions & 10 deletions cluster/terraform_kubernetes/prometheus.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
resource "kubernetes_cluster_role" "prometheus" {

metadata {
name = "prometheus"
}
Expand All @@ -23,7 +22,6 @@ resource "kubernetes_cluster_role" "prometheus" {
}

resource "kubernetes_cluster_role_binding" "prometheus" {

metadata {
name = "prometheus"
}
Expand All @@ -39,25 +37,21 @@ resource "kubernetes_cluster_role_binding" "prometheus" {
name = "default"
namespace = "monitoring"
}

}

resource "kubernetes_config_map" "prometheus" {

metadata {
name = "prometheus-server-conf"
namespace = kubernetes_namespace.default_list["monitoring"].metadata[0].name
}

data = {
"prometheus.rules" = "${file("${path.module}/config/prometheus/${var.config}.prometheus.rules")}"
"prometheus.yml" = "${file("${path.module}/config/prometheus/${var.config}.prometheus.yml")}"
"prometheus.yml" = file("${path.module}/config/prometheus/${var.config}.prometheus.yml")
"app.alert.rules" = local.app_alert_rules
}

}

resource "kubernetes_deployment" "prometheus" {

metadata {
name = "prometheus"
namespace = "monitoring"
Expand Down Expand Up @@ -207,14 +201,12 @@ resource "kubernetes_deployment" "prometheus" {
secret_name = kubernetes_secret.thanos.metadata[0].name
}
}

}
}
}
}

resource "kubernetes_service" "prometheus" {

metadata {
name = "prometheus"
namespace = "monitoring"
Expand Down
5 changes: 0 additions & 5 deletions cluster/terraform_kubernetes/statuscake.tf
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
data "azurerm_key_vault" "key_vault" {
name = var.cluster_kv
resource_group_name = var.resource_group_name
}

data "azurerm_key_vault_secret" "statuscake_secret" {
key_vault_id = data.azurerm_key_vault.key_vault.id
name = "STATUSCAKE-API-TOKEN"
Expand Down
Loading

0 comments on commit 1d37cab

Please sign in to comment.