diff --git a/cluster/terraform_aks_cluster/azure_metric_alerts.tf b/cluster/terraform_aks_cluster/azure_metric_alerts.tf index 55104fd9..f92a11de 100644 --- a/cluster/terraform_aks_cluster/azure_metric_alerts.tf +++ b/cluster/terraform_aks_cluster/azure_metric_alerts.tf @@ -31,3 +31,60 @@ resource "azurerm_monitor_metric_alert" "node_availability" { lifecycle { ignore_changes = [tags] } } + +data "azurerm_subscription" "current" {} + +resource "azurerm_monitor_metric_alert" "port_exhaustion" { + name = "${var.resource_prefix}-tsc-${var.environment}-port-exhaustion" + resource_group_name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-rg" + scopes = ["/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${var.resource_prefix}-tsc-aks-nodes-${var.environment}-rg/providers/Microsoft.Network/loadBalancers/kubernetes"] + severity = 1 + criteria { + metric_namespace = "microsoft.network/loadbalancers" + metric_name = "SnatConnectionCount" + aggregation = "Total" + operator = "GreaterThan" + threshold = 0 + dimension { + name = "ConnectionState" + operator = "Include" + values = ["failed"] + } + } + + action { + action_group_id = data.azurerm_monitor_action_group.main.id + } + + lifecycle { ignore_changes = [tags] } +} + +resource "azurerm_monitor_metric_alert" "high_port_usage" { + name = "${var.resource_prefix}-tsc-${var.environment}-high-port-usage" + resource_group_name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-rg" + scopes = ["/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${var.resource_prefix}-tsc-aks-nodes-${var.environment}-rg/providers/Microsoft.Network/loadBalancers/kubernetes"] + severity = 2 + criteria { + metric_namespace = "microsoft.network/loadbalancers" + metric_name = "UsedSnatPorts" + aggregation = "Average" + operator = "GreaterThan" + threshold = 900 + dimension { + name = "BackendIPAddress" + operator = "Include" + values = ["*"] + } + dimension { + name = 
"ProtocolType" + operator = "Include" + values = ["TCP"] + } + } + + action { + action_group_id = data.azurerm_monitor_action_group.main.id + } + + lifecycle { ignore_changes = [tags] } +} diff --git a/documentation/monitoring.md b/documentation/monitoring.md index 806cf143..af3d9043 100644 --- a/documentation/monitoring.md +++ b/documentation/monitoring.md @@ -141,3 +141,19 @@ Configuration is managed through Terraform variables: - The action group name follows the format `[resource-prefix]-tsc` - Alert thresholds can be customized per environment - The metric namespace used is `microsoft.containerservice/managedclusters` + +### High Port Usage + +AKS uses an Azure Load Balancer for inbound and outbound connections, which can lead to SNAT port exhaustion if a node makes a lot of network requests. + +If port usage goes over a threshold, we alert on this as a warning so we can take pre-emptive action. + +### Port Exhaustion + +If connections start failing because of port exhaustion, we alert on this as an error. + +### Troubleshooting Port Exhaustion + +Unfortunately, we can't alert on which Kubernetes service is using a high number of ports, so identifying it is a troubleshooting exercise. Follow this guide: + +[Troubleshoot SNAT port exhaustion on Azure Kubernetes Service nodes](https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/connectivity/snat-port-exhaustion?tabs=for-a-linux-pod)