Skip to content

Commit

Permalink
Add prometheus
Browse files Browse the repository at this point in the history
  • Loading branch information
RMcVelia committed Jan 31, 2024
1 parent 1cfd963 commit 8d047ef
Show file tree
Hide file tree
Showing 6 changed files with 408 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
#
# Example rule below; it should be removed once real rules are added.
#
# Alerting rule groups loaded by Prometheus.
groups:
- name: container restarts
  rules:
  # Placeholder alert: fires when the summed restart counter stays above
  # 1000 for five minutes.
  # NOTE(review): kube_pod_container_status_restarts_total is a cumulative
  # counter, so this compares a running total rather than recent restarts;
  # consider increase(...[1h]) when writing the real rules.
  - alert: High number of restarted containers
    expr: sum(kube_pod_container_status_restarts_total) > 1000
    for: 5m
    annotations:
      summary: High number of restarted containers
    labels:
      severity: slack
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#
# see https://prometheus.io/docs/prometheus/latest/configuration/configuration
# and
# https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-kubernetes.yml
#
# Initial scrape configs; these will be updated later as required.
#
# Settings applied globally to scraping and rule evaluation.
global:
  # Scrape targets and evaluate rules once a minute.
  scrape_interval: 1m
  evaluation_interval: 1m
  # Retain details for at most 100 targets dropped by relabeling; the UI
  # displays this information for troubleshooting.
  keep_dropped_targets: 100
# Rule files to load; this path is the "prometheus.rules" key of the
# prometheus-server-conf ConfigMap, which is mounted at /etc/prometheus/.
rule_files:
- '/etc/prometheus/prometheus.rules'
# Alertmanager that receives alerts, addressed over plain HTTP by its
# service name in the monitoring namespace.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'alertmanager.monitoring.svc:9093'
    scheme: http
scrape_configs:
# Scrape config for node-exporter.
#
# Discovers every endpoints object and keeps only the one named
# `node-exporter`.
- job_name: 'node-exporter'
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  - action: keep
    source_labels:
    - __meta_kubernetes_endpoints_name
    regex: 'node-exporter'
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints of the default/kubernetes
# service, so this job uses the `endpoints` role and relabelling to keep only
# the endpoints of that service on its default named port `https`. This works
# for single API server deployments as well as HA API server deployments.
- job_name: 'kubernetes-apiservers'
  kubernetes_sd_configs:
  - role: endpoints
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # Targets are dialled by endpoint IP, which the serving certificate may
    # not list. Verify the certificate against the in-cluster service name
    # (the same name the node and cAdvisor jobs connect to) instead of
    # disabling verification, so the bearer token is only sent to a server
    # that proves its identity.
    server_name: kubernetes.default.svc
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  # Keep only the default/kubernetes service endpoints for the https port.
  - source_labels:
    - __meta_kubernetes_namespace
    - __meta_kubernetes_service_name
    - __meta_kubernetes_endpoint_port_name
    action: keep
    regex: default;kubernetes;https
# Scrape config for nodes (kubelet).
#
# Each discovered node is scraped through the API server's node proxy path
# rather than by contacting the node directly.
- job_name: 'kubernetes-nodes'
  kubernetes_sd_configs:
  - role: node
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  relabel_configs:
  # Copy every Kubernetes node label onto the target.
  - regex: '__meta_kubernetes_node_label_(.+)'
    action: labelmap
  # Send the request to the API server service ...
  - replacement: 'kubernetes.default.svc:443'
    target_label: __address__
  # ... using the proxy path for this node's metrics.
  - source_labels:
    - __meta_kubernetes_node_name
    regex: '(.+)'
    replacement: '/api/v1/nodes/${1}/proxy/metrics'
    target_label: __metrics_path__
# Example scrape config for pods.
#
# The relabeling lets each pod configure its own scrape: all declared ports
# (or a port-free target if none is declared), or only some ports.
- job_name: 'kubernetes-pods'
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  # Only scrape pods that have the
  # "prometheus_io_scrape = true" annotation.
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_scrape
    regex: 'true'
  # Use the annotated metrics path when one is set.
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_path
    regex: '(.+)'
    target_label: __metrics_path__
  # Swap the target's port for the annotated port when one is set.
  - action: replace
    source_labels:
    - __address__
    - __meta_kubernetes_pod_annotation_prometheus_io_port
    regex: '([^:]+)(?::\d+)?;(\d+)'
    replacement: '$1:$2'
    target_label: __address__
  # Copy every pod label onto the target.
  - action: labelmap
    regex: '__meta_kubernetes_pod_label_(.+)'
  # Record which namespace and pod the series came from.
  - action: replace
    source_labels:
    - __meta_kubernetes_namespace
    target_label: kubernetes_namespace
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_name
    target_label: kubernetes_pod_name
# Scrape config for kube-state-metrics.
#
# Uses a fixed target: the kube-state-metrics service in the kube-system
# namespace, on port 8080.
- job_name: 'kube-state-metrics'
  static_configs:
  - targets:
    - 'kube-state-metrics.kube-system.svc.cluster.local:8080'
# Scrape config for Kubelet cAdvisor.
#
# Required for Kubernetes 1.7.3 and later, where the cAdvisor metrics (those
# whose names begin with 'container_') were removed from the Kubelet metrics
# endpoint. This job scrapes the cAdvisor endpoint to retrieve them.
- job_name: 'kubernetes-cadvisor'
  kubernetes_sd_configs:
  - role: node
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  relabel_configs:
  # Copy every Kubernetes node label onto the target.
  - regex: '__meta_kubernetes_node_label_(.+)'
    action: labelmap
  # Send the request to the API server service ...
  - replacement: 'kubernetes.default.svc:443'
    target_label: __address__
  # ... using the proxy path for this node's cAdvisor metrics.
  - source_labels:
    - __meta_kubernetes_node_name
    regex: '(.+)'
    replacement: '/api/v1/nodes/${1}/proxy/metrics/cadvisor'
    target_label: __metrics_path__
# Example scrape config for service endpoints.
#
# The relabeling lets each service configure how its endpoints are scraped,
# for all or only some endpoints.
- job_name: 'kubernetes-service-endpoints'
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  # Only scrape endpoints that have the
  # "prometheus_io_scrape = true" annotation.
  - action: keep
    source_labels:
    - __meta_kubernetes_service_annotation_prometheus_io_scrape
    regex: 'true'
  # Use the annotated scheme (http or https) when one is set.
  - action: replace
    source_labels:
    - __meta_kubernetes_service_annotation_prometheus_io_scheme
    regex: '(https?)'
    target_label: __scheme__
  # Use the annotated metrics path when one is set.
  - action: replace
    source_labels:
    - __meta_kubernetes_service_annotation_prometheus_io_path
    regex: '(.+)'
    target_label: __metrics_path__
  # Swap the target's port for the annotated port when one is set.
  - action: replace
    source_labels:
    - __address__
    - __meta_kubernetes_service_annotation_prometheus_io_port
    regex: '([^:]+)(?::\d+)?;(\d+)'
    replacement: '$1:$2'
    target_label: __address__
  # Copy every service label onto the target.
  - action: labelmap
    regex: '__meta_kubernetes_service_label_(.+)'
  # Record which namespace and service the series came from.
  - action: replace
    source_labels:
    - __meta_kubernetes_namespace
    target_label: kubernetes_namespace
  - action: replace
    source_labels:
    - __meta_kubernetes_service_name
    target_label: kubernetes_name
#
# Below were used in the spike and left here for info
# They were a quick fix, so they can very likely be improved
#
# - job_name: 'nginx-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '8443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
# regex: 'ingress-nginx'
# action: keep
# - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
# action: replace
# regex: ([^:]+)(?::\d+)?;(\d+)
# replacement: $1:10254
# target_label: __address__
# - job_name: 'apply-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-review-rm1'
# action: keep
# - job_name: 'apply-sidekiq-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
# action: keep
# - source_labels: [__address__]
# action: replace
# regex: ([^:]+)
# replacement: "$${1}:9394"
# target_label: __address__
#
159 changes: 159 additions & 0 deletions cluster/terraform_kubernetes/prometheus.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Cluster-wide read-only permissions used by Prometheus for Kubernetes
# service discovery and for scraping through the API server.
resource "kubernetes_cluster_role" "prometheus" {
  metadata {
    name = "prometheus"
  }

  # Core API objects used by service discovery. "nodes/proxy" covers the
  # scrape jobs that read metrics via /api/v1/nodes/<node>/proxy/...
  rule {
    api_groups = [""]
    resources  = ["nodes", "nodes/proxy", "services", "endpoints", "pods"]
    verbs      = ["get", "list", "watch"]
  }

  # Ingress is served from "networking.k8s.io"; the "extensions" group
  # stopped serving it in Kubernetes 1.22. "extensions" is kept so the role
  # still works on older clusters.
  rule {
    api_groups = ["extensions", "networking.k8s.io"]
    resources  = ["ingresses"]
    verbs      = ["get", "list", "watch"]
  }

  # Allows GET on the non-resource /metrics URL.
  rule {
    non_resource_urls = ["/metrics"]
    verbs             = ["get"]
  }
}

# Grants the "prometheus" cluster role to the default service account of the
# monitoring namespace.
# NOTE(review): every pod in that namespace using the default service account
# receives these permissions; a dedicated service account may be preferable.
resource "kubernetes_cluster_role_binding" "prometheus" {
  metadata {
    name = "prometheus"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = "monitoring"
    name      = "default"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.prometheus.metadata[0].name
  }
}

# ConfigMap holding the Prometheus server configuration and alerting rules.
# Both files are read from config/prometheus/ in this module, selected by
# var.config.
resource "kubernetes_config_map" "prometheus" {
  metadata {
    name      = "prometheus-server-conf"
    namespace = "monitoring"
  }

  # file() already returns a string, so it is called directly rather than
  # wrapped in a redundant "${...}" interpolation.
  data = {
    "prometheus.rules" = file("${path.module}/config/prometheus/${var.config}.prometheus.rules")
    "prometheus.yml"   = file("${path.module}/config/prometheus/${var.config}.prometheus.yml")
  }
}

# Single-replica Prometheus server in the monitoring namespace. Configuration
# comes from the prometheus-server-conf ConfigMap; time-series data lives in
# an emptyDir volume.
resource "kubernetes_deployment" "prometheus" {
  metadata {
    namespace = "monitoring"
    name      = "prometheus"
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "prometheus-server"
      }
    }

    template {
      metadata {
        labels = {
          app = "prometheus-server"
        }
      }

      spec {
        container {
          name  = "prometheus"
          image = "prom/prometheus:${var.prometheus_version}"

          # Keep 12 hours of data under /prometheus/ and load the config
          # file mounted from the ConfigMap.
          args = [
            "--storage.tsdb.retention.time=12h",
            "--config.file=/etc/prometheus/prometheus.yml",
            "--storage.tsdb.path=/prometheus/",
          ]

          # Prometheus web/API port; the service's target_port refers to it.
          port {
            container_port = 9090
          }

          resources {
            requests = {
              cpu    = "500m"
              memory = "500M"
            }
            limits = {
              cpu    = 1
              memory = "1Gi"
            }
          }

          # Configuration and rule files.
          volume_mount {
            name       = "prometheus-config-volume"
            mount_path = "/etc/prometheus/"
          }

          # TSDB data directory.
          volume_mount {
            name       = "prometheus-storage-volume"
            mount_path = "/prometheus/"
          }
        }

        volume {
          name = "prometheus-config-volume"
          config_map {
            # default_mode = "0420"
            name = kubernetes_config_map.prometheus.metadata[0].name
          }
        }

        volume {
          name = "prometheus-storage-volume"
          empty_dir {}
        }
      }
    }
  }
}

# NodePort service in front of the Prometheus pods: service port 8080 and
# node port 30000 both forward to the container's declared port.
resource "kubernetes_service" "prometheus" {
  metadata {
    namespace = "monitoring"
    name      = "prometheus"

    # Annotations matched by the kubernetes-service-endpoints scrape job.
    annotations = {
      "prometheus.io/scrape" = "true"
      "prometheus.io/port"   = "9090"
    }
  }

  spec {
    type = "NodePort"

    selector = {
      app = "prometheus-server"
    }

    port {
      port        = 8080
      node_port   = 30000
      target_port = kubernetes_deployment.prometheus.spec[0].template[0].spec[0].container[0].port[0].container_port
    }
  }
}
3 changes: 3 additions & 0 deletions cluster/terraform_kubernetes/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ data "azurerm_client_config" "current" {}
# Data source of type "environment_variables" filtered on "GITHUB_ACTIONS".
# NOTE(review): presumably used to detect runs inside GitHub Actions; confirm
# against its usages elsewhere in the module.
data "environment_variables" "github_actions" {
filter = "GITHUB_ACTIONS"
}
# Image tag for the Prometheus server deployment.
variable "prometheus_version" {
  type        = string
  default     = "v2.49.1"
  description = "Tag of the prom/prometheus image run by the Prometheus deployment."
}

locals {
cluster_name = (
Expand Down
Loading

0 comments on commit 8d047ef

Please sign in to comment.