diff --git a/cluster/terraform_kubernetes/config/development.tfvars.json b/cluster/terraform_kubernetes/config/development.tfvars.json
index 443099bd..b8d9aa35 100644
--- a/cluster/terraform_kubernetes/config/development.tfvars.json
+++ b/cluster/terraform_kubernetes/config/development.tfvars.json
@@ -15,5 +15,8 @@
     "www.cluster1.development.teacherservices.cloud"
   ],
   "ingress_nginx_version": "4.8.3",
-  "prometheus_tsdb_retention_time": "12h"
+  "thanos_retention_raw": "3d",
+  "thanos_retention_5m": "3d",
+  "thanos_retention_1h": "3d",
+  "cluster_short": "dv"
 }
diff --git a/cluster/terraform_kubernetes/config/platform-test.tfvars.json b/cluster/terraform_kubernetes/config/platform-test.tfvars.json
index a5f919cf..ba067a98 100644
--- a/cluster/terraform_kubernetes/config/platform-test.tfvars.json
+++ b/cluster/terraform_kubernetes/config/platform-test.tfvars.json
@@ -23,5 +23,8 @@
     "www.platform-test.teacherservices.cloud"
   ],
   "ingress_nginx_version": "4.8.3",
-  "prometheus_tsdb_retention_time": "12h"
+  "thanos_retention_raw": "3d",
+  "thanos_retention_5m": "3d",
+  "thanos_retention_1h": "3d",
+  "cluster_short": "pt"
 }
diff --git a/cluster/terraform_kubernetes/config/production.tfvars.json b/cluster/terraform_kubernetes/config/production.tfvars.json
index 7ff99cf0..d964b423 100644
--- a/cluster/terraform_kubernetes/config/production.tfvars.json
+++ b/cluster/terraform_kubernetes/config/production.tfvars.json
@@ -31,5 +31,9 @@
   ],
   "ingress_nginx_version": "4.8.3",
   "enable_lowpriority_app": true,
-  "prometheus_app_mem": "2Gi"
+  "prometheus_app_mem": "2Gi",
+  "prometheus_app_cpu": "0.5",
+  "thanos_app_mem": "1Gi",
+  "thanos_app_cpu": "0.5",
+  "cluster_short": "pd"
 }
diff --git a/cluster/terraform_kubernetes/config/prometheus/development.prometheus.yml b/cluster/terraform_kubernetes/config/prometheus/development.prometheus.yml
index 2b9138dd..1aaf4914 100644
--- a/cluster/terraform_kubernetes/config/prometheus/development.prometheus.yml
+++ b/cluster/terraform_kubernetes/config/prometheus/development.prometheus.yml
@@ -11,6 +11,10 @@ global:
   # Keep at most 100 sets of details of targets dropped by relabeling.
   # This information is used to display in the UI for troubleshooting.
   keep_dropped_targets: 100
+  external_labels:
+    cluster: prometheus
+    # Each Prometheus has to have unique labels.
+    replica: $(POD_NAME)
 rule_files:
   - /etc/prometheus/prometheus.rules
 alerting:
diff --git a/cluster/terraform_kubernetes/config/prometheus/platform-test.prometheus.yml b/cluster/terraform_kubernetes/config/prometheus/platform-test.prometheus.yml
index 2b9138dd..3ede8793 100644
--- a/cluster/terraform_kubernetes/config/prometheus/platform-test.prometheus.yml
+++ b/cluster/terraform_kubernetes/config/prometheus/platform-test.prometheus.yml
@@ -11,6 +11,10 @@ global:
   # Keep at most 100 sets of details of targets dropped by relabeling.
   # This information is used to display in the UI for troubleshooting.
   keep_dropped_targets: 100
+  external_labels:
+    cluster: prometheus
+    # Each Prometheus has to have unique labels.
+    replica: $(POD_NAME)
 rule_files:
   - /etc/prometheus/prometheus.rules
 alerting:
@@ -157,45 +161,3 @@ scrape_configs:
       - source_labels: [__meta_kubernetes_service_name]
         action: replace
         target_label: kubernetes_name
-  #
-  # Below were used in the spike and left here for info
-  # They wree a quick fix, so very likely can be improved
-  #
-  # - job_name: 'nginx-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_container_port_number]
-  #       regex: '443'
-  #       action: drop
-  #     - source_labels: [__meta_kubernetes_pod_container_port_number]
-  #       regex: '8443'
-  #       action: drop
-  #     - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
-  #       regex: 'ingress-nginx'
-  #       action: keep
-  #     - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
-  #       action: replace
-  #       regex: ([^:]+)(?::\d+)?;(\d+)
-  #       replacement: $1:10254
-  #       target_label: __address__
-  # - job_name: 'apply-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_label_app]
-  #       regex: 'apply-review-rm1'
-  #       action: keep
-  # - job_name: 'apply-sidekiq-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_label_app]
-  #       regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
-  #       action: keep
-  #     - source_labels: [__address__]
-  #       action: replace
-  #       regex: ([^:]+)
-  #       replacement: "$${1}:9394"
-  #       target_label: __address__
-  #
diff --git a/cluster/terraform_kubernetes/config/prometheus/production.prometheus.yml b/cluster/terraform_kubernetes/config/prometheus/production.prometheus.yml
index 2b9138dd..3ede8793 100644
--- a/cluster/terraform_kubernetes/config/prometheus/production.prometheus.yml
+++ b/cluster/terraform_kubernetes/config/prometheus/production.prometheus.yml
@@ -11,6 +11,10 @@ global:
   # Keep at most 100 sets of details of targets dropped by relabeling.
   # This information is used to display in the UI for troubleshooting.
   keep_dropped_targets: 100
+  external_labels:
+    cluster: prometheus
+    # Each Prometheus has to have unique labels.
+    replica: $(POD_NAME)
 rule_files:
   - /etc/prometheus/prometheus.rules
 alerting:
@@ -157,45 +161,3 @@ scrape_configs:
       - source_labels: [__meta_kubernetes_service_name]
         action: replace
         target_label: kubernetes_name
-  #
-  # Below were used in the spike and left here for info
-  # They wree a quick fix, so very likely can be improved
-  #
-  # - job_name: 'nginx-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_container_port_number]
-  #       regex: '443'
-  #       action: drop
-  #     - source_labels: [__meta_kubernetes_pod_container_port_number]
-  #       regex: '8443'
-  #       action: drop
-  #     - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
-  #       regex: 'ingress-nginx'
-  #       action: keep
-  #     - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
-  #       action: replace
-  #       regex: ([^:]+)(?::\d+)?;(\d+)
-  #       replacement: $1:10254
-  #       target_label: __address__
-  # - job_name: 'apply-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_label_app]
-  #       regex: 'apply-review-rm1'
-  #       action: keep
-  # - job_name: 'apply-sidekiq-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_label_app]
-  #       regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
-  #       action: keep
-  #     - source_labels: [__address__]
-  #       action: replace
-  #       regex: ([^:]+)
-  #       replacement: "$${1}:9394"
-  #       target_label: __address__
-  #
diff --git a/cluster/terraform_kubernetes/config/prometheus/test.prometheus.yml b/cluster/terraform_kubernetes/config/prometheus/test.prometheus.yml
index 2b9138dd..3ede8793 100644
--- a/cluster/terraform_kubernetes/config/prometheus/test.prometheus.yml
+++ b/cluster/terraform_kubernetes/config/prometheus/test.prometheus.yml
@@ -11,6 +11,10 @@ global:
   # Keep at most 100 sets of details of targets dropped by relabeling.
   # This information is used to display in the UI for troubleshooting.
   keep_dropped_targets: 100
+  external_labels:
+    cluster: prometheus
+    # Each Prometheus has to have unique labels.
+    replica: $(POD_NAME)
 rule_files:
   - /etc/prometheus/prometheus.rules
 alerting:
@@ -157,45 +161,3 @@ scrape_configs:
       - source_labels: [__meta_kubernetes_service_name]
         action: replace
         target_label: kubernetes_name
-  #
-  # Below were used in the spike and left here for info
-  # They wree a quick fix, so very likely can be improved
-  #
-  # - job_name: 'nginx-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_container_port_number]
-  #       regex: '443'
-  #       action: drop
-  #     - source_labels: [__meta_kubernetes_pod_container_port_number]
-  #       regex: '8443'
-  #       action: drop
-  #     - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
-  #       regex: 'ingress-nginx'
-  #       action: keep
-  #     - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
-  #       action: replace
-  #       regex: ([^:]+)(?::\d+)?;(\d+)
-  #       replacement: $1:10254
-  #       target_label: __address__
-  # - job_name: 'apply-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_label_app]
-  #       regex: 'apply-review-rm1'
-  #       action: keep
-  # - job_name: 'apply-sidekiq-pods'
-  #   kubernetes_sd_configs:
-  #     - role: pod
-  #   relabel_configs:
-  #     - source_labels: [__meta_kubernetes_pod_label_app]
-  #       regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
-  #       action: keep
-  #     - source_labels: [__address__]
-  #       action: replace
-  #       regex: ([^:]+)
-  #       replacement: "$${1}:9394"
-  #       target_label: __address__
-  #
diff --git a/cluster/terraform_kubernetes/config/prometheus/thanos.yml.tmpl b/cluster/terraform_kubernetes/config/prometheus/thanos.yml.tmpl
new file mode 100644
index 00000000..342e44cc
--- /dev/null
+++ b/cluster/terraform_kubernetes/config/prometheus/thanos.yml.tmpl
@@ -0,0 +1,6 @@
+# thanos.yaml
+type: AZURE
+config:
+  storage_account: '${storage-account-name}'
+  storage_account_key: '${storage-account-key}'
+  container: 'metrics'
diff --git a/cluster/terraform_kubernetes/config/test.tfvars.json b/cluster/terraform_kubernetes/config/test.tfvars.json
index b1d282f7..c406ad9d 100644
--- a/cluster/terraform_kubernetes/config/test.tfvars.json
+++ b/cluster/terraform_kubernetes/config/test.tfvars.json
@@ -36,5 +36,9 @@
   "enable_lowpriority_app": true,
   "lowpriority_app_cpu": "0.5",
   "lowpriority_app_mem": "1Gi",
-  "prometheus_app_mem": "2Gi"
+  "prometheus_app_mem": "2Gi",
+  "prometheus_app_cpu": "0.5",
+  "thanos_app_mem": "1Gi",
+  "thanos_app_cpu": "0.5",
+  "cluster_short": "ts"
 }
diff --git a/cluster/terraform_kubernetes/prometheus.tf b/cluster/terraform_kubernetes/prometheus.tf
index 9d90f47e..747e93ac 100644
--- a/cluster/terraform_kubernetes/prometheus.tf
+++ b/cluster/terraform_kubernetes/prometheus.tf
@@ -68,14 +68,15 @@ resource "kubernetes_deployment" "prometheus" {
 
     selector {
       match_labels = {
-        app = "prometheus-server"
+        app = "prometheus"
       }
     }
 
     template {
       metadata {
         labels = {
-          app = "prometheus-server"
+          app              = "prometheus"
+          thanos-store-api = true
        }
      }
 
@@ -88,6 +89,10 @@ resource "kubernetes_deployment" "prometheus" {
           "--storage.tsdb.retention.time=${var.prometheus_tsdb_retention_time}",
           "--config.file=/etc/prometheus/prometheus.yml",
           "--storage.tsdb.path=/prometheus/",
+          "--web.enable-lifecycle",
+          "--storage.tsdb.no-lockfile",
+          "--storage.tsdb.min-block-duration=2h",
+          "--storage.tsdb.max-block-duration=2h",
         ]
 
         port {
@@ -100,8 +105,8 @@
             memory = var.prometheus_app_mem
           }
           requests = {
-            cpu    = "500m"
-            memory = "500M"
+            cpu    = var.prometheus_app_cpu
+            memory = var.prometheus_app_mem
           }
         }
 
@@ -129,6 +134,89 @@ resource "kubernetes_deployment" "prometheus" {
           empty_dir {}
         }
 
+        container {
+          image = "quay.io/thanos/thanos:${var.thanos_version}"
+          name  = "thanos"
+
+          args = [
+            "sidecar",
+            "--log.level=debug",
+            "--tsdb.path=/prometheus",
+            "--prometheus.url=http://127.0.0.1:9090",
+            "--objstore.config-file=/config/thanos.yaml",
+            # "--reloader.config-file=/etc/prometheus/prometheus.yml",
+            # "--reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yaml",
+            # "--reloader.rule-dir=/etc/prometheus/rules/",
+          ]
+
+          env {
+            name = "POD_NAME"
+            value_from {
+              field_ref {
+                field_path = "metadata.name"
+              }
+            }
+          }
+
+          liveness_probe {
+            http_get {
+              path = "/-/healthy"
+              port = "http-sidecar"
+            }
+          }
+
+          readiness_probe {
+            http_get {
+              path = "/-/ready"
+              port = "http-sidecar"
+            }
+          }
+
+          port {
+            container_port = 10902
+            name           = "http-sidecar"
+          }
+
+          port {
+            container_port = 10901
+            name           = "grpc"
+          }
+
+          resources {
+            limits = {
+              cpu    = 1
+              memory = var.prometheus_app_mem
+            }
+            requests = {
+              cpu    = var.prometheus_app_cpu
+              memory = var.prometheus_app_mem
+            }
+          }
+
+          volume_mount {
+            mount_path = "/etc/prometheus/"
+            name       = "prometheus-config-volume"
+          }
+
+          volume_mount {
+            mount_path = "/prometheus/"
+            name       = "prometheus-storage-volume"
+          }
+
+          volume_mount {
+            mount_path = "/config/"
+            name       = "thanos-config-volume"
+            read_only  = true
+          }
+        }
+
+        volume {
+          name = "thanos-config-volume"
+          secret {
+            secret_name = kubernetes_secret.thanos.metadata[0].name
+          }
+        }
+
       }
     }
   }
@@ -152,7 +240,7 @@
       target_port = kubernetes_deployment.prometheus.spec[0].template[0].spec[0].container[0].port[0].container_port
     }
     selector = {
-      app = "prometheus-server"
+      app = "prometheus"
     }
     type = "NodePort"
   }
diff --git a/cluster/terraform_kubernetes/thanos.tf b/cluster/terraform_kubernetes/thanos.tf
new file mode 100644
index 00000000..72386416
--- /dev/null
+++ b/cluster/terraform_kubernetes/thanos.tf
@@ -0,0 +1,335 @@
+resource "azurerm_storage_account" "thanos" {
+
+  name                            = "${var.resource_prefix}${var.cluster_short}thanossa"
+  location                        = data.azurerm_resource_group.resource_group.location
+  resource_group_name             = data.azurerm_resource_group.resource_group.name
+  account_tier                    = "Standard"
+  account_replication_type       = "GRS"
+  allow_nested_items_to_be_public = false
+
+  lifecycle { ignore_changes = [tags] }
+}
+
+resource "azurerm_storage_container" "metrics" {
+
+  name                  = "metrics"
+  storage_account_name  = azurerm_storage_account.thanos.name
+  container_access_type = "private"
+}
+
+resource "kubernetes_secret" "thanos" {
+
+  metadata {
+    name      = "thanos-objstore-config"
+    namespace = "monitoring"
+  }
+
+  data = {
+    "thanos.yaml"       = templatefile("${path.module}/config/prometheus/thanos.yml.tmpl", local.template_variable_map)
+    "object-store.yaml" = templatefile("${path.module}/config/prometheus/thanos.yml.tmpl", local.template_variable_map)
+  }
+}
+
+resource "kubernetes_service" "thanos-store-gateway" {
+
+  metadata {
+    name      = "thanos-store-gateway"
+    namespace = "monitoring"
+  }
+
+  spec {
+    port {
+      name        = "grpc"
+      port        = 10901
+      target_port = "grpc"
+    }
+    selector = {
+      thanos-store-api : "true"
+    }
+    type = "ClusterIP"
+  }
+}
+
+resource "kubernetes_deployment" "thanos-querier" {
+
+  metadata {
+    name      = "thanos-querier"
+    namespace = "monitoring"
+  }
+
+  spec {
+    replicas = 1
+
+    selector {
+      match_labels = {
+        app = "thanos-querier"
+      }
+    }
+
+    template {
+      metadata {
+        labels = {
+          app = "thanos-querier"
+          # thanos-store-api = true
+        }
+      }
+
+      spec {
+
+        container {
+          image = "quay.io/thanos/thanos:${var.thanos_version}"
+          name  = "thanos-querier"
+
+          args = [
+            "query",
+            "--log.level=debug",
+            "--query.replica-label=replica",
+            "--store=dnssrv+thanos-store-gateway:10901",
+          ]
+
+          liveness_probe {
+            http_get {
+              path = "/-/healthy"
+              port = "http"
+            }
+          }
+
+          readiness_probe {
+            http_get {
+              path = "/-/ready"
+              port = "http"
+            }
+          }
+
+          port {
+            container_port = 10901
+            name           = "grpc"
+          }
+
+          port {
+            container_port = 10902
+            name           = "http"
+          }
+
+          resources {
+            limits = {
+              cpu    = 1
+              memory = var.thanos_app_mem
+            }
+            requests = {
+              cpu    = var.thanos_app_cpu
+              memory = var.thanos_app_mem
+            }
+          }
+        }
+
+      }
+    }
+  }
+}
+
+resource "kubernetes_service" "thanos-querier" {
+
+  metadata {
+    name      = "thanos-querier"
+    namespace = "monitoring"
+    labels = {
+      app = "thanos-querier"
+    }
+  }
+
+  spec {
+    port {
+      name        = "http"
+      port        = 9090
+      target_port = "http"
+      protocol    = "TCP"
+    }
+    selector = {
+      app = "thanos-querier"
+    }
+  }
+}
+
+resource "kubernetes_deployment" "thanos-store-gateway" {
+
+  metadata {
+    name      = "thanos-store-gateway"
+    namespace = "monitoring"
+  }
+
+  spec {
+    replicas = 1
+
+    selector {
+      match_labels = {
+        app = "thanos-store-gateway"
+      }
+    }
+
+    template {
+      metadata {
+        labels = {
+          app              = "thanos-store-gateway"
+          thanos-store-api = true
+        }
+      }
+
+      spec {
+
+        container {
+          image = "quay.io/thanos/thanos:${var.thanos_version}"
+          name  = "thanos-store-gateway"
+
+          args = [
+            "store",
+            "--log.level=debug",
+            "--data-dir=/data",
+            "--objstore.config-file=/config/thanos.yaml",
+            "--index-cache-size=500MB",
+            "--chunk-pool-size=500MB",
+          ]
+
+          port {
+            container_port = 10901
+            name           = "grpc"
+          }
+
+          port {
+            container_port = 10902
+            name           = "http"
+          }
+
+          liveness_probe {
+            http_get {
+              path = "/-/healthy"
+              port = "http"
+            }
+          }
+
+          readiness_probe {
+            http_get {
+              path = "/-/ready"
+              port = "http"
+            }
+          }
+
+          resources {
+            limits = {
+              cpu    = 1
+              memory = var.thanos_app_mem
+            }
+            requests = {
+              cpu    = var.thanos_app_cpu
+              memory = var.thanos_app_mem
+            }
+          }
+
+          volume_mount {
+            mount_path = "/config/"
+            name       = "thanos-config-volume"
+            read_only  = true
+          }
+        }
+
+        volume {
+          name = "thanos-config-volume"
+          secret {
+            secret_name = "thanos-objstore-config"
+          }
+        }
+
+      }
+    }
+  }
+}
+
+resource "kubernetes_deployment" "thanos-compactor" {
+
+  metadata {
+    name      = "thanos-compactor"
+    namespace = "monitoring"
+  }
+
+  spec {
+    replicas = 1
+
+    selector {
+      match_labels = {
+        app = "thanos-compactor"
+      }
+    }
+
+    template {
+      metadata {
+        labels = {
+          app              = "thanos-compactor"
+          thanos-store-api = true
+        }
+      }
+
+      spec {
+
+        container {
+          image = "quay.io/thanos/thanos:${var.thanos_version}"
+          name  = "thanos-compactor"
+
+          args = [
+            "compact",
+            "--log.level=debug",
+            "--data-dir=/data",
+            "--objstore.config-file=/config/thanos.yaml",
+            "--retention.resolution-raw=${var.thanos_retention_raw}",
+            "--retention.resolution-5m=${var.thanos_retention_5m}",
+            "--retention.resolution-1h=${var.thanos_retention_1h}",
+            "--wait",
+          ]
+
+          liveness_probe {
+            http_get {
+              path = "/-/healthy"
+              port = "http"
+            }
+          }
+
+          readiness_probe {
+            http_get {
+              path = "/-/ready"
+              port = "http"
+            }
+          }
+
+          port {
+            container_port = 10902
+            name           = "http"
+          }
+
+          resources {
+            limits = {
+              cpu    = 1
+              memory = var.thanos_app_mem
+            }
+            requests = {
+              cpu    = var.thanos_app_cpu
+              memory = var.thanos_app_mem
+            }
+          }
+
+          volume_mount {
+            mount_path = "/config/"
+            name       = "thanos-config-volume"
+            read_only  = true
true + } + } + + volume { + name = "thanos-config-volume" + secret { + secret_name = "thanos-objstore-config" + } + } + + } + } + } +} diff --git a/cluster/terraform_kubernetes/variables.tf b/cluster/terraform_kubernetes/variables.tf index 804e9d82..50bd3308 100644 --- a/cluster/terraform_kubernetes/variables.tf +++ b/cluster/terraform_kubernetes/variables.tf @@ -76,7 +76,7 @@ variable "prometheus_version" { variable "prometheus_tsdb_retention_time" { description = "Prometheus retention period for locally stored data" - default = "15d" + default = "6h" } variable "prometheus_app_mem" { @@ -84,6 +84,44 @@ variable "prometheus_app_mem" { default = "1Gi" } +variable "prometheus_app_cpu" { + description = "Prometheus app cpu request" + default = "100m" +} + +variable "thanos_version" { + default = "v0.8.0" +} + +variable "thanos_app_mem" { + description = "Thanos app memory limit" + default = "1Gi" +} + +variable "thanos_app_cpu" { + description = "Thanos app cpu request" + default = "100m" +} + +variable "thanos_retention_raw" { + description = "Thanos retention period for raw samples" + default = "30d" +} + +variable "thanos_retention_5m" { + description = "Thanos retention period for 5m samples" + default = "60d" +} + +variable "thanos_retention_1h" { + description = "Thanos retention period for 1h samples" + default = "90d" +} + +variable "cluster_short" { + description = "Short name of the cluster configuration, e.g. dv, pt, ts, pd" +} + locals { cluster_name = ( var.cip_tenant ? @@ -133,4 +171,9 @@ locals { spn_authentication = contains(keys(data.environment_variables.github_actions.items), "GITHUB_ACTIONS") kubelogin_args = local.spn_authentication ? local.kubelogin_spn_args : local.kubelogin_azurecli_args + + template_variable_map = { + storage-account-name = azurerm_storage_account.thanos.name + storage-account-key = azurerm_storage_account.thanos.primary_access_key + } } diff --git a/documentation/monitoring.md b/documentation/monitoring.md index b51ffdc6..22daefc8 100644 --- a/documentation/monitoring.md +++ b/documentation/monitoring.md @@ -28,8 +28,30 @@ Prometheus monitoring is enabled for a cluster by default. The default prometheus version is hardcoded in the kubernetes variable.tf. It can be overridden for a cluster by adding prometheus_version to the env.tfvars.json file. There are several other variables that can be changed depending on env requirements. prometheus_app_mem - app memory limit (default 1G) -prometheus_tsdb_retention_time - local storage retention period +prometheus_app_cpu - app memory requests (default 100m) +prometheus_tsdb_retention_time - local storage retention period (default 6h) Prometheus rules and yml config files are loaded from the terraform_kubernetes/config/prometheus directory. Each file is prefixed with the cluster env. e.g. development.prometheus.rules and development.prometheus.yml Currently a restart/reload of the prometheus process is required if changes are made to these files. + +## Thanos + +Prometheus is configured to use Thanos for backend storage. + +Thanos runs as a sidecar within the prometheus deployment. +It copies prometheus collected data after two hours to an Azure storage container. + +There are also three separate Thanos services +- thanos-querier +- thanos-store-gateway +- thanos-compactor +All are running as single replica deployments. + +The default thanos version is hardcoded in the kubernetes variable.tf. It can be overridden for a cluster by adding thanos_version to the env.tfvars.json file. 
diff --git a/scripts/pfwd.sh b/scripts/pfwd.sh
index cfc1ec87..d5b15adf 100755
--- a/scripts/pfwd.sh
+++ b/scripts/pfwd.sh
@@ -2,19 +2,22 @@
 # Script to open a connection to prometheus
 # Requires kubectl config to be already pointing to the target cluster
 #
-PROM=`kubectl get pods -n monitoring -l app=prometheus-server --no-headers=true -o name`
+PROM=`kubectl get pods -n monitoring -l app=prometheus --no-headers=true -o name`
 # AMAN=`kubectl get pods -n monitoring -l app=alertmanager --no-headers=true -o name`
 # GRAF=`kubectl get pods -n monitoring -l app=grafana --no-headers=true -o name`
+THANOS=`kubectl get service -n monitoring -l app=thanos-querier --no-headers=true -o name`
 echo $PROM
 # echo $AMAN
 # echo $GRAF
 kubectl port-forward -n monitoring $PROM 8080:9090 &
 # kubectl port-forward -n monitoring $AMAN 8081:9093 &
 # kubectl port-forward -n monitoring $GRAF 3000 &
+kubectl port-forward -n monitoring $THANOS 8082:9090 &
 echo
 echo Prometheus at http://localhost:8080
 # echo Alertmanager at http://localhost:8081
 # echo Grafana at http://localhost:3000
+echo Thanos at http://localhost:8082
 echo
 echo kill with pkill kubectl
 echo