Skip to content

Commit

Permalink
Thanos with sidecar
Browse files Browse the repository at this point in the history
  • Loading branch information
RMcVelia committed Feb 12, 2024
1 parent c9ec354 commit 3c6b850
Show file tree
Hide file tree
Showing 14 changed files with 539 additions and 138 deletions.
5 changes: 4 additions & 1 deletion cluster/terraform_kubernetes/config/development.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,8 @@
"www.cluster1.development.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3",
"prometheus_tsdb_retention_time": "12h"
"thanos_retention_raw": "3d",
"thanos_retention_5m": "3d",
"thanos_retention_1h": "3d",
"cluster_short": "dv"
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,8 @@
"www.platform-test.teacherservices.cloud"
],
"ingress_nginx_version": "4.8.3",
"prometheus_tsdb_retention_time": "12h"
"thanos_retention_raw": "3d",
"thanos_retention_5m": "3d",
"thanos_retention_1h": "3d",
"cluster_short": "pt"
}
6 changes: 5 additions & 1 deletion cluster/terraform_kubernetes/config/production.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,9 @@
],
"ingress_nginx_version": "4.8.3",
"enable_lowpriority_app": true,
"prometheus_app_mem": "2Gi"
"prometheus_app_mem": "2Gi",
"prometheus_app_cpu": "0.5",
"thanos_app_mem": "1Gi",
"thanos_app_cpu": "0.5",
"cluster_short": "pd"
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ global:
# Keep at most 100 sets of details of targets dropped by relabeling.
# This information is used to display in the UI for troubleshooting.
keep_dropped_targets: 100
external_labels:
cluster: prometheus
# Each Prometheus has to have unique labels.
replica: $(POD_NAME)
rule_files:
- /etc/prometheus/prometheus.rules
alerting:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ global:
# Keep at most 100 sets of details of targets dropped by relabeling.
# This information is used to display in the UI for troubleshooting.
keep_dropped_targets: 100
external_labels:
cluster: prometheus
# Each Prometheus has to have unique labels.
replica: $(POD_NAME)
rule_files:
- /etc/prometheus/prometheus.rules
alerting:
Expand Down Expand Up @@ -157,45 +161,3 @@ scrape_configs:
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
#
# Below were used in the spike and left here for info
# They were a quick fix, so very likely can be improved
#
# - job_name: 'nginx-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '8443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
# regex: 'ingress-nginx'
# action: keep
# - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
# action: replace
# regex: ([^:]+)(?::\d+)?;(\d+)
# replacement: $1:10254
# target_label: __address__
# - job_name: 'apply-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-review-rm1'
# action: keep
# - job_name: 'apply-sidekiq-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
# action: keep
# - source_labels: [__address__]
# action: replace
# regex: ([^:]+)
# replacement: "$${1}:9394"
# target_label: __address__
#
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ global:
# Keep at most 100 sets of details of targets dropped by relabeling.
# This information is used to display in the UI for troubleshooting.
keep_dropped_targets: 100
external_labels:
cluster: prometheus
# Each Prometheus has to have unique labels.
replica: $(POD_NAME)
rule_files:
- /etc/prometheus/prometheus.rules
alerting:
Expand Down Expand Up @@ -157,45 +161,3 @@ scrape_configs:
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
#
# Below were used in the spike and left here for info
# They were a quick fix, so very likely can be improved
#
# - job_name: 'nginx-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '8443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
# regex: 'ingress-nginx'
# action: keep
# - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
# action: replace
# regex: ([^:]+)(?::\d+)?;(\d+)
# replacement: $1:10254
# target_label: __address__
# - job_name: 'apply-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-review-rm1'
# action: keep
# - job_name: 'apply-sidekiq-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
# action: keep
# - source_labels: [__address__]
# action: replace
# regex: ([^:]+)
# replacement: "$${1}:9394"
# target_label: __address__
#
46 changes: 4 additions & 42 deletions cluster/terraform_kubernetes/config/prometheus/test.prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ global:
# Keep at most 100 sets of details of targets dropped by relabeling.
# This information is used to display in the UI for troubleshooting.
keep_dropped_targets: 100
external_labels:
cluster: prometheus
# Each Prometheus has to have unique labels.
replica: $(POD_NAME)
rule_files:
- /etc/prometheus/prometheus.rules
alerting:
Expand Down Expand Up @@ -157,45 +161,3 @@ scrape_configs:
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
#
# Below were used in the spike and left here for info
# They were a quick fix, so very likely can be improved
#
# - job_name: 'nginx-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_container_port_number]
# regex: '8443'
# action: drop
# - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
# regex: 'ingress-nginx'
# action: keep
# - source_labels: [__address__, __meta_kubernetes_pod_container_port_number]
# action: replace
# regex: ([^:]+)(?::\d+)?;(\d+)
# replacement: $1:10254
# target_label: __address__
# - job_name: 'apply-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-review-rm1'
# action: keep
# - job_name: 'apply-sidekiq-pods'
# kubernetes_sd_configs:
# - role: pod
# relabel_configs:
# - source_labels: [__meta_kubernetes_pod_label_app]
# regex: 'apply-secondary-worker-review-rm1|apply-worker-review-rm1'
# action: keep
# - source_labels: [__address__]
# action: replace
# regex: ([^:]+)
# replacement: "$${1}:9394"
# target_label: __address__
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# thanos.yaml
# Object storage settings for Thanos. Presumably this is the file the sidecar
# reads through --objstore.config-file=/config/thanos.yaml — confirm how it is
# rendered and mounted.
type: AZURE
config:
# NOTE(review): indentation is not visible in this view; the three keys below
# are presumably nested under `config:` in the real file — confirm.
# The ${...} values are placeholders, presumably filled in by whatever renders
# this template. The account key is a credential: keep the rendered output out
# of version control and logs.
storage_account: '${storage-account-name}'
storage_account_key: '${storage-account-key}'
# Name of the storage container used by Thanos.
container: 'metrics'
6 changes: 5 additions & 1 deletion cluster/terraform_kubernetes/config/test.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,9 @@
"enable_lowpriority_app": true,
"lowpriority_app_cpu": "0.5",
"lowpriority_app_mem": "1Gi",
"prometheus_app_mem": "2Gi"
"prometheus_app_mem": "2Gi",
"prometheus_app_cpu": "0.5",
"thanos_app_mem": "1Gi",
"thanos_app_cpu": "0.5",
"cluster_short": "ts"
}
98 changes: 93 additions & 5 deletions cluster/terraform_kubernetes/prometheus.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,15 @@ resource "kubernetes_deployment" "prometheus" {

selector {
match_labels = {
app = "prometheus-server"
app = "prometheus"
}
}

template {
metadata {
labels = {
app = "prometheus-server"
app = "prometheus"
thanos-store-api = true
}
}

Expand All @@ -88,6 +89,10 @@ resource "kubernetes_deployment" "prometheus" {
"--storage.tsdb.retention.time=${var.prometheus_tsdb_retention_time}",
"--config.file=/etc/prometheus/prometheus.yml",
"--storage.tsdb.path=/prometheus/",
"--web.enable-lifecycle",
"--storage.tsdb.no-lockfile",
"--storage.tsdb.min-block-duration=2h",
"--storage.tsdb.max-block-duration=2h",
]

port {
Expand All @@ -100,8 +105,8 @@ resource "kubernetes_deployment" "prometheus" {
memory = var.prometheus_app_mem
}
requests = {
cpu = "500m"
memory = "500M"
cpu = var.prometheus_app_cpu
memory = var.prometheus_app_mem
}
}

Expand Down Expand Up @@ -129,6 +134,89 @@ resource "kubernetes_deployment" "prometheus" {
empty_dir {}
}

# Thanos sidecar container, added to the Prometheus deployment's pod template.
container {
image = "quay.io/thanos/thanos:${var.thanos_version}"
name = "thanos"

args = [
"sidecar",
# NOTE(review): debug logging is hard-coded for every environment — confirm
# this is intended outside development.
"--log.level=debug",
# Same directory as the prometheus-storage-volume mount further down.
"--tsdb.path=/prometheus",
# Reaches Prometheus over localhost; presumably Prometheus listens on 9090 in
# the sibling container — confirm.
"--prometheus.url=http://127.0.0.1:9090",
# File is provided by the thanos-config-volume mount at /config/ below.
"--objstore.config-file=/config/thanos.yaml",
# NOTE(review): the reloader flags are disabled, while the Prometheus configs
# in this change set `replica: $(POD_NAME)`. Verify something still
# substitutes POD_NAME; otherwise the label presumably stays literal.
# "--reloader.config-file=/etc/prometheus/prometheus.yml",
# "--reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yaml",
# "--reloader.rule-dir=/etc/prometheus/rules/",
]

# Exposes the pod's own name to this container as POD_NAME.
env {
name = "POD_NAME"
value_from {
field_ref {
field_path = "metadata.name"
}
}
}

# Both probes target the named port "http-sidecar" (10902) declared below.
liveness_probe {
http_get {
path = "/-/healthy"
port = "http-sidecar"
}
}

readiness_probe {
http_get {
path = "/-/ready"
port = "http-sidecar"
}
}

port {
container_port = 10902
name = "http-sidecar"
}

port {
container_port = 10901
name = "grpc"
}

# NOTE(review): requests and the memory limit reuse the prometheus_app_*
# variables, although the tfvars in this change also define thanos_app_mem
# and thanos_app_cpu — confirm which set the sidecar should use.
resources {
limits = {
cpu = 1
memory = var.prometheus_app_mem
}
requests = {
cpu = var.prometheus_app_cpu
memory = var.prometheus_app_mem
}
}

# Prometheus configuration volume, mounted by name into the sidecar.
volume_mount {
mount_path = "/etc/prometheus/"
name = "prometheus-config-volume"
}

# Prometheus storage volume; matches --tsdb.path above.
volume_mount {
mount_path = "/prometheus/"
name = "prometheus-storage-volume"
}

# Read-only mount backing --objstore.config-file above.
volume_mount {
mount_path = "/config/"
name = "thanos-config-volume"
read_only = true
}
}

volume {
name = "thanos-config-volume"
secret {
secret_name = kubernetes_secret.thanos.metadata[0].name
}
}

}
}
}
Expand All @@ -152,7 +240,7 @@ resource "kubernetes_service" "prometheus" {
target_port = kubernetes_deployment.prometheus.spec[0].template[0].spec[0].container[0].port[0].container_port
}
selector = {
app = "prometheus-server"
app = "prometheus"
}
type = "NodePort"
}
Expand Down
Loading

0 comments on commit 3c6b850

Please sign in to comment.