From fe1c1c64bac38de56a5ed9aed06a314cf33a0da4 Mon Sep 17 00:00:00 2001 From: RMcVelia Date: Tue, 21 Jan 2025 22:19:15 +0000 Subject: [PATCH 1/8] Fix dependencies --- cluster/terraform_aks_cluster/azure_metric_alerts.tf | 4 ++-- cluster/terraform_kubernetes/gcp_wif.tf | 2 +- cluster/terraform_kubernetes/reloader.tf | 4 ++-- cluster/terraform_kubernetes/thanos.tf | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cluster/terraform_aks_cluster/azure_metric_alerts.tf b/cluster/terraform_aks_cluster/azure_metric_alerts.tf index f92a11de..5f4cfe15 100644 --- a/cluster/terraform_aks_cluster/azure_metric_alerts.tf +++ b/cluster/terraform_aks_cluster/azure_metric_alerts.tf @@ -37,7 +37,7 @@ data "azurerm_subscription" "current" {} resource "azurerm_monitor_metric_alert" "port_exhaustion" { name = "${var.resource_prefix}-tsc-${var.environment}-port-exhaustion" resource_group_name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-rg" - scopes = ["/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${var.resource_prefix}-tsc-aks-nodes-${var.environment}-rg/providers/Microsoft.Network/loadBalancers/kubernetes"] + scopes = ["/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${azurerm_kubernetes_cluster.main.node_resource_group}/providers/Microsoft.Network/loadBalancers/kubernetes"] severity = 1 criteria { metric_namespace = "microsoft.network/loadbalancers" @@ -62,7 +62,7 @@ resource "azurerm_monitor_metric_alert" "port_exhaustion" { resource "azurerm_monitor_metric_alert" "high_port_usage" { name = "${var.resource_prefix}-tsc-${var.environment}-high-port-usage" resource_group_name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-rg" - scopes = ["/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${var.resource_prefix}-tsc-aks-nodes-${var.environment}-rg/providers/Microsoft.Network/loadBalancers/kubernetes"] + scopes = 
["/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${azurerm_kubernetes_cluster.main.node_resource_group}/providers/Microsoft.Network/loadBalancers/kubernetes"] severity = 2 criteria { metric_namespace = "microsoft.network/loadbalancers" diff --git a/cluster/terraform_kubernetes/gcp_wif.tf b/cluster/terraform_kubernetes/gcp_wif.tf index 6c1d35a7..580266ed 100644 --- a/cluster/terraform_kubernetes/gcp_wif.tf +++ b/cluster/terraform_kubernetes/gcp_wif.tf @@ -14,7 +14,7 @@ resource "kubernetes_service_account" "gcp_wif" { metadata { name = "gcp-wif" - namespace = each.key + namespace = kubernetes_namespace.default_list[each.key].metadata[0].name annotations = { "azure.workload.identity/client-id" = azurerm_user_assigned_identity.gcp_wif[each.key].client_id diff --git a/cluster/terraform_kubernetes/reloader.tf b/cluster/terraform_kubernetes/reloader.tf index 3026643d..9c55f518 100644 --- a/cluster/terraform_kubernetes/reloader.tf +++ b/cluster/terraform_kubernetes/reloader.tf @@ -39,7 +39,7 @@ resource "kubernetes_cluster_role" "reloader" { resource "kubernetes_service_account" "reloader" { metadata { name = "reloader" - namespace = "infra" + namespace = kubernetes_namespace.default_list["infra"].metadata[0].name } } @@ -66,7 +66,7 @@ resource "kubernetes_cluster_role_binding" "reloader" { resource "kubernetes_deployment" "reloader" { metadata { name = "reloader" - namespace = "infra" + namespace = kubernetes_namespace.default_list["infra"].metadata[0].name labels = { app = "reloader" } diff --git a/cluster/terraform_kubernetes/thanos.tf b/cluster/terraform_kubernetes/thanos.tf index bf066fcf..c4c0243c 100644 --- a/cluster/terraform_kubernetes/thanos.tf +++ b/cluster/terraform_kubernetes/thanos.tf @@ -21,7 +21,7 @@ resource "kubernetes_secret" "thanos" { metadata { name = "thanos-objstore-config" - namespace = "monitoring" + namespace = kubernetes_namespace.default_list["monitoring"].metadata[0].name } data = { From 
324be8e9a32f649505093192bbceaa8f893a7ceb Mon Sep 17 00:00:00 2001 From: RMcVelia Date: Tue, 21 Jan 2025 22:20:25 +0000 Subject: [PATCH 2/8] Update clone namespace exclusion --- scripts/export_aks_resources.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/export_aks_resources.sh b/scripts/export_aks_resources.sh index 46c448c4..68a676ba 100755 --- a/scripts/export_aks_resources.sh +++ b/scripts/export_aks_resources.sh @@ -6,7 +6,7 @@ set -eu set -o pipefail -NAMESPACE_BLACKLIST="default kube-node-lease kube-public kube-system gatekeeper-system" +NAMESPACE_BLACKLIST="default kube-node-lease kube-public kube-system gatekeeper-system infra monitoring" echo "Fetching custom namespaces..." all_namespaces=$(kubectl get namespaces -o name) From ea5e215b874534345eca5532c16cd0afa0248f44 Mon Sep 17 00:00:00 2001 From: RMcVelia Date: Tue, 21 Jan 2025 22:29:01 +0000 Subject: [PATCH 3/8] Add missing clone resources --- cluster/terraform_aks_cluster/main.tf | 25 +- cluster/terraform_kubernetes/analytics.tf | 27 +++ cluster/terraform_kubernetes/filebeat.tf | 188 ++++++++++++++- cluster/terraform_kubernetes/gcp_wif.tf | 16 ++ .../ingress_controller.tf | 228 ++++++++++++++++++ .../terraform_kubernetes/lowpriority_app.tf | 62 +++++ cluster/terraform_kubernetes/variables.tf | 5 + cluster/terraform_kubernetes/welcome_app.tf | 207 +++++++++++++++- 8 files changed, 753 insertions(+), 5 deletions(-) diff --git a/cluster/terraform_aks_cluster/main.tf b/cluster/terraform_aks_cluster/main.tf index fd28ccbb..34838e04 100644 --- a/cluster/terraform_aks_cluster/main.tf +++ b/cluster/terraform_aks_cluster/main.tf @@ -89,6 +89,8 @@ resource "azurerm_kubernetes_cluster" "clone" { node_resource_group = local.clone_node_resource_group_name dns_prefix = "${azurerm_kubernetes_cluster.main.dns_prefix}-clone" kubernetes_version = azurerm_kubernetes_cluster.main.kubernetes_version + oidc_issuer_enabled = true + workload_identity_enabled = true dynamic 
"azure_active_directory_role_based_access_control" { for_each = var.enable_azure_RBAC_clone ? [1] : [] @@ -111,7 +113,17 @@ resource "azurerm_kubernetes_cluster" "clone" { } identity { - type = "SystemAssigned" + type = "UserAssigned" + identity_ids = [data.azurerm_user_assigned_identity.aks_control_plane.id] + } + + network_profile { + network_plugin = "kubenet" + load_balancer_sku = "standard" + + load_balancer_profile { + outbound_ip_address_ids = [azurerm_public_ip.egress-public-ip-clone[0].id] + } } lifecycle { ignore_changes = [tags] } @@ -142,6 +154,17 @@ resource "azurerm_public_ip" "egress-public-ip" { lifecycle { ignore_changes = [tags] } } +resource "azurerm_public_ip" "egress-public-ip-clone" { + count = var.clone_cluster ? 1 : 0 + name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-clone-egress-pip" + location = data.azurerm_resource_group.cluster.location + resource_group_name = data.azurerm_resource_group.cluster.name + allocation_method = "Static" + sku = "Standard" + + lifecycle { ignore_changes = [tags] } +} + # resource "null_resource" "delayed_uncordon" { # # Trigger uncordon after node pool creation/updation diff --git a/cluster/terraform_kubernetes/analytics.tf b/cluster/terraform_kubernetes/analytics.tf index 1d372eda..cfef8074 100644 --- a/cluster/terraform_kubernetes/analytics.tf +++ b/cluster/terraform_kubernetes/analytics.tf @@ -22,3 +22,30 @@ resource "kubernetes_config_map" "ama_logs" { } } + +resource "kubernetes_config_map" "ama_logs_clone" { + count = var.clone_cluster ? 
1 : 0 + provider = kubernetes.clone + metadata { + name = "container-azm-ms-agentconfig" + namespace = "kube-system" + } + + data = { + config-version = "ver1" + log-data-collection-settings = <<-EOT + # Log data collection settings + + [log_collection_settings] + [log_collection_settings.env_var] + # In the absense of this configmap, default value for enabled is true + enabled = false + [log_collection_settings.filter_using_annotations] + # if enabled will exclude logs from pods with annotations fluentbit.io/exclude: "true". + # Read more: https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#kubernetes-annotations + enabled = true + EOT + schema-version = "v1" + } + +} diff --git a/cluster/terraform_kubernetes/filebeat.tf b/cluster/terraform_kubernetes/filebeat.tf index c6c6ac72..8b891045 100644 --- a/cluster/terraform_kubernetes/filebeat.tf +++ b/cluster/terraform_kubernetes/filebeat.tf @@ -13,7 +13,6 @@ resource "kubernetes_service_account" "filebeat" { } } - resource "kubernetes_cluster_role" "filebeat" { metadata { name = "filebeat" @@ -34,6 +33,7 @@ resource "kubernetes_cluster_role" "filebeat" { } } + resource "kubernetes_cluster_role_binding" "filebeat" { metadata { name = "filebeat" @@ -179,3 +179,189 @@ resource "kubernetes_daemonset" "filebeat" { } } } + +# +# Clone definition +# + +resource "kubernetes_service_account" "filebeat_clone" { + count = var.clone_cluster ? 1 : 0 + provider = kubernetes.clone + metadata { + name = "filebeat" + namespace = kubernetes_namespace.default_list_clone["monitoring"].metadata[0].name + labels = { + "name" = "filebeat" + } + } +} + +resource "kubernetes_cluster_role" "filebeat_clone" { + count = var.clone_cluster ? 
1 : 0 + provider = kubernetes.clone + metadata { + name = "filebeat" + labels = { + "name" = "filebeat" + } + } + rule { + api_groups = [""] + resources = ["pods", "nodes", "namespaces"] + verbs = ["get", "list", "watch"] + } + + rule { + api_groups = ["apps"] + resources = ["replicasets"] + verbs = ["get", "list", "watch"] + } + +} + +resource "kubernetes_cluster_role_binding" "filebeat_clone" { + count = var.clone_cluster ? 1 : 0 + provider = kubernetes.clone + metadata { + name = "filebeat" + labels = { + "name" = "filebeat" + } + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.filebeat_clone[0].metadata[0].name + } + + subject { + kind = "ServiceAccount" + name = "filebeat" + namespace = kubernetes_namespace.default_list_clone["monitoring"].metadata[0].name + } +} + +resource "kubernetes_config_map" "filebeat_clone" { + count = var.clone_cluster ? 1 : 0 + provider = kubernetes.clone + + metadata { + name = "filebeat-config-${local.config_map_hash}" + namespace = kubernetes_namespace.default_list_clone["monitoring"].metadata[0].name + } + + data = { + "filebeat.yml" = local.config_map_data + } + +} + +resource "kubernetes_daemonset" "filebeat_clone" { + count = var.clone_cluster ? 
1 : 0 + provider = kubernetes.clone + + metadata { + name = "filebeat" + namespace = kubernetes_namespace.default_list_clone["monitoring"].metadata[0].name + labels = { + app = "filebeat" + } + } + + spec { + selector { + match_labels = { + app = "filebeat" + } + } + + + template { + metadata { + labels = { + app = "filebeat" + } + } + + spec { + service_account_name = kubernetes_service_account.filebeat_clone[0].metadata[0].name + termination_grace_period_seconds = 30 + + node_selector = { + "teacherservices.cloud/node_pool" = "applications" + "kubernetes.io/os" = "linux" + } + + container { + image = "docker.elastic.co/beats/filebeat-oss:${var.filebeat_version}" + name = "filebeat" + + args = [ + "-c", + "filebeat.yml", + "-e", + ] + + security_context { + run_as_user = 0 + } + + resources { + limits = { + cpu = "200m" + memory = "200Mi" + } + requests = { + cpu = "100m" + memory = "100Mi" + } + } + + volume_mount { + mount_path = "/usr/share/filebeat/filebeat.yml" + name = "filebeat-config" + read_only = "true" + sub_path = "filebeat.yml" + } + + volume_mount { + mount_path = "/usr/share/filebeat/data" + name = "data" + } + + volume_mount { + mount_path = "/var/log" + name = "varlog" + read_only = "true" + } + + } + + volume { + name = "filebeat-config" + config_map { + name = kubernetes_config_map.filebeat_clone[0].metadata[0].name + default_mode = "0644" + } + } + + volume { + name = "varlog" + host_path { + path = "/var/log" + } + } + + volume { + name = "data" + host_path { + path = "/var/lib/filebeat-data" + type = "DirectoryOrCreate" + } + } + + } + } + } +} diff --git a/cluster/terraform_kubernetes/gcp_wif.tf b/cluster/terraform_kubernetes/gcp_wif.tf index 580266ed..1a71a6ff 100644 --- a/cluster/terraform_kubernetes/gcp_wif.tf +++ b/cluster/terraform_kubernetes/gcp_wif.tf @@ -22,6 +22,22 @@ resource "kubernetes_service_account" "gcp_wif" { } } +resource "kubernetes_service_account" "gcp_wif_clone" { + + for_each = var.clone_cluster ? 
toset(var.gcp_wif_namespaces) : [] + + provider = kubernetes.clone + + metadata { + name = "gcp-wif" + namespace = kubernetes_namespace.default_list_clone[each.key].metadata[0].name + + annotations = { + "azure.workload.identity/client-id" = azurerm_user_assigned_identity.gcp_wif[each.key].client_id + } + } +} + resource "azurerm_federated_identity_credential" "gcp_wif" { for_each = toset(var.gcp_wif_namespaces) diff --git a/cluster/terraform_kubernetes/ingress_controller.tf b/cluster/terraform_kubernetes/ingress_controller.tf index 9dce1fb1..c2db8446 100644 --- a/cluster/terraform_kubernetes/ingress_controller.tf +++ b/cluster/terraform_kubernetes/ingress_controller.tf @@ -212,6 +212,223 @@ resource "helm_release" "ingress-nginx" { } } +resource "helm_release" "ingress-nginx-clone" { + count = var.clone_cluster ? 1 : 0 + provider = helm.clone + + name = helm_release.ingress-nginx.name + repository = helm_release.ingress-nginx.repository + chart = helm_release.ingress-nginx.chart + version = helm_release.ingress-nginx.version + + # The first part of the name with simple dots is the keys path in the values.yml file e.g. controller.service.annotations + # The last part is the final key e.g. service\\.beta\\.kubernetes\\.io/azure-load-balancer-health-probe-request-path + # It may have double escaped dots if the key contains dots e.g. \\. 
+ # The corresponding value is in the "value" argument + # https://github.com/kubernetes/ingress-nginx/blob/main/charts/ingress-nginx/values.yaml + set { + name = "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-health-probe-request-path" + value = "/healthz" + type = "string" + } + # Resource group of the ingress public IP + # The cluster managed identity must have Network Contributor role on the resource group + set { + name = "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-resource-group" + value = azurerm_public_ip.ingress-public-ip-clone[0].resource_group_name + type = "string" + } + # Ingress IP + set { + name = "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-ipv4" + value = azurerm_public_ip.ingress-public-ip-clone[0].ip_address + type = "string" + } + # Route requests from the load balancer to the ingress pods on the same node instead of adding one more hop to the node with most pods. + # This preserves the client IP and removes a hop. It potentially creates a traffic imbalance but this should have no effect for us + # as we should have many well distributed ingress pods. + set { + name = "controller.service.externalTrafficPolicy" + value = "Local" + type = "string" + } + set { + name = "controller.extraArgs.default-ssl-certificate" + value = "default/cert-secret" + type = "string" + } + # Disable HTTP port 80 on the Azure load balancer + set { + name = "controller.service.enableHttp" + value = "false" + type = "auto" + } + # Allow POST requests with large body. Prevent error 413: Request entity too large + set { + name = "controller.config.proxy-body-size" + value = "50m" + type = "string" + } + # Sets the size of the buffer used for reading the first part of the response received from the proxied server. 
+ # Needs to be larger than the response header or nginx will return an error for the request + # https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#proxy-buffer-size + set { + name = "controller.config.proxy-buffer-size" + value = "24k" + type = "string" + } + # This ConfigMap setting sets the time, in seconds, during which a keep-alive client connection will stay open on the server side + set { + name = "controller.config.keep-alive" + value = "120" + type = "auto" + } + # This ConfigMap setting defines a timeout for reading client request header, in seconds + set { + name = "controller.config.client-header-timeout" + value = "120" + type = "auto" + } + set { + name = "controller.replicaCount" + value = 20 + type = "auto" + } + set { + name = "controller.nodeSelector.teacherservices\\.cloud/node_pool" + value = "applications" + type = "string" + } + + # Send X-Forwarded-For HTTP header to keep the client IP for the apps + # When used behind front door, it contains the front door backend IP as well + # The Host header is replaced by the value of X-Forwarded-Host header. 
When using front door, + # apps will see the external host instead of the ingress host + set { + name = "controller.config.use-forwarded-headers" + value = "true" + type = "string" + } + set { + name = "controller.config.compute-full-forwarded-for" + value = "true" + type = "string" + } + set { + name = "controller.resources.limits.cpu" + value = "500m" + type = "string" + } + set { + name = "controller.resources.limits.memory" + value = "512Mi" + type = "string" + } + + # Enable prometheus metrics and configure scraping + set { + name = "controller.metrics.enabled" + value = "true" + type = "auto" + } + set { + name = "controller.podAnnotations.prometheus\\.io/scrape" + value = "true" + type = "string" + } + set { + name = "controller.podAnnotations.prometheus\\.io/path" + value = "/metrics" + type = "string" + } + set { + name = "controller.podAnnotations.prometheus\\.io/port" + value = "10254" + type = "string" + } + # Enable shipping logs to Logit.io + set { + name = "controller.podAnnotations.logit\\.io/send" + value = "true" + type = "string" + } + # Disable shipping logs to Log analytics via Container insights + set { + name = "controller.podAnnotations.fluentbit\\.io/exclude" + value = "true" + type = "string" + } + + # Set ingress class name so it can be retrieved as an attribute to force dependencies + set { + name = "controller.ingressClassResource.name" + value = "nginx" + type = "string" + } + # Block access to /metrics endpoint + dynamic "set" { + for_each = var.block_metrics_endpoint ? 
[1] : [] + + content { + name = "controller.config.server-snippet" + value = <<-EOT + location /metrics { + deny all; + } + EOT + type = "string" + } + } + + set { + name = "controller.podSecurityContext.runAsUser" + value = "1000" + type = "auto" + } + set { + name = "controller.podSecurityContext.runAsGroup" + value = "3000" + type = "auto" + } + set { + name = "controller.securityContext.capabilities.drop[0]" + value = "ALL" + type = "string" + } + // By default, NET_BIND_SERVICE is added to the deployment by the Helm chart, even if we do not explicitly set it. + + set { + name = "controller.securityContext.allowPrivilegeEscalation" + value = "false" + type = "string" + } + set { + name = "controller.securityContext.privileged" + value = "false" + type = "string" + } + set { + name = "controller.securityContext.runAsNonRoot" + value = "true" + type = "string" + } + set { + name = "controller.securityContext.readOnlyRootFilesystem" + value = "true" + type = "string" + } + set { + name = "controller.securityContext.seccompProfile.type" + value = "RuntimeDefault" + type = "string" + } + set { + name = "controller.automountServiceAccountToken" + value = "false" + type = "string" + } +} + resource "azurerm_public_ip" "ingress-public-ip" { name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-ingress-pip" location = data.azurerm_resource_group.resource_group.location @@ -222,6 +439,17 @@ resource "azurerm_public_ip" "ingress-public-ip" { lifecycle { ignore_changes = [tags] } } +resource "azurerm_public_ip" "ingress-public-ip-clone" { + count = var.clone_cluster ? 
1 : 0 + name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-clone-ingress-pip" + location = data.azurerm_resource_group.resource_group.location + resource_group_name = data.azurerm_resource_group.resource_group.name + allocation_method = "Static" + sku = "Standard" + + lifecycle { ignore_changes = [tags] } +} + data "azurerm_resource_group" "resource_group" { name = var.resource_group_name } diff --git a/cluster/terraform_kubernetes/lowpriority_app.tf b/cluster/terraform_kubernetes/lowpriority_app.tf index 6bb3df34..9515f4e5 100644 --- a/cluster/terraform_kubernetes/lowpriority_app.tf +++ b/cluster/terraform_kubernetes/lowpriority_app.tf @@ -53,3 +53,65 @@ resource "kubernetes_deployment" "lowpriority_app" { } } } + +# +# Clone definition +# + +resource "kubernetes_priority_class_v1" "lowpriority_clone" { + count = var.enable_lowpriority_app_clone ? 1 : 0 + provider = kubernetes.clone + + metadata { + name = "lowpriority" + } + + value = -1 +} + +resource "kubernetes_deployment" "lowpriority_app_clone" { + count = var.enable_lowpriority_app_clone ? 
1 : 0 + provider = kubernetes.clone + + metadata { + name = local.lowpriority_app_name + namespace = local.lowpriority_app_namespace + } + spec { + replicas = var.lowpriority_app_replicas + selector { + match_labels = { + app = local.lowpriority_app_name + } + } + template { + metadata { + labels = { + app = local.lowpriority_app_name + } + } + spec { + node_selector = { + "teacherservices.cloud/node_pool" = "applications" + "kubernetes.io/os" = "linux" + } + priority_class_name = kubernetes_priority_class_v1.lowpriority_clone[0].metadata[0].name + container { + name = local.lowpriority_app_name + image = "k8s.gcr.io/pause" + + resources { + requests = { + cpu = var.lowpriority_app_cpu + memory = var.lowpriority_app_mem + } + limits = { + cpu = 1 + memory = var.lowpriority_app_mem + } + } + } + } + } + } +} diff --git a/cluster/terraform_kubernetes/variables.tf b/cluster/terraform_kubernetes/variables.tf index 3b2aa315..b35f9410 100644 --- a/cluster/terraform_kubernetes/variables.tf +++ b/cluster/terraform_kubernetes/variables.tf @@ -57,6 +57,11 @@ variable "enable_lowpriority_app" { default = false } +variable "enable_lowpriority_app_clone" { + type = bool + default = false +} + variable "grafana_app_cpu" { type = string default = "500m" diff --git a/cluster/terraform_kubernetes/welcome_app.tf b/cluster/terraform_kubernetes/welcome_app.tf index 69445712..307099f6 100644 --- a/cluster/terraform_kubernetes/welcome_app.tf +++ b/cluster/terraform_kubernetes/welcome_app.tf @@ -120,9 +120,6 @@ resource "kubernetes_deployment" "welcome_app" { } } - - - resource "kubernetes_service" "welcome_app" { metadata { name = local.welcome_app_name @@ -193,3 +190,207 @@ server { EOT } } + +# +# Clone definition +# + +resource "kubernetes_deployment" "welcome_app_clone" { + count = var.clone_cluster ? 
1 : 0 + provider = kubernetes.clone + metadata { + name = local.welcome_app_name + namespace = kubernetes_namespace.default_list_clone[local.welcome_app_namespace].metadata[0].name + } + spec { + replicas = 2 + selector { + match_labels = { + app = local.welcome_app_name + } + } + template { + metadata { + labels = { + app = local.welcome_app_name + } + annotations = { + "logit.io/send" = "true" + "fluentbit.io/exclude" = "true" + } + } + spec { + automount_service_account_token = false + node_selector = { + "teacherservices.cloud/node_pool" = "applications" + "kubernetes.io/os" = "linux" + } + topology_spread_constraint { + max_skew = 1 + topology_key = "topology.kubernetes.io/zone" + when_unsatisfiable = "DoNotSchedule" + label_selector { + match_labels = { + app = local.welcome_app_name + } + } + } + topology_spread_constraint { + max_skew = 1 + topology_key = "kubernetes.io/hostname" + when_unsatisfiable = "ScheduleAnyway" + label_selector { + match_labels = { + app = local.welcome_app_name + } + } + } + + container { + name = local.welcome_app_name + image = "nginxinc/nginx-unprivileged:1.27.3-alpine3.20" + + security_context { + run_as_user = 1000 + run_as_group = 3000 + capabilities { + drop = ["ALL"] + } + allow_privilege_escalation = false + privileged = false + run_as_non_root = true + read_only_root_filesystem = true + seccomp_profile { + type = "RuntimeDefault" + } + } + + resources { + requests = { + cpu = "100m" + memory = "64M" + } + limits = { + cpu = "100m" + memory = "64M" + } + } + + port { + container_port = 80 + } + + # Mount the updated NGINX configuration from the ConfigMap + volume_mount { + name = "nginx-config-volume" + mount_path = "/etc/nginx/conf.d/default.conf" + sub_path = "default.conf" + } + + # Mount a writable volume to /tmp + volume_mount { + name = "nginx-tmp" + mount_path = "/tmp" + } + volume_mount { + name = "nginx-cache" + mount_path = "/var/cache/nginx" + } + } + + volume { + name = "nginx-config-volume" + config_map { + 
name = kubernetes_config_map.welcome_app_nginx_config_clone[0].metadata[0].name + } + } + + volume { + name = "nginx-tmp" + empty_dir {} + } + + volume { + name = "nginx-cache" + empty_dir {} + } + } + } + } +} + +resource "kubernetes_service" "welcome_app_clone" { + count = var.clone_cluster ? 1 : 0 + provider = kubernetes.clone + metadata { + name = local.welcome_app_name + namespace = kubernetes_namespace.default_list_clone[local.welcome_app_namespace].metadata[0].name + } + spec { + type = "ClusterIP" + port { + port = 80 + target_port = 8080 + } + selector = { + app = local.welcome_app_name + } + } +} + +resource "kubernetes_ingress_v1" "welcome_app_clone" { + for_each = var.clone_cluster ? toset(local.welcome_app_hostnames) : [] + provider = kubernetes.clone + + wait_for_load_balancer = true + metadata { + name = "${local.welcome_app_name}-${each.value}" + namespace = kubernetes_namespace.default_list_clone[local.welcome_app_namespace].metadata[0].name + } + spec { + ingress_class_name = local.ingress_class_name + rule { + host = each.value + http { + path { + backend { + service { + name = kubernetes_service.welcome_app_clone[0].metadata[0].name + port { + number = kubernetes_service.welcome_app_clone[0].spec[0].port[0].port + } + } + } + } + } + } + } +} + +resource "kubernetes_config_map" "welcome_app_nginx_config_clone" { + count = var.clone_cluster ? 
1 : 0 + provider = kubernetes.clone + metadata { + name = "nginx-config" + namespace = kubernetes_namespace.default_list_clone[local.welcome_app_namespace].metadata[0].name + } + + data = { + "default.conf" = < Date: Tue, 21 Jan 2025 22:46:44 +0000 Subject: [PATCH 4/8] Update rebuild cluster documentation --- documentation/rebuild-cluster.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/documentation/rebuild-cluster.md b/documentation/rebuild-cluster.md index 16fa0728..89fb5b0e 100644 --- a/documentation/rebuild-cluster.md +++ b/documentation/rebuild-cluster.md @@ -24,8 +24,10 @@ Then we use scripts to export the resources from each namespace from one cluster Add or update the variables and apply: - `cluster/terraform_aks_cluster/config/.tfvars.json`: - `"clone_cluster": true` + - `"enable_azure_RBAC_clone": true` - `cluster/terraform_kubernetes/config/.tfvars.json`: - `"clone_cluster": true` + - `"enable_lowpriority_app_clone": true` (if enabled for main) - Run: `make terraform-apply` ## Export resources from main cluster @@ -57,7 +59,11 @@ Since the applications domain points to the main cluster, you won't be able to t ## Rebuild the first cluster - Wait for traffic to stop on main cluster. You can now make changes on the main cluster without impacting users. - Delete the non-system pod disruption budgets (check with `kubectl get pdb -A`) +- Delete the aks-systems-logs diagnostic setting from the main cluster under Monitoring -> Diagnostic settings - Make the required code changes in terraform +- Update your local branch to ignore the app domain record change that was made earlier, otherwise it will reset to the main cluster IP + - `cluster/terraform_kubernetes/config/dns.tf`: + - Add `lifecycle { ignore_changes = [records] }` to the cluster_a_record - Run terraform-plan to check the changes and make sure only the main cluter is updated. 
Most of the cloned configuration is referenced from the main cluster, so if the value changes on the main cluster, it would also impact the cloned cluster and force a rebuild, which would disrupt users. If it's the case, hardcode the original value temporarily for the cloned cluster. For instance, if you want to change the default node pool vm_size from "Standard_D2_v2", change the cloned cluster from: ``` @@ -84,16 +90,21 @@ Since the applications domain points to the main cluster, you won't be able to t - Restore your hosts file ## Route traffic to the main cluster -- Change the the applications domain record in the DNS zone manually +- Revert the local branch change to ignore the app domain record change that was made earlier + - `cluster/terraform_kubernetes/config/dns.tf`: + - Delete `lifecycle { ignore_changes = [records] }` from the cluster_a_record +- Run terraform-apply or change the applications domain record in the DNS zone manually - Wait at least 5 min for the TTL to expire - Update the DNS record in terraform code ## Delete the temp cluster - `cluster/terraform_aks_cluster/config/.tfvars.json`: - `"clone_cluster": false` (or remove variable) + - `"enable_azure_RBAC_clone": false` (or remove variable) - Run: `make terraform-kubernetes-apply` - `cluster/terraform_kubernetes/config/.tfvars.json`: - `"clone_cluster": false` (or remove variable) + - `"enable_lowpriority_app_clone": false` (or remove variable) - Run: `make terraform-aks-cluster-apply`. It may be necessary to remove the pod disruption budgets (check with `kubectl get pdb -A`). 
## Delete the export files From eb1b49c87a9199f6fa75a2368e7a44beddd1a0f1 Mon Sep 17 00:00:00 2001 From: RMcVelia Date: Wed, 22 Jan 2025 12:14:26 +0000 Subject: [PATCH 5/8] Fix terraform formatting --- cluster/terraform_aks_cluster/main.tf | 14 +++++++------- cluster/terraform_kubernetes/analytics.tf | 2 +- cluster/terraform_kubernetes/filebeat.tf | 10 +++++----- cluster/terraform_kubernetes/ingress_controller.tf | 4 ++-- cluster/terraform_kubernetes/lowpriority_app.tf | 4 ++-- cluster/terraform_kubernetes/welcome_app.tf | 6 +++--- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cluster/terraform_aks_cluster/main.tf b/cluster/terraform_aks_cluster/main.tf index 34838e04..030a0575 100644 --- a/cluster/terraform_aks_cluster/main.tf +++ b/cluster/terraform_aks_cluster/main.tf @@ -83,12 +83,12 @@ resource "azurerm_kubernetes_cluster_node_pool" "node_pools" { resource "azurerm_kubernetes_cluster" "clone" { count = var.clone_cluster ? 1 : 0 - name = local.clone_cluster_name - location = azurerm_kubernetes_cluster.main.location - resource_group_name = azurerm_kubernetes_cluster.main.resource_group_name - node_resource_group = local.clone_node_resource_group_name - dns_prefix = "${azurerm_kubernetes_cluster.main.dns_prefix}-clone" - kubernetes_version = azurerm_kubernetes_cluster.main.kubernetes_version + name = local.clone_cluster_name + location = azurerm_kubernetes_cluster.main.location + resource_group_name = azurerm_kubernetes_cluster.main.resource_group_name + node_resource_group = local.clone_node_resource_group_name + dns_prefix = "${azurerm_kubernetes_cluster.main.dns_prefix}-clone" + kubernetes_version = azurerm_kubernetes_cluster.main.kubernetes_version oidc_issuer_enabled = true workload_identity_enabled = true @@ -155,7 +155,7 @@ resource "azurerm_public_ip" "egress-public-ip" { } resource "azurerm_public_ip" "egress-public-ip-clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 
1 : 0 name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-clone-egress-pip" location = data.azurerm_resource_group.cluster.location resource_group_name = data.azurerm_resource_group.cluster.name diff --git a/cluster/terraform_kubernetes/analytics.tf b/cluster/terraform_kubernetes/analytics.tf index cfef8074..8169450a 100644 --- a/cluster/terraform_kubernetes/analytics.tf +++ b/cluster/terraform_kubernetes/analytics.tf @@ -24,7 +24,7 @@ resource "kubernetes_config_map" "ama_logs" { } resource "kubernetes_config_map" "ama_logs_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = kubernetes.clone metadata { name = "container-azm-ms-agentconfig" diff --git a/cluster/terraform_kubernetes/filebeat.tf b/cluster/terraform_kubernetes/filebeat.tf index 8b891045..7c09be1e 100644 --- a/cluster/terraform_kubernetes/filebeat.tf +++ b/cluster/terraform_kubernetes/filebeat.tf @@ -185,7 +185,7 @@ resource "kubernetes_daemonset" "filebeat" { # resource "kubernetes_service_account" "filebeat_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = kubernetes.clone metadata { name = "filebeat" @@ -197,7 +197,7 @@ resource "kubernetes_service_account" "filebeat_clone" { } resource "kubernetes_cluster_role" "filebeat_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = kubernetes.clone metadata { name = "filebeat" @@ -220,7 +220,7 @@ resource "kubernetes_cluster_role" "filebeat_clone" { } resource "kubernetes_cluster_role_binding" "filebeat_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = kubernetes.clone metadata { name = "filebeat" @@ -243,7 +243,7 @@ resource "kubernetes_cluster_role_binding" "filebeat_clone" { } resource "kubernetes_config_map" "filebeat_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 
1 : 0 provider = kubernetes.clone metadata { @@ -258,7 +258,7 @@ resource "kubernetes_config_map" "filebeat_clone" { } resource "kubernetes_daemonset" "filebeat_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = kubernetes.clone metadata { diff --git a/cluster/terraform_kubernetes/ingress_controller.tf b/cluster/terraform_kubernetes/ingress_controller.tf index c2db8446..f028ee0c 100644 --- a/cluster/terraform_kubernetes/ingress_controller.tf +++ b/cluster/terraform_kubernetes/ingress_controller.tf @@ -213,7 +213,7 @@ resource "helm_release" "ingress-nginx" { } resource "helm_release" "ingress-nginx-clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = helm.clone name = helm_release.ingress-nginx.name @@ -440,7 +440,7 @@ resource "azurerm_public_ip" "ingress-public-ip" { } resource "azurerm_public_ip" "ingress-public-ip-clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 name = "${var.resource_prefix}-tsc-aks-nodes-${var.environment}-clone-ingress-pip" location = data.azurerm_resource_group.resource_group.location resource_group_name = data.azurerm_resource_group.resource_group.name diff --git a/cluster/terraform_kubernetes/lowpriority_app.tf b/cluster/terraform_kubernetes/lowpriority_app.tf index 9515f4e5..c32f9511 100644 --- a/cluster/terraform_kubernetes/lowpriority_app.tf +++ b/cluster/terraform_kubernetes/lowpriority_app.tf @@ -59,7 +59,7 @@ resource "kubernetes_deployment" "lowpriority_app" { # resource "kubernetes_priority_class_v1" "lowpriority_clone" { - count = var.enable_lowpriority_app_clone ? 1 : 0 + count = var.enable_lowpriority_app_clone ? 1 : 0 provider = kubernetes.clone metadata { @@ -70,7 +70,7 @@ resource "kubernetes_priority_class_v1" "lowpriority_clone" { } resource "kubernetes_deployment" "lowpriority_app_clone" { - count = var.enable_lowpriority_app_clone ? 1 : 0 + count = var.enable_lowpriority_app_clone ? 
1 : 0 provider = kubernetes.clone metadata { diff --git a/cluster/terraform_kubernetes/welcome_app.tf b/cluster/terraform_kubernetes/welcome_app.tf index 307099f6..d364ad22 100644 --- a/cluster/terraform_kubernetes/welcome_app.tf +++ b/cluster/terraform_kubernetes/welcome_app.tf @@ -196,7 +196,7 @@ EOT # resource "kubernetes_deployment" "welcome_app_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = kubernetes.clone metadata { name = local.welcome_app_name @@ -320,7 +320,7 @@ resource "kubernetes_deployment" "welcome_app_clone" { } resource "kubernetes_service" "welcome_app_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = kubernetes.clone metadata { name = local.welcome_app_name @@ -368,7 +368,7 @@ resource "kubernetes_ingress_v1" "welcome_app_clone" { } resource "kubernetes_config_map" "welcome_app_nginx_config_clone" { - count = var.clone_cluster ? 1 : 0 + count = var.clone_cluster ? 1 : 0 provider = kubernetes.clone metadata { name = "nginx-config" From f85503bd07f37cb58450f8eacf76b5117206b5ed Mon Sep 17 00:00:00 2001 From: RMcVelia Date: Wed, 29 Jan 2025 13:02:10 +0000 Subject: [PATCH 6/8] Improve nginx ingress clone tf config --- .../ingress_controller.tf | 206 ++---------------- 1 file changed, 14 insertions(+), 192 deletions(-) diff --git a/cluster/terraform_kubernetes/ingress_controller.tf b/cluster/terraform_kubernetes/ingress_controller.tf index f028ee0c..641cff61 100644 --- a/cluster/terraform_kubernetes/ingress_controller.tf +++ b/cluster/terraform_kubernetes/ingress_controller.tf @@ -221,16 +221,21 @@ resource "helm_release" "ingress-nginx-clone" { chart = helm_release.ingress-nginx.chart version = helm_release.ingress-nginx.version - # The first part of the name with simple dots is the keys path in the values.yml file e.g. controller.service.annotations - # The last part is the final key e.g. 
service\\.beta\\.kubernetes\\.io/azure-load-balancer-health-probe-request-path - # It may have double escaped dots if the key contains dots e.g. \\. - # The corresponding value is in the "value" argument - # https://github.com/kubernetes/ingress-nginx/blob/main/charts/ingress-nginx/values.yaml - set { - name = "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-health-probe-request-path" - value = "/healthz" - type = "string" + dynamic "set" { + # Exclude the load balancer IP to force clone to use dynamic Public IP for load balancer ingress + for_each = [ + for s in helm_release.ingress-nginx.set : s + if s.name != "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-ipv4" + && s.name != "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-resource-group" + ] + + content { + name = set.value["name"] + value = set.value["value"] + type = set.value["type"] + } } + # Resource group of the ingress public IP # The cluster managed identity must have Network Contributor role on the resource group set { @@ -244,189 +249,6 @@ resource "helm_release" "ingress-nginx-clone" { value = azurerm_public_ip.ingress-public-ip-clone[0].ip_address type = "string" } - # Route requests from the load balancer to the ingress pods on the same node instead of adding one more hop to the node with most pods. - # This preserves the client IP and removes a hop. It potentially creates a traffic imbalance but this should have no effect for us - # as we should have many well distributed ingress pods. - set { - name = "controller.service.externalTrafficPolicy" - value = "Local" - type = "string" - } - set { - name = "controller.extraArgs.default-ssl-certificate" - value = "default/cert-secret" - type = "string" - } - # Disable HTTP port 80 on the Azure load balancer - set { - name = "controller.service.enableHttp" - value = "false" - type = "auto" - } - # Allow POST requests with large body. 
Prevent error 413: Request entity too large - set { - name = "controller.config.proxy-body-size" - value = "50m" - type = "string" - } - # Sets the size of the buffer used for reading the first part of the response received from the proxied server. - # Needs to be larger than the response header or nginx will return an error for the request - # https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#proxy-buffer-size - set { - name = "controller.config.proxy-buffer-size" - value = "24k" - type = "string" - } - # This ConfigMap setting sets the time, in seconds, during which a keep-alive client connection will stay open on the server side - set { - name = "controller.config.keep-alive" - value = "120" - type = "auto" - } - # This ConfigMap setting defines a timeout for reading client request header, in seconds - set { - name = "controller.config.client-header-timeout" - value = "120" - type = "auto" - } - set { - name = "controller.replicaCount" - value = 20 - type = "auto" - } - set { - name = "controller.nodeSelector.teacherservices\\.cloud/node_pool" - value = "applications" - type = "string" - } - - # Send X-Forwarded-For HTTP header to keep the client IP for the apps - # When used behind front door, it contains the front door backend IP as well - # The Host header is replaced by the value of X-Forwarded-Host header. 
When using front door, - # apps will see the external host instead of the ingress host - set { - name = "controller.config.use-forwarded-headers" - value = "true" - type = "string" - } - set { - name = "controller.config.compute-full-forwarded-for" - value = "true" - type = "string" - } - set { - name = "controller.resources.limits.cpu" - value = "500m" - type = "string" - } - set { - name = "controller.resources.limits.memory" - value = "512Mi" - type = "string" - } - - # Enable prometheus metrics and configure scraping - set { - name = "controller.metrics.enabled" - value = "true" - type = "auto" - } - set { - name = "controller.podAnnotations.prometheus\\.io/scrape" - value = "true" - type = "string" - } - set { - name = "controller.podAnnotations.prometheus\\.io/path" - value = "/metrics" - type = "string" - } - set { - name = "controller.podAnnotations.prometheus\\.io/port" - value = "10254" - type = "string" - } - # Enable shipping logs to Logit.io - set { - name = "controller.podAnnotations.logit\\.io/send" - value = "true" - type = "string" - } - # Disable shipping logs to Log analytics via Container insights - set { - name = "controller.podAnnotations.fluentbit\\.io/exclude" - value = "true" - type = "string" - } - - # Set ingress class name so it can be retrieved as an attribute to force dependencies - set { - name = "controller.ingressClassResource.name" - value = "nginx" - type = "string" - } - # Block access to /metrics endpoint - dynamic "set" { - for_each = var.block_metrics_endpoint ? 
[1] : [] - - content { - name = "controller.config.server-snippet" - value = <<-EOT - location /metrics { - deny all; - } - EOT - type = "string" - } - } - - set { - name = "controller.podSecurityContext.runAsUser" - value = "1000" - type = "auto" - } - set { - name = "controller.podSecurityContext.runAsGroup" - value = "3000" - type = "auto" - } - set { - name = "controller.securityContext.capabilities.drop[0]" - value = "ALL" - type = "string" - } - // By default, NET_BIND_SERVICE is added to the deployment by the Helm chart, even if we do not explicitly set it. - - set { - name = "controller.securityContext.allowPrivilegeEscalation" - value = "false" - type = "string" - } - set { - name = "controller.securityContext.privileged" - value = "false" - type = "string" - } - set { - name = "controller.securityContext.runAsNonRoot" - value = "true" - type = "string" - } - set { - name = "controller.securityContext.readOnlyRootFilesystem" - value = "true" - type = "string" - } - set { - name = "controller.securityContext.seccompProfile.type" - value = "RuntimeDefault" - type = "string" - } - set { - name = "controller.automountServiceAccountToken" - value = "false" - type = "string" - } } resource "azurerm_public_ip" "ingress-public-ip" { From 23ea0872d37346afb023e3cd479828e8d766821c Mon Sep 17 00:00:00 2001 From: RMcVelia Date: Wed, 29 Jan 2025 13:04:36 +0000 Subject: [PATCH 7/8] Remove unnecessary clone resources --- .../terraform_kubernetes/lowpriority_app.tf | 62 ------ cluster/terraform_kubernetes/variables.tf | 5 - cluster/terraform_kubernetes/welcome_app.tf | 204 ------------------ documentation/rebuild-cluster.md | 2 - 4 files changed, 273 deletions(-) diff --git a/cluster/terraform_kubernetes/lowpriority_app.tf b/cluster/terraform_kubernetes/lowpriority_app.tf index c32f9511..6bb3df34 100644 --- a/cluster/terraform_kubernetes/lowpriority_app.tf +++ b/cluster/terraform_kubernetes/lowpriority_app.tf @@ -53,65 +53,3 @@ resource "kubernetes_deployment" 
"lowpriority_app" { } } } - -# -# Clone definition -# - -resource "kubernetes_priority_class_v1" "lowpriority_clone" { - count = var.enable_lowpriority_app_clone ? 1 : 0 - provider = kubernetes.clone - - metadata { - name = "lowpriority" - } - - value = -1 -} - -resource "kubernetes_deployment" "lowpriority_app_clone" { - count = var.enable_lowpriority_app_clone ? 1 : 0 - provider = kubernetes.clone - - metadata { - name = local.lowpriority_app_name - namespace = local.lowpriority_app_namespace - } - spec { - replicas = var.lowpriority_app_replicas - selector { - match_labels = { - app = local.lowpriority_app_name - } - } - template { - metadata { - labels = { - app = local.lowpriority_app_name - } - } - spec { - node_selector = { - "teacherservices.cloud/node_pool" = "applications" - "kubernetes.io/os" = "linux" - } - priority_class_name = "lowpriority" - container { - name = local.lowpriority_app_name - image = "k8s.gcr.io/pause" - - resources { - requests = { - cpu = var.lowpriority_app_cpu - memory = var.lowpriority_app_mem - } - limits = { - cpu = 1 - memory = var.lowpriority_app_mem - } - } - } - } - } - } -} diff --git a/cluster/terraform_kubernetes/variables.tf b/cluster/terraform_kubernetes/variables.tf index b35f9410..3b2aa315 100644 --- a/cluster/terraform_kubernetes/variables.tf +++ b/cluster/terraform_kubernetes/variables.tf @@ -57,11 +57,6 @@ variable "enable_lowpriority_app" { default = false } -variable "enable_lowpriority_app_clone" { - type = bool - default = false -} - variable "grafana_app_cpu" { type = string default = "500m" diff --git a/cluster/terraform_kubernetes/welcome_app.tf b/cluster/terraform_kubernetes/welcome_app.tf index d364ad22..d712ad92 100644 --- a/cluster/terraform_kubernetes/welcome_app.tf +++ b/cluster/terraform_kubernetes/welcome_app.tf @@ -190,207 +190,3 @@ server { EOT } } - -# -# Clone definition -# - -resource "kubernetes_deployment" "welcome_app_clone" { - count = var.clone_cluster ? 
1 : 0 - provider = kubernetes.clone - metadata { - name = local.welcome_app_name - namespace = kubernetes_namespace.default_list_clone[local.welcome_app_namespace].metadata[0].name - } - spec { - replicas = 2 - selector { - match_labels = { - app = local.welcome_app_name - } - } - template { - metadata { - labels = { - app = local.welcome_app_name - } - annotations = { - "logit.io/send" = "true" - "fluentbit.io/exclude" = "true" - } - } - spec { - automount_service_account_token = false - node_selector = { - "teacherservices.cloud/node_pool" = "applications" - "kubernetes.io/os" = "linux" - } - topology_spread_constraint { - max_skew = 1 - topology_key = "topology.kubernetes.io/zone" - when_unsatisfiable = "DoNotSchedule" - label_selector { - match_labels = { - app = local.welcome_app_name - } - } - } - topology_spread_constraint { - max_skew = 1 - topology_key = "kubernetes.io/hostname" - when_unsatisfiable = "ScheduleAnyway" - label_selector { - match_labels = { - app = local.welcome_app_name - } - } - } - - container { - name = local.welcome_app_name - image = "nginxinc/nginx-unprivileged:1.27.3-alpine3.20" - - security_context { - run_as_user = 1000 - run_as_group = 3000 - capabilities { - drop = ["ALL"] - } - allow_privilege_escalation = false - privileged = false - run_as_non_root = true - read_only_root_filesystem = true - seccomp_profile { - type = "RuntimeDefault" - } - } - - resources { - requests = { - cpu = "100m" - memory = "64M" - } - limits = { - cpu = "100m" - memory = "64M" - } - } - - port { - container_port = 80 - } - - # Mount the updated NGINX configuration from the ConfigMap - volume_mount { - name = "nginx-config-volume" - mount_path = "/etc/nginx/conf.d/default.conf" - sub_path = "default.conf" - } - - # Mount a writable volume to /tmp - volume_mount { - name = "nginx-tmp" - mount_path = "/tmp" - } - volume_mount { - name = "nginx-cache" - mount_path = "/var/cache/nginx" - } - } - - volume { - name = "nginx-config-volume" - config_map { - 
name = kubernetes_config_map.welcome_app_nginx_config_clone[0].metadata[0].name - } - } - - volume { - name = "nginx-tmp" - empty_dir {} - } - - volume { - name = "nginx-cache" - empty_dir {} - } - } - } - } -} - -resource "kubernetes_service" "welcome_app_clone" { - count = var.clone_cluster ? 1 : 0 - provider = kubernetes.clone - metadata { - name = local.welcome_app_name - namespace = kubernetes_namespace.default_list_clone[local.welcome_app_namespace].metadata[0].name - } - spec { - type = "ClusterIP" - port { - port = 80 - target_port = 8080 - } - selector = { - app = local.welcome_app_name - } - } -} - -resource "kubernetes_ingress_v1" "welcome_app_clone" { - for_each = var.clone_cluster ? toset(local.welcome_app_hostnames) : [] - provider = kubernetes.clone - - wait_for_load_balancer = true - metadata { - name = "${local.welcome_app_name}-${each.value}" - namespace = local.welcome_app_namespace - } - spec { - ingress_class_name = local.ingress_class_name - rule { - host = each.value - http { - path { - backend { - service { - name = kubernetes_service.welcome_app_clone[0].metadata[0].name - port { - number = kubernetes_service.welcome_app_clone[0].spec[0].port[0].port - } - } - } - } - } - } - } -} - -resource "kubernetes_config_map" "welcome_app_nginx_config_clone" { - count = var.clone_cluster ? 
1 : 0 - provider = kubernetes.clone - metadata { - name = "nginx-config" - namespace = kubernetes_namespace.default_list_clone[local.welcome_app_namespace].metadata[0].name - } - - data = { - "default.conf" = <.tfvars.json`: - `"clone_cluster": true` - - `"enable_lowpriority_app_clone": true` (if enabled for main) - Run: `make terraform-apply` ## Export resources from main cluster @@ -104,7 +103,6 @@ Since the applications domain points to the main cluster, you won't be able to t - Run: `make terraform-kubernetes-apply` - `cluster/terraform_kubernetes/config/.tfvars.json`: - `"clone_cluster": false` (or remove variable) - - `"enable_lowpriority_app_clone": false` (or remove variable) - Run: `make terraform-aks-cluster-apply`. It may be necessary to remove the pod disruption budgets (check with `kubectl get pdb -A`). ## Delete the export files From 3ff2149b254f5195e8827b133376ef057e351e2e Mon Sep 17 00:00:00 2001 From: RMcVelia Date: Wed, 29 Jan 2025 16:52:51 +0000 Subject: [PATCH 8/8] set Azure AD RBAC default to true for all clusters --- cluster/terraform_aks_cluster/config/development.tfvars.json | 3 +-- .../terraform_aks_cluster/config/platform-test.tfvars.json | 3 +-- cluster/terraform_aks_cluster/config/production.tfvars.json | 1 - cluster/terraform_aks_cluster/config/test.tfvars.json | 3 +-- cluster/terraform_aks_cluster/variables.tf | 4 ++-- documentation/rebuild-cluster.md | 2 -- 6 files changed, 5 insertions(+), 11 deletions(-) diff --git a/cluster/terraform_aks_cluster/config/development.tfvars.json b/cluster/terraform_aks_cluster/config/development.tfvars.json index c27206da..a195a1b8 100644 --- a/cluster/terraform_aks_cluster/config/development.tfvars.json +++ b/cluster/terraform_aks_cluster/config/development.tfvars.json @@ -15,6 +15,5 @@ "orchestrator_version": "1.30.6" } }, - "admin_group_id": "f77b2daf-7ff4-4aa5-8138-cf983d0b4a18", - "enable_azure_RBAC": true + "admin_group_id": "f77b2daf-7ff4-4aa5-8138-cf983d0b4a18" } diff --git 
a/cluster/terraform_aks_cluster/config/platform-test.tfvars.json b/cluster/terraform_aks_cluster/config/platform-test.tfvars.json index f3bcfaac..6d58653a 100644 --- a/cluster/terraform_aks_cluster/config/platform-test.tfvars.json +++ b/cluster/terraform_aks_cluster/config/platform-test.tfvars.json @@ -15,6 +15,5 @@ "orchestrator_version": "1.30.6" } }, - "admin_group_id": "f726cc54-78cb-4c98-89a6-b8e4396afb98", - "enable_azure_RBAC": true + "admin_group_id": "f726cc54-78cb-4c98-89a6-b8e4396afb98" } diff --git a/cluster/terraform_aks_cluster/config/production.tfvars.json b/cluster/terraform_aks_cluster/config/production.tfvars.json index 16272902..87898e71 100644 --- a/cluster/terraform_aks_cluster/config/production.tfvars.json +++ b/cluster/terraform_aks_cluster/config/production.tfvars.json @@ -17,6 +17,5 @@ } }, "admin_group_id": "5b0f84de-54a8-481a-8689-f3c226597259", - "enable_azure_RBAC": true, "ci_collection_interval": "1m" } diff --git a/cluster/terraform_aks_cluster/config/test.tfvars.json b/cluster/terraform_aks_cluster/config/test.tfvars.json index 0bcdd6e2..25c2eae8 100644 --- a/cluster/terraform_aks_cluster/config/test.tfvars.json +++ b/cluster/terraform_aks_cluster/config/test.tfvars.json @@ -16,6 +16,5 @@ "orchestrator_version": "1.30.6" } }, - "admin_group_id": "21b2f2a6-231e-45cb-b624-d5521b820941", - "enable_azure_RBAC": true + "admin_group_id": "21b2f2a6-231e-45cb-b624-d5521b820941" } diff --git a/cluster/terraform_aks_cluster/variables.tf b/cluster/terraform_aks_cluster/variables.tf index d63b1076..54aa4eb9 100644 --- a/cluster/terraform_aks_cluster/variables.tf +++ b/cluster/terraform_aks_cluster/variables.tf @@ -29,12 +29,12 @@ variable "admin_group_id" { } variable "enable_azure_RBAC" { type = bool - default = false + default = true description = "Enable Azure AD RBAC on this cluster" } variable "enable_azure_RBAC_clone" { type = bool - default = false + default = true description = "Enable Azure AD RBAC on the clone cluster" } diff --git 
a/documentation/rebuild-cluster.md b/documentation/rebuild-cluster.md index 07632f1b..7beadb61 100644 --- a/documentation/rebuild-cluster.md +++ b/documentation/rebuild-cluster.md @@ -24,7 +24,6 @@ Then we use scripts to export the resources from each namespace from one cluster Add or update the variables and apply: - `cluster/terraform_aks_cluster/config/.tfvars.json`: - `"clone_cluster": true` - - `"enable_azure_RBAC_clone": true` - `cluster/terraform_kubernetes/config/.tfvars.json`: - `"clone_cluster": true` - Run: `make terraform-apply` @@ -99,7 +98,6 @@ Since the applications domain points to the main cluster, you won't be able to t ## Delete the temp cluster - `cluster/terraform_aks_cluster/config/.tfvars.json`: - `"clone_cluster": false` (or remove variable) - - `"enable_azure_RBAC_clone": false` (or remove variable) - Run: `make terraform-kubernetes-apply` - `cluster/terraform_kubernetes/config/.tfvars.json`: - `"clone_cluster": false` (or remove variable)