Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[925] Migrate development cluster to Azure RBAC #146

Merged
merged 11 commits into from
Jan 30, 2024
10 changes: 2 additions & 8 deletions .github/actions/deploy-environment/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,13 @@ runs:
make ci ${{ inputs.environment_name }} deploy-azure-resources
shell: bash

- run: |
TFSTATE_CONTAINER_ACCESS_KEY="$(az storage account keys list -g ${{ env.resource_group_name }} -n ${{ env.storage_account_name }} | jq -r '.[0].value')"
echo "::add-mask::$TFSTATE_CONTAINER_ACCESS_KEY"
echo "TFSTATE_CONTAINER_ACCESS_KEY=$TFSTATE_CONTAINER_ACCESS_KEY" >> $GITHUB_ENV
shell: bash

- uses: hashicorp/setup-terraform@v2
with:
terraform_version: ${{ env.terraform_version }}
terraform_wrapper: false

- name: Set ARM environment variables
uses: DFE-Digital/github-actions/set-arm-environment-variables@master
- name: Set ARM and kubelogin environment
uses: DFE-Digital/github-actions/set-kubelogin-environment@master
with:
azure-credentials: ${{ inputs.AZURE_CREDENTIALS }}

Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ domains-infra-apply: domains-infra-init

# Fetch the kubeconfig entry for the AKS cluster named from RESOURCE_PREFIX,
# ENVIRONMENT and CLONE_STRING, then convert it with kubelogin.
# The kubelogin login mode is "spn" when GITHUB_ACTIONS is non-empty and
# "azurecli" otherwise.
get-cluster-credentials: set-azure-account ## make <config> get-cluster-credentials [ENVIRONMENT=<clusterX>]
az aks get-credentials --overwrite-existing -g ${RESOURCE_GROUP_NAME} -n ${RESOURCE_PREFIX}-tsc-${ENVIRONMENT}${CLONE_STRING}-aks
kubelogin convert-kubeconfig -l $(if ${GITHUB_ACTIONS},spn,azurecli)

disable-cluster-node-autoscaler: set-azure-account
$(if $(NODE_POOL), , $(error Please specify a node pool))
Expand Down
4 changes: 3 additions & 1 deletion cluster/terraform_aks_cluster/config/development.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,7 @@
},
"orchestrator_version": "1.26.10"
}
}
},
"admin_group_id": "f77b2daf-7ff4-4aa5-8138-cf983d0b4a18",
"enable_azure_RBAC": true
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
},
"orchestrator_version": "1.26.10"
}
}
},
"admin_group_id": "f726cc54-78cb-4c98-89a6-b8e4396afb98"
}
3 changes: 2 additions & 1 deletion cluster/terraform_aks_cluster/config/production.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@
},
"orchestrator_version": "1.26.10"
}
}
},
"admin_group_id": "5b0f84de-54a8-481a-8689-f3c226597259"
}
3 changes: 2 additions & 1 deletion cluster/terraform_aks_cluster/config/test.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@
},
"orchestrator_version": "1.26.10"
}
}
},
"admin_group_id": "21b2f2a6-231e-45cb-b624-d5521b820941"
}
22 changes: 22 additions & 0 deletions cluster/terraform_aks_cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@ resource "azurerm_kubernetes_cluster" "main" {
dns_prefix = local.dns_prefix
kubernetes_version = var.kubernetes_version

# Emit the Azure AD RBAC block only when var.enable_azure_RBAC is true:
# for_each iterates over a one-element list in that case and an empty list otherwise.
dynamic "azure_active_directory_role_based_access_control" {
for_each = var.enable_azure_RBAC ? [1] : []

content {
managed = true
azure_rbac_enabled = true
# Single admin group, taken from var.admin_group_id
admin_group_object_ids = [var.admin_group_id]
}
}
# Local accounts are disabled exactly when Azure RBAC is enabled.
# NOTE(review): presumably so access cannot bypass Entra ID - confirm.
local_account_disabled = var.enable_azure_RBAC

oms_agent {
log_analytics_workspace_id = azurerm_log_analytics_workspace.aks_system_logs.id
}
Expand Down Expand Up @@ -76,6 +87,17 @@ resource "azurerm_kubernetes_cluster" "clone" {
dns_prefix = "${azurerm_kubernetes_cluster.main.dns_prefix}-clone"
kubernetes_version = azurerm_kubernetes_cluster.main.kubernetes_version

# Emit the Azure AD RBAC block on the clone only when var.enable_azure_RBAC_clone is true:
# for_each iterates over a one-element list in that case and an empty list otherwise.
dynamic "azure_active_directory_role_based_access_control" {
for_each = var.enable_azure_RBAC_clone ? [1] : []

content {
managed = true
azure_rbac_enabled = true
# Same admin group variable as the main cluster (var.admin_group_id)
admin_group_object_ids = [var.admin_group_id]
}
}
# Local accounts on the clone are disabled exactly when its Azure RBAC flag is set.
local_account_disabled = var.enable_azure_RBAC_clone

default_node_pool {
name = azurerm_kubernetes_cluster.main.default_node_pool[0].name
node_count = azurerm_kubernetes_cluster.main.default_node_pool[0].node_count
Expand Down
11 changes: 11 additions & 0 deletions cluster/terraform_aks_cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,17 @@ variable "clone_cluster" {
type = bool
default = false
}
# Object id of the Entra ID group passed to admin_group_object_ids on the clusters.
# Declared as a string so a wrongly-typed value is rejected at plan time,
# matching the typed declaration style of the other variables in this file.
variable "admin_group_id" {
  type        = string
  description = "Object Id of the cluster admins Entra ID group"
}
# Toggles the Azure AD RBAC block and local_account_disabled on the main cluster.
# Explicit bool type keeps the conditional expressions that read it type-safe.
variable "enable_azure_RBAC" {
  type        = bool
  default     = false
  description = "Enable Azure AD RBAC on this cluster"
}
# Toggles the Azure AD RBAC block and local_account_disabled on the clone cluster.
# Explicit bool type keeps the conditional expressions that read it type-safe.
variable "enable_azure_RBAC_clone" {
  type        = bool
  default     = false
  description = "Enable Azure AD RBAC on the clone cluster"
}

locals {
backing_services_resource_group_name = "${var.resource_prefix}-tsc-${var.environment}-bs-rg"
Expand Down
23 changes: 23 additions & 0 deletions cluster/terraform_kubernetes/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions cluster/terraform_kubernetes/ingress_controller.tf
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,12 @@ resource "helm_release" "ingress-nginx-clone" {
version = helm_release.ingress-nginx.version

dynamic "set" {
# Exclude loadBalancerIP set to force clone to use dynamic Public IP for load balancer ingress
for_each = [for s in helm_release.ingress-nginx.set : s if s.name != "controller.service.loadBalancerIP"]
# Exclude the load balancer IP to force clone to use dynamic Public IP for load balancer ingress
for_each = [
for s in helm_release.ingress-nginx.set : s
if s.name != "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-ipv4"
&& s.name != "controller.service.annotations.service\\.beta\\.kubernetes\\.io/azure-load-balancer-resource-group"
]

content {
name = set.value["name"]
Expand Down
57 changes: 49 additions & 8 deletions cluster/terraform_kubernetes/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ terraform {
source = "hashicorp/azurerm"
version = "3.82.0"
}
environment = {
source = "EppO/environment"
version = "1.3.5"
}
helm = {
source = "hashicorp/helm"
version = "2.12.0"
Expand All @@ -18,6 +22,7 @@ terraform {
version = "2.0.4"
}
}

backend "azurerm" {
container_name = "tsc-tfstate"
}
Expand All @@ -42,35 +47,71 @@ data "azurerm_kubernetes_cluster" "clone" {

provider "kubernetes" {
host = data.azurerm_kubernetes_cluster.main.kube_config[0].host
client_certificate = base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].client_certificate)
client_key = base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].client_key)
cluster_ca_certificate = base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].cluster_ca_certificate)
client_certificate = local.rbac_enabled ? null : base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].client_certificate)
client_key = local.rbac_enabled ? null : base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].client_key)

dynamic "exec" {
for_each = local.rbac_enabled ? [1] : []
content {
api_version = "client.authentication.k8s.io/v1beta1"
command = "kubelogin"
args = local.kubelogin_args
}
}
}

provider "kubernetes" {
alias = "clone"
host = try(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].host, null)
client_certificate = try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].client_certificate), null)
client_key = try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].client_key), null)
cluster_ca_certificate = try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].cluster_ca_certificate), null)
client_certificate = local.rbac_enabled_clone ? null : try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].client_certificate), null)
client_key = local.rbac_enabled_clone ? null : try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].client_key), null)

dynamic "exec" {
for_each = local.rbac_enabled_clone ? [1] : []
content {
api_version = "client.authentication.k8s.io/v1beta1"
command = "kubelogin"
args = local.kubelogin_args
}
}
}

provider "helm" {
kubernetes {
host = data.azurerm_kubernetes_cluster.main.kube_config[0].host
client_key = base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].client_key)
client_certificate = base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].client_certificate)
cluster_ca_certificate = base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].cluster_ca_certificate)
client_certificate = local.rbac_enabled ? null : base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].client_certificate)
client_key = local.rbac_enabled ? null : base64decode(data.azurerm_kubernetes_cluster.main.kube_config[0].client_key)

dynamic "exec" {
for_each = local.rbac_enabled ? [1] : []
content {
api_version = "client.authentication.k8s.io/v1beta1"
command = "kubelogin"
args = local.kubelogin_args
}
}
}
}

provider "helm" {
alias = "clone"
kubernetes {
host = try(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].host, null)
client_key = try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].client_key), null)
client_certificate = try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].client_certificate), null)
cluster_ca_certificate = try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].cluster_ca_certificate), null)
client_certificate = local.rbac_enabled_clone ? null : try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].client_certificate), null)
client_key = local.rbac_enabled_clone ? null : try(base64decode(data.azurerm_kubernetes_cluster.clone[0].kube_config[0].client_key), null)

dynamic "exec" {
for_each = local.rbac_enabled_clone ? [1] : []
content {
api_version = "client.authentication.k8s.io/v1beta1"
command = "kubelogin"
args = local.kubelogin_args
}
}
}
}

Expand Down
33 changes: 33 additions & 0 deletions cluster/terraform_kubernetes/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ variable "lowpriority_app_replicas" {
default = 3
}

# Current azurerm client configuration; its tenant_id is passed to kubelogin
# as --tenant-id in local.kubelogin_spn_args.
data "azurerm_client_config" "current" {}

# Reads environment variables filtered on "GITHUB_ACTIONS". local.spn_authentication
# checks whether the resulting items map has a GITHUB_ACTIONS key to choose between
# the spn and azurecli kubelogin arguments.
data "environment_variables" "github_actions" {
filter = "GITHUB_ACTIONS"
}

locals {
cluster_name = (
Expand All @@ -86,4 +91,32 @@ locals {
welcome_app_namespace = "infra"
lowpriority_app_name = "lowpriority-app"
lowpriority_app_namespace = "infra"

rbac_enabled = length(data.azurerm_kubernetes_cluster.main.azure_active_directory_role_based_access_control) > 0
rbac_enabled_clone = try(
length(data.azurerm_kubernetes_cluster.clone[0].azure_active_directory_role_based_access_control) > 0,
false
)

kubelogin_spn_args = [
"get-token",
"--login",
"spn",
"--environment",
"AzurePublicCloud",
"--tenant-id",
data.azurerm_client_config.current.tenant_id,
"--server-id",
"6dae42f8-4368-4678-94ff-3960e28e3630" # See https://azure.github.io/kubelogin/concepts/aks.html
]
kubelogin_azurecli_args = [
"get-token",
"--login",
"azurecli",
"--server-id",
"6dae42f8-4368-4678-94ff-3960e28e3630"
]

spn_authentication = contains(keys(data.environment_variables.github_actions.items), "GITHUB_ACTIONS")
kubelogin_args = local.spn_authentication ? local.kubelogin_spn_args : local.kubelogin_azurecli_args
}
18 changes: 15 additions & 3 deletions documentation/developer-onboarding.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,32 @@ Documentation for the Teacher services application developers
- [kubectl](https://kubernetes.io/docs/reference/kubectl/)
- [terraform](https://technical-guidance.education.gov.uk/infrastructure/dev-tools/#installation)
- [jq](https://stedolan.github.io/jq/)
- [kubelogin](https://azure.github.io/kubelogin/install.html)

## How to request access?
- There is an assumption that you have been given a [CIP account](https://technical-guidance.education.gov.uk/infrastructure/hosting/azure-cip/#onboarding-users). For BYOD users, please make sure to request a digitalauth account.
- You can then request access to the S189 subscriptions by contacting the Teacher Services Infrastructure team
- This gives you access to the 3 s189 subscriptions:
- s189-teacher-services-cloud-development: infra team development work
- s189-teacher-services-cloud-test: contains the **test cluster**
- s189-teacher-services-cloud-production: contains the **production cluster**
- s189-teacher-services-cloud-test: contains the [test cluster](#test-cluster)
- s189-teacher-services-cloud-production: contains the [production cluster](#production-cluster)

## How to request and approve PIM?

> [!IMPORTANT]
> The clusters are soon to be migrated to Azure RBAC, which will change the process to access them. This documentation will be updated as we go along.

### Test and Production clusters
- Microsoft Entra Privileged Identity Management (PIM) allows gaining new user permissions in the s189 subscriptions. This is required to access the cluster and troubleshoot application or database. **We must be very cautious** as this gives access to all the other services deployed to s189 subscriptions.
- Once added to the s189 subscription, you can PIM yourself to the *test* subscription. See the [technical guidance PIM section](https://technical-guidance.education.gov.uk/infrastructure/hosting/azure-cip/#privileged-identity-management-pim-requests).
- You can request PIM to the *production* subscription, however this will need to be approved by members of the Managers group
- As a manager, you should receive an email with the user request. You can also approve PIM requests by going to [Privileged Identity Management](https://portal.azure.com/?feature.msaljs=true#view/Microsoft_Azure_PIMCommon/CommonMenuBlade/~/quickStart) (PIM) in the Azure portal and selecting Approve request, Azure resources, select the user and approve the request.

### Future process
Use [PIM for groups](https://learn.microsoft.com/en-us/entra/id-governance/privileged-identity-management/groups-activate-roles) to elevate your access. Two groups are available:
- `s189 AKS admin test PIM`: access to the test cluster, self-approved
- `s189 AKS admin production PIM`: access to the production cluster, must be approved by another team member

## Which clusters can I use?
The infra team maintains several AKS clusters. Two are usable by developers to deploy their services:

Expand All @@ -39,8 +50,9 @@ Used for all your production and production-like environments, especially if the
- Subscription: `s189-teacher-services-cloud-production`

## How to access the cluster?
- If not present in your repository, set up the `get-cluster-credentials` make command from the template [Makefile](https://github.com/DFE-Digital/teacher-services-cloud/blob/main/templates/new_service/Makefile)
- If not present in your repository, set up the `get-cluster-credentials` make command from the template [Makefile](https://github.com/DFE-Digital/teacher-services-cloud/blob/main/templates/new_service/Makefile). For Azure RBAC clusters, it must include the *kubelogin convert-kubeconfig* command.
- Raise a [PIM request](#how-to-request-and-approve-pim) for either the test or production subscription
- Log in to the Azure command line using `az login` or `az login --use-device-code`
- Run `make <environment> get-cluster-credentials`
- This configures the `kubectl` context so you can run commands against this cluster. Be careful as the context may last even after the PIM has expired.

Expand Down
8 changes: 4 additions & 4 deletions documentation/rebuild-cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Then we use scripts to export the resources from each namespace from one cluster
- Install prerequisites:
- [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl)
- [jq](https://stedolan.github.io/jq/)
- [kubectl-neat](https://github.com/itaysk/kubectl-neat)
- [kubectl krew](https://krew.sigs.k8s.io/docs/user-guide/setup/install/) and [kubectl-neat](https://github.com/itaysk/kubectl-neat)
- Inform the dev teams to stop making changes. This process should be done outside of business hours.
- Raise the required PIM requests
- Determine the *applications domain* of this cluster. e.g.:
Expand Down Expand Up @@ -44,18 +44,18 @@ Since the applications domain points to the main cluster, you won't be able to t
- Add the cloned cluster ingress IP for the applications domain (see [Preparation](#preparation)) to your [hosts file](https://en.wikipedia.org/wiki/Hosts_(file)) (see this simple [tutorial](https://www.nublue.co.uk/guides/edit-hosts-file/)). e.g.:

```
51.52.53.54 *.test.teacherservices.cloud
51.52.53.54 webapplication123.test.teacherservices.cloud
```
- Validate webapps on the applications domain
- Restore your hosts file

## Route traffic to the cloned cluster
- Change the applications domain record in the DNS zone manually
- Change the applications domain record (see [preparation](#preparation)) in the DNS zone manually
- Wait at least 5 min for TTL to expire

## Rebuild the first cluster
- Wait for traffic to stop on main cluster. You can now make changes on the main cluster without impacting users.
- Delete the pod disruption budgets (check with `kubectl get pdb -A`)
- Delete the non-system pod disruption budgets (check with `kubectl get pdb -A`)
- Make the required code changes in terraform
- Run terraform-plan to check the changes and make sure only the main cluster is updated. Most of the cloned configuration is referenced from the main cluster, so if the value changes on the main cluster, it would also impact the cloned cluster and force a rebuild, which would disrupt users. If that is the case, hardcode the original value temporarily for the cloned cluster. For instance, if you want to change the default node pool vm_size from "Standard_D2_v2", change the cloned cluster from:

Expand Down
2 changes: 1 addition & 1 deletion scripts/export_aks_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
set -eu
set -o pipefail

NAMESPACE_BLACKLIST="default kube-node-lease kube-public kube-system"
NAMESPACE_BLACKLIST="default kube-node-lease kube-public kube-system gatekeeper-system"

echo "Fetching custom namespaces..."
all_namespaces=$(kubectl get namespaces -o name)
Expand Down
1 change: 1 addition & 0 deletions templates/new_service/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ production-cluster:

# Fetch the kubeconfig entry for CLUSTER_NAME in CLUSTER_RESOURCE_GROUP_NAME,
# then convert it with kubelogin. The kubelogin login mode is "spn" when
# GITHUB_ACTIONS is non-empty and "azurecli" otherwise.
get-cluster-credentials: set-azure-account
az aks get-credentials --overwrite-existing -g ${CLUSTER_RESOURCE_GROUP_NAME} -n ${CLUSTER_NAME}
kubelogin convert-kubeconfig -l $(if ${GITHUB_ACTIONS},spn,azurecli)

bin/konduit.sh:
curl -s https://raw.githubusercontent.com/DFE-Digital/teacher-services-cloud/main/scripts/konduit.sh -o bin/konduit.sh \
Expand Down
Loading
Loading