Skip to content

Commit 76f04a4

Browse files
authored
Merge pull request #3 from truefoundry/karpenter-sqs
Added support for interruption handling in karpenter
2 parents 525cdcc + 285f3bf commit 76f04a4

File tree

6 files changed

+128
-5
lines changed

6 files changed

+128
-5
lines changed

README.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,20 @@ Truefoundry AWS Karpenter Module
1919

2020
| Name | Source | Version |
2121
|------|--------|---------|
22-
| <a name="module_karpenter_irsa_role"></a> [karpenter\_irsa\_role](#module\_karpenter\_irsa\_role) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | 5.27.0 |
22+
| <a name="module_karpenter_irsa_role"></a> [karpenter\_irsa\_role](#module\_karpenter\_irsa\_role) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | 5.32.0 |
2323

2424
## Resources
2525

2626
| Name | Type |
2727
|------|------|
28+
| [aws_cloudwatch_event_rule.this](https://registry.terraform.io/providers/hashicorp/aws/5.17.0/docs/resources/cloudwatch_event_rule) | resource |
29+
| [aws_cloudwatch_event_target.this](https://registry.terraform.io/providers/hashicorp/aws/5.17.0/docs/resources/cloudwatch_event_target) | resource |
2830
| [aws_iam_instance_profile.karpenter](https://registry.terraform.io/providers/hashicorp/aws/5.17.0/docs/resources/iam_instance_profile) | resource |
31+
| [aws_iam_policy.sqs](https://registry.terraform.io/providers/hashicorp/aws/5.17.0/docs/resources/iam_policy) | resource |
32+
| [aws_sqs_queue.karpenter](https://registry.terraform.io/providers/hashicorp/aws/5.17.0/docs/resources/sqs_queue) | resource |
33+
| [aws_sqs_queue_policy.karpenter](https://registry.terraform.io/providers/hashicorp/aws/5.17.0/docs/resources/sqs_queue_policy) | resource |
34+
| [aws_iam_policy_document.node_termination_queue](https://registry.terraform.io/providers/hashicorp/aws/5.17.0/docs/data-sources/iam_policy_document) | data source |
35+
| [aws_iam_policy_document.sqs](https://registry.terraform.io/providers/hashicorp/aws/5.17.0/docs/data-sources/iam_policy_document) | data source |
2936

3037
## Inputs
3138

@@ -36,13 +43,16 @@ Truefoundry AWS Karpenter Module
3643
| <a name="input_controller_nodegroup_name"></a> [controller\_nodegroup\_name](#input\_controller\_nodegroup\_name) | The initial nodegroup name | `string` | n/a | yes |
3744
| <a name="input_k8s_service_account_name"></a> [k8s\_service\_account\_name](#input\_k8s\_service\_account\_name) | The k8s karpenter service account name | `string` | n/a | yes |
3845
| <a name="input_k8s_service_account_namespace"></a> [k8s\_service\_account\_namespace](#input\_k8s\_service\_account\_namespace) | The k8s karpenter namespace | `string` | n/a | yes |
46+
| <a name="input_message_retention_seconds"></a> [message\_retention\_seconds](#input\_message\_retention\_seconds) | Message retention in seconds for SQS queue | `number` | `300` | no |
3947
| <a name="input_oidc_provider_arn"></a> [oidc\_provider\_arn](#input\_oidc\_provider\_arn) | The OIDC provider ARN of the EKS cluster | `string` | n/a | yes |
48+
| <a name="input_sqs_enable_encryption"></a> [sqs\_enable\_encryption](#input\_sqs\_enable\_encryption) | Enable Server side encryption for SQS | `bool` | `true` | no |
4049
| <a name="input_tags"></a> [tags](#input\_tags) | AWS Tags common to all the resources created | `map(string)` | `{}` | no |
4150

4251
## Outputs
4352

4453
| Name | Description |
4554
|------|-------------|
46-
| <a name="output_karpenter_instance_profile_id"></a> [karpenter\_instance\_profile\_id](#output\_karpenter\_instance\_profile\_id) | n/a |
47-
| <a name="output_karpenter_role_arn"></a> [karpenter\_role\_arn](#output\_karpenter\_role\_arn) | n/a |
55+
| <a name="output_karpenter_instance_profile_id"></a> [karpenter\_instance\_profile\_id](#output\_karpenter\_instance\_profile\_id) | Karpenter instance profile ID |
56+
| <a name="output_karpenter_role_arn"></a> [karpenter\_role\_arn](#output\_karpenter\_role\_arn) | Karpenter role ARN |
57+
| <a name="output_karpenter_sqs_name"></a> [karpenter\_sqs\_name](#output\_karpenter\_sqs\_name) | Name of the SQS queue for interruption handling |
4858
<!-- END_TF_DOCS -->

locals.tf

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,38 @@ locals {
77
},
88
var.tags
99
)
10-
}
10+
events = {
11+
health_event = {
12+
name = "HealthEvent"
13+
description = "Karpenter interrupt - AWS health event"
14+
event_pattern = {
15+
source = ["aws.health"]
16+
detail-type = ["AWS Health Event"]
17+
}
18+
}
19+
spot_interupt = {
20+
name = "SpotInterrupt"
21+
description = "Karpenter interrupt - EC2 spot instance interruption warning"
22+
event_pattern = {
23+
source = ["aws.ec2"]
24+
detail-type = ["EC2 Spot Instance Interruption Warning"]
25+
}
26+
}
27+
instance_rebalance = {
28+
name = "InstanceRebalance"
29+
description = "Karpenter interrupt - EC2 instance rebalance recommendation"
30+
event_pattern = {
31+
source = ["aws.ec2"]
32+
detail-type = ["EC2 Instance Rebalance Recommendation"]
33+
}
34+
}
35+
instance_state_change = {
36+
name = "InstanceStateChange"
37+
description = "Karpenter interrupt - EC2 instance state-change notification"
38+
event_pattern = {
39+
source = ["aws.ec2"]
40+
detail-type = ["EC2 Instance State-change Notification"]
41+
}
42+
}
43+
}
44+
}

main.tf

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# From https://github.com/terraform-aws-modules/terraform-aws-eks/blob/master/examples/irsa/irsa.tf
2+
23
module "karpenter_irsa_role" {
34
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
4-
version = "5.27.0"
5+
version = "5.32.0"
56
role_name = "${var.cluster_name}-karpenter"
67
attach_karpenter_controller_policy = true
78

@@ -10,6 +11,10 @@ module "karpenter_irsa_role" {
1011

1112
attach_vpc_cni_policy = true
1213
vpc_cni_enable_ipv4 = true
14+
15+
role_policy_arns = {
16+
"sqs_policy" = aws_iam_policy.sqs.arn
17+
}
1318

1419
oidc_providers = {
1520
main = {

output.tf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
# ARN of the IAM role created by the karpenter_irsa_role module.
output "karpenter_role_arn" {
  description = "Karpenter role ARN"
  value       = module.karpenter_irsa_role.iam_role_arn
}
45

56
# ID of the aws_iam_instance_profile.karpenter resource.
output "karpenter_instance_profile_id" {
  description = "Karpenter instance profile ID"
  value       = aws_iam_instance_profile.karpenter.id
}
10+
11+
# Name of the aws_sqs_queue.karpenter queue, exported for callers of this module.
output "karpenter_sqs_name" {
  description = "Name of the SQS queue for interruption handling"
  value       = aws_sqs_queue.karpenter.name
}

sqs.tf

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# SQS queue that the EventBridge targets in this file deliver interruption
# events to. It is named after the cluster.
resource "aws_sqs_queue" "karpenter" {
  name = "${var.cluster_name}-karpenter"

  # Retention window and the SQS-managed SSE toggle are both caller-configurable.
  message_retention_seconds = var.message_retention_seconds
  sqs_managed_sse_enabled   = var.sqs_enable_encryption

  tags = local.tags
}
7+
8+
# Consumer-side permissions on the interruption queue: look up, inspect,
# receive and delete messages. Scoped to the single queue ARN.
data "aws_iam_policy_document" "sqs" {
  statement {
    effect    = "Allow"
    resources = [aws_sqs_queue.karpenter.arn]

    actions = [
      "sqs:DeleteMessage",
      "sqs:GetQueueUrl",
      "sqs:GetQueueAttributes",
      "sqs:ReceiveMessage",
    ]
  }
}
15+
16+
# Managed IAM policy carrying the queue-consumer permissions rendered by
# data.aws_iam_policy_document.sqs.
resource "aws_iam_policy" "sqs" {
  # name_prefix (not name): the final policy name gets a generated suffix.
  name_prefix = "${var.cluster_name}-karpenter-access-to-sqs"
  description = "Access policy for karpenter to access SQS for ${var.cluster_name}"

  policy = data.aws_iam_policy_document.sqs.json

  tags = local.tags
}
22+
23+
# Attaches the resource policy (who may send messages) to the interruption queue.
resource "aws_sqs_queue_policy" "karpenter" {
  queue_url = aws_sqs_queue.karpenter.url
  policy    = data.aws_iam_policy_document.node_termination_queue.json
}
27+
28+
# Resource policy for the interruption queue: grants sqs:SendMessage on the
# queue to the listed AWS service principals.
data "aws_iam_policy_document" "node_termination_queue" {
  statement {
    sid       = "SQSWrite"
    actions   = ["sqs:SendMessage"]
    resources = [aws_sqs_queue.karpenter.arn]

    principals {
      type = "Service"
      # events.amazonaws.com is the EventBridge service that owns the rules
      # targeting this queue.
      # NOTE(review): sqs.amazonaws.com presumably mirrors the upstream
      # Karpenter reference policy; confirm it is still required.
      identifiers = ["events.amazonaws.com", "sqs.amazonaws.com"]
    }
  }
}
39+
40+
# One EventBridge rule per entry in local.events. Each entry supplies the
# rule's name fragment, description and event pattern.
resource "aws_cloudwatch_event_rule" "this" {
  # local.events is already keyed by event identifier, so it is iterated
  # directly; the instance keys match the previous identity comprehension.
  for_each = local.events

  # Truncate the prefix to 37 characters.
  # NOTE(review): presumably to stay within the provider's name_prefix length
  # limit for event rules; confirm against the provider documentation.
  name_prefix   = substr("${var.cluster_name}-${each.value.name}-", 0, 37)
  description   = each.value.description
  event_pattern = jsonencode(each.value.event_pattern)

  tags = local.tags
}
49+
50+
# Routes every interruption rule to the Karpenter SQS queue: one target per
# rule, keyed identically to aws_cloudwatch_event_rule.this.
resource "aws_cloudwatch_event_target" "this" {
  # Iterate local.events directly; the keys are the same ones used by the
  # rules, so each.key resolves to the matching rule instance.
  for_each = local.events

  rule = aws_cloudwatch_event_rule.this[each.key].name
  # The same target_id is reused for every rule.
  target_id = "KarpenterInterruptionQueueTarget"
  arn       = aws_sqs_queue.karpenter.arn
}

variables.tf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,17 @@ variable "controller_nodegroup_name" {
2727
description = "The initial nodegroup name"
2828
type = string
2929
}
30+
# Feeds sqs_managed_sse_enabled on the interruption queue; on by default.
variable "sqs_enable_encryption" {
  type        = bool
  default     = true
  description = "Enable Server side encryption for SQS"
}
35+
36+
# How long the interruption queue keeps an unconsumed message, in seconds.
# Passed straight to aws_sqs_queue.karpenter.message_retention_seconds.
variable "message_retention_seconds" {
  description = "Message retention in seconds for SQS queue"
  type        = number
  default     = 300

  # Reject values outside the range SQS accepts, so a typo fails at plan time
  # instead of during apply. coalesce() keeps an explicit null from tripping
  # the comparison; null is passed through to the provider unchanged.
  validation {
    condition = (
      coalesce(var.message_retention_seconds, 300) >= 60 &&
      coalesce(var.message_retention_seconds, 300) <= 1209600
    )
    error_message = "The message_retention_seconds value must be between 60 (1 minute) and 1209600 (14 days)."
  }
}
3041

3142
variable "tags" {
3243
type = map(string)

0 commit comments

Comments
 (0)