Skip to content

Commit 906be52

Browse files
authored
Refactored the Terraform directory structures. (#1046)
* Reorganized Terraform scripts by moving the subdirectories. * Updated the cluster type from flexible to PSC-based.
1 parent 2758580 commit 906be52

File tree

37 files changed

+1184
-91
lines changed

37 files changed

+1184
-91
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
# Example root configuration: declares untyped input variables and forwards
# each of them unchanged to the shared module located at ../../module.
variable "project_id" {}
22
variable "resource_name_prefix" {}
3+
variable "node_pool_prefix" {}
34
variable "region" {}
45
variable "tpu_node_pools" {}
56
variable "maintenance_interval" {}
7+
variable "is_tpu_node_private" {}
68

79

810
module "tpu-gke" {
911
source = "../../module"
1012
project_id = var.project_id
1113
resource_name_prefix = var.resource_name_prefix
14+
node_pool_prefix = var.node_pool_prefix
1215
region = var.region
1316
tpu_node_pools = var.tpu_node_pools
1417
maintenance_interval = var.maintenance_interval
18+
is_tpu_node_private = var.is_tpu_node_private
1519
}

tools/kubernetes/terraform/examples/v5e/outputs.tf renamed to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/examples/v5e/outputs.tf

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@ output "kubernetes_cluster_name" {
1313
description = "GKE Cluster Name"
1414
}
1515

16-
output "kubernetes_cluster_host" {
17-
value = module.tpu-gke.kubernetes_cluster_host
18-
description = "GKE Cluster Host"
19-
}
20-
2116
# Re-exports the placement policy names reported by the tpu-gke module.
output "placement_policy_names" {
2217
value = module.tpu-gke.placement_policy_names
2318
description = "GKE TPU Placement Policy Names"
2419
}
20+
21+
# Echoes the is_tpu_node_private input variable back as an output.
output "is_tpu_node_private" {
22+
value = var.is_tpu_node_private
23+
description = "whether we want to make TPU node private"
24+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Sample variable values describing two TPU node pool entries with identical
# settings (same zone, machine type, topology, policy and disk configuration).
# NOTE(review): "project-id" looks like a placeholder — replace before applying.
project_id = "project-id"
2+
resource_name_prefix = "tpu-v5e-test"
3+
node_pool_prefix = "rp1"
4+
region = "us-east5"
5+
is_tpu_node_private = false
6+
tpu_node_pools = [{
7+
zone = "us-east5-b"
8+
node_count = 32
9+
machine_type = "ct5lp-hightpu-4t"
10+
topology = "8x16"
11+
policy = "sb-compact-rp1"
12+
disk_type = "pd-balanced"
13+
disk_size_gb = 120
14+
}, {
15+
zone = "us-east5-b"
16+
node_count = 32
17+
machine_type = "ct5lp-hightpu-4t"
18+
topology = "8x16"
19+
policy = "sb-compact-rp1"
20+
disk_type = "pd-balanced"
21+
disk_size_gb = 120
22+
}]
23+
maintenance_interval = "PERIODIC"
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/**
2+
* Copyright 2023 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
# GKE cluster
18+
# Looks up the GKE versions available in var.region that match the "1.27." prefix.
# NOTE(review): nothing in the visible part of this module references
# data.google_container_engine_versions.gke_version — confirm it is still needed.
data "google_container_engine_versions" "gke_version" {
19+
location = var.region
20+
version_prefix = "1.27."
21+
}
22+
23+
# Default project and region for the "google" provider.
provider "google" {
24+
project = var.project_id
25+
region = var.region
26+
}
27+
28+
# Separately Managed Node Pool
29+
# Creates one TPU node pool per entry in var.tpu_node_pools and attaches it to
# the cluster named "<resource_name_prefix>-gke-cluster". The cluster is
# referenced by name only; it is not created by this resource.
# NOTE(review): the `topology` field of each tpu_node_pools entry is not read
# anywhere in this resource — confirm that is intentional.
resource "google_container_node_pool" "multihost_tpu" {
30+
count = length(var.tpu_node_pools) # one node pool per list entry
31+
name = "${var.resource_name_prefix}-gke-${var.node_pool_prefix}-${count.index}"
32+
provider = google-beta
33+
project = var.project_id
34+
location = var.region
35+
node_locations = [var.tpu_node_pools[count.index].zone] # each pool is placed in the single zone given by its entry
36+
cluster = "${var.resource_name_prefix}-gke-cluster"
37+
38+
initial_node_count = var.tpu_node_pools[count.index].node_count
39+
40+
management {
41+
auto_upgrade = false
42+
}
43+
44+
node_config {
45+
oauth_scopes = [
46+
"https://www.googleapis.com/auth/logging.write",
47+
"https://www.googleapis.com/auth/monitoring",
48+
"https://www.googleapis.com/auth/cloud-platform",
49+
]
50+
host_maintenance_policy {
51+
maintenance_interval = var.maintenance_interval
52+
}
53+
labels = {
54+
env = var.project_id
55+
}
56+
gvnic {
57+
enabled = true
58+
}
59+
gcfs_config {
60+
enabled = true
61+
}
62+
63+
image_type = "COS_CONTAINERD"
64+
machine_type = var.tpu_node_pools[count.index].machine_type
65+
disk_type = var.tpu_node_pools[count.index].disk_type # declared optional in the module's variable type
66+
disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb # declared optional in the module's variable type
67+
tags = ["gke-node"]
68+
metadata = {
69+
disable-legacy-endpoints = "true"
70+
}
71+
}
72+
placement_policy {
73+
type = "COMPACT"
74+
policy_name = var.tpu_node_pools[count.index].policy # policy name comes from the entry's "policy" field
75+
}
76+
77+
network_config {
78+
enable_private_nodes = var.is_tpu_node_private
79+
}
80+
}

tools/kubernetes/terraform/module/outputs.tf renamed to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/outputs.tf

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,18 @@ output "project_id" {
99
}
1010

1111
# Name of the GKE cluster the TPU node pools are attached to.
# Every pool is given the same cluster name, so the distinct list holds at most
# one element; one() yields null instead of failing with an invalid index when
# var.tpu_node_pools is empty and no node pool instances exist.
output "kubernetes_cluster_name" {
12-
value = google_container_cluster.tpu_cluster.name
12+
value = one(distinct(google_container_node_pool.multihost_tpu[*].cluster))
1313
description = "GKE Cluster Name"
1414
}
1515

16-
output "kubernetes_cluster_host" {
17-
value = google_container_cluster.tpu_cluster.endpoint
18-
description = "GKE Cluster Host"
19-
}
20-
2116
# Placement policy name of every TPU node pool instance, as a flat list.
output "placement_policy_names" {
22-
value = flatten([
17+
value = flatten([
2318
google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name
2419
])
2520
description = "GKE TPU Placement Policy Names"
2621
}
22+
23+
# Echoes the is_tpu_node_private input variable back as an output.
output "is_tpu_node_private" {
24+
value = var.is_tpu_node_private
25+
description = "whether we want to make TPU node private"
26+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Sample variable values describing a single TPU node pool entry.
# NOTE(review): "project-id" looks like a placeholder — replace before applying.
project_id = "project-id"
2+
resource_name_prefix = "tpu-test"
3+
region = "us-east5"
4+
tpu_node_pools = [{
5+
zone = "us-east5-b"
6+
node_count = 32
7+
machine_type = "ct5lp-hightpu-4t"
8+
topology = "8x16"
9+
policy = "sb-compact-rp1"
10+
}]
11+
maintenance_interval = "AS_NEEDED"

tools/kubernetes/terraform/module/variables.tf renamed to tools/kubernetes/terraform/batching_with_compact_placement/add_node_pool/module/variables.tf

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ variable "resource_name_prefix" {
2727
description = "prefix for all the resouce naming"
2828
}
2929

30+
# Name segment placed between "-gke-" and the pool index in each TPU node
# pool's name; defaults to an empty string.
variable "node_pool_prefix" {
31+
default = ""
32+
description = "prefix for the TPU node pool naming"
type = string
33+
}
34+
3035
variable "tpu_node_pools" {
3136
description = "tpu podslice config"
3237
type = list(object({
@@ -35,10 +40,17 @@ variable "tpu_node_pools" {
3540
machine_type = string,
3641
topology = string,
3742
policy = string,
43+
disk_type = optional(string),
44+
disk_size_gb = optional(number),
3845
}))
3946
}
4047

48+
# Controls enable_private_nodes on the TPU node pools. Typed as bool so a
# non-boolean value is rejected at variable validation rather than at the provider.
variable "is_tpu_node_private" {
49+
description = "whether we want to make TPU node private"
50+
default = false
type = bool
51+
}
52+
4153
# Value passed to host_maintenance_policy.maintenance_interval on the TPU node
# pools. The sample tfvars in this change use "AS_NEEDED" and "PERIODIC".
variable "maintenance_interval" {
42-
default = "AS_NEEDED"
54+
default = "AS_NEEDED"
4355
description = "maintenance interval for TPU machines."
4456
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Example root configuration: declares untyped input variables and forwards
# each of them unchanged to the module located at ../../module.
variable "project_id" {}
2+
variable "resource_name_prefix" {}
3+
variable "region" {}
4+
variable "cpu_node_pool" {}
5+
variable "authorized_cidr_blocks" {}
6+
variable "is_cpu_node_private" {}
7+
8+
9+
module "tpu-gke" {
10+
source = "../../module"
11+
project_id = var.project_id
12+
resource_name_prefix = var.resource_name_prefix
13+
region = var.region
14+
cpu_node_pool = var.cpu_node_pool
15+
is_cpu_node_private = var.is_cpu_node_private
16+
authorized_cidr_blocks = var.authorized_cidr_blocks
17+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Outputs of the example: the region, project, CIDR-block and private-node
# values echo input variables; the cluster name is read from the tpu-gke module.
output "region" {
2+
value = var.region
3+
description = "GCloud Region"
4+
}
5+
6+
output "project_id" {
7+
value = var.project_id
8+
description = "GCloud Project ID"
9+
}
10+
11+
output "kubernetes_cluster_name" {
12+
value = module.tpu-gke.kubernetes_cluster_name
13+
description = "GKE Cluster Name"
14+
}
15+
16+
output "authorized_cidr_blocks" {
17+
value = var.authorized_cidr_blocks
18+
description = "Cluster allowed cidr blocks "
19+
}
20+
21+
output "is_cpu_node_private" {
22+
value = var.is_cpu_node_private
23+
description = "whether we want to make CPU node private"
24+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Sample variable values with one CPU node pool definition spanning three zones.
# NOTE(review): "project-id" looks like a placeholder — replace before applying.
# NOTE(review): if these values feed the cluster module in this change, an empty
# authorized_cidr_blocks list combined with gcp_public_cidrs_access_enabled = false
# leaves no authorized CIDR blocks — confirm how the control plane is meant to be reached.
project_id = "project-id"
2+
resource_name_prefix = "tpu-v5e-test"
3+
region = "us-east5"
4+
authorized_cidr_blocks = []
5+
is_cpu_node_private = false
6+
cpu_node_pool = {
7+
zone = ["us-east5-a", "us-east5-b", "us-east5-c"]
8+
machine_type = "n2-standard-8",
9+
initial_node_count_per_zone = 1,
10+
min_node_count_per_zone = 1,
11+
max_node_count_per_zone = 30,
12+
}
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/**
2+
* Copyright 2023 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
# GKE cluster
18+
# Looks up the GKE versions available in var.region that match the "1.27." prefix.
# NOTE(review): no resource in the visible part of this file references
# data.google_container_engine_versions.gke_version — confirm it is still needed.
data "google_container_engine_versions" "gke_version" {
19+
location = var.region
20+
version_prefix = "1.27."
21+
}
22+
23+
# Default project and region for the "google" provider.
provider "google" {
24+
project = var.project_id
25+
region = var.region
26+
}
27+
28+
# VPC
29+
# Dedicated VPC for the cluster. Automatic subnet creation is turned off
# (boolean literal rather than the quoted string "false"); the one subnet
# used by the cluster is declared explicitly in the resource that follows.
resource "google_compute_network" "vpc" {
30+
name = "${var.resource_name_prefix}-vpc"
31+
auto_create_subnetworks = false
32+
}
33+
34+
# Subnet
# Single subnet in var.region with a fixed 10.10.0.0/19 primary range.
resource "google_compute_subnetwork" "subnet" {
36+
name = "${var.resource_name_prefix}-subnet"
37+
region = var.region
38+
network = google_compute_network.vpc.name
39+
ip_cidr_range = "10.10.0.0/19"
40+
}
41+
42+
# GKE cluster named "<resource_name_prefix>-gke-cluster", located in var.region
# and attached to the VPC and subnet declared in this file. The default node
# pool is removed after creation; node pools are managed as separate resources.
resource "google_container_cluster" "tpu_cluster" {
43+
name = "${var.resource_name_prefix}-gke-cluster"
44+
location = var.region
45+
46+
# We can't create a cluster with no node pool defined, but we want to only use
47+
# separately managed node pools. So we create the smallest possible default
48+
# node pool and immediately delete it.
49+
remove_default_node_pool = true
50+
initial_node_count = 1
51+
networking_mode = "VPC_NATIVE"
52+
ip_allocation_policy {
53+
cluster_ipv4_cidr_block = "/14" # netmask only, no base address — presumably lets GKE choose the range; verify against provider docs
54+
services_ipv4_cidr_block = "/20" # netmask only, as above
55+
}
56+
default_max_pods_per_node = 15
57+
58+
release_channel {
59+
channel = "UNSPECIFIED"
60+
}
61+
62+
network = google_compute_network.vpc.name
63+
subnetwork = google_compute_subnetwork.subnet.name
64+
logging_service = "logging.googleapis.com/kubernetes"
65+
monitoring_service = "monitoring.googleapis.com/kubernetes"
66+
67+
master_authorized_networks_config {
68+
gcp_public_cidrs_access_enabled = false
69+
70+
dynamic "cidr_blocks" { # emits one cidr_blocks entry per element of var.authorized_cidr_blocks
71+
for_each = var.authorized_cidr_blocks
72+
content {
73+
cidr_block = cidr_blocks.value
74+
display_name = "cidr-blocks-group-${cidr_blocks.key}"
75+
}
76+
}
77+
}
78+
79+
// Needs to be false when creating a PSC-based GKE cluster.
80+
// After that, set as true to disable public endpoint of cluster master.
81+
private_cluster_config {
82+
enable_private_endpoint = false
83+
}
84+
85+
timeouts {
86+
create = "120m"
87+
update = "120m"
88+
}
89+
}
90+
91+
# Autoscaled CPU node pool named "cpu-node-pool", attached to the tpu_cluster
# resource and spread over the zones listed in var.cpu_node_pool.zone. Node
# counts are read from the *_per_zone fields of var.cpu_node_pool.
resource "google_container_node_pool" "cpu_node_pool" {
92+
provider = google-beta
93+
project = var.project_id
94+
name = "cpu-node-pool"
95+
location = var.region
96+
node_locations = var.cpu_node_pool.zone
97+
cluster = google_container_cluster.tpu_cluster.name
98+
initial_node_count = var.cpu_node_pool.initial_node_count_per_zone
99+
autoscaling {
100+
min_node_count = var.cpu_node_pool.min_node_count_per_zone
101+
max_node_count = var.cpu_node_pool.max_node_count_per_zone
102+
}
103+
max_pods_per_node = 63 # differs from the cluster's default_max_pods_per_node of 15
104+
node_config {
105+
oauth_scopes = [
106+
"https://www.googleapis.com/auth/cloud-platform"
107+
]
108+
machine_type = var.cpu_node_pool.machine_type
109+
110+
metadata = {
111+
disable-legacy-endpoints = "true"
112+
}
113+
gcfs_config {
114+
enabled = true
115+
}
116+
}
117+
118+
network_config {
119+
enable_private_nodes = var.is_cpu_node_private
120+
}
121+
}

0 commit comments

Comments
 (0)