Skip to content

Commit 1545b4e

Browse files
committed
Add support for autopilot
1 parent 23161f5 commit 1545b4e

File tree

14 files changed

+230
-13
lines changed

14 files changed

+230
-13
lines changed

platform/main.tf

+17-5
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ data "google_client_config" "provider" {}
1717
data "google_container_cluster" "ml_cluster" {
1818
name = var.cluster_name
1919
location = var.region
20-
depends_on = [module.gke_cluster]
20+
depends_on = [module.gke_autopilot, module.gke_standard]
2121
}
2222

2323
provider "google" {
@@ -52,26 +52,38 @@ provider "helm" {
5252
}
5353
}
5454

55-
module "gke_cluster" {
56-
source = "./modules/gke_cluster"
55+
module "gke_autopilot" {
56+
source = "./modules/gke_autopilot"
5757

5858
project_id = var.project_id
5959
region = var.region
6060
cluster_name = var.cluster_name
61+
enable_autopilot = var.enable_autopilot
62+
}
63+
64+
65+
module "gke_standard" {
66+
source = "./modules/gke_standard"
67+
68+
project_id = var.project_id
69+
region = var.region
70+
cluster_name = var.cluster_name
71+
enable_autopilot = var.enable_autopilot
6172
}
6273

6374
module "kubernetes" {
6475
source = "./modules/kubernetes"
6576

66-
depends_on = [module.gke_cluster]
77+
# Wait for whichever cluster flavour is enabled (Autopilot or Standard), as the cluster data source and the kuberay module do.
depends_on = [module.gke_autopilot, module.gke_standard]
6778
region = var.region
6879
cluster_name = var.cluster_name
80+
enable_autopilot = var.enable_autopilot
6981
}
7082

7183
module "kuberay" {
7284
source = "./modules/kuberay"
7385

74-
depends_on = [module.kubernetes]
86+
depends_on = [module.gke_autopilot, module.gke_standard]
7587
region = var.region
7688
cluster_name = var.cluster_name
7789
}
+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
provider "google" {
16+
project = var.project_id
17+
region = var.region
18+
}
19+
20+
21+
# GKE cluster
22+
resource "google_container_cluster" "ml_cluster" {
23+
name = var.cluster_name
24+
location = var.region
25+
# Create the Autopilot cluster only when the flag is set (same bare-boolean form the module's outputs use).
count = var.enable_autopilot ? 1 : 0
26+
27+
initial_node_count = 1
28+
29+
logging_config {
30+
enable_components = ["SYSTEM_COMPONENTS", "WORKLOADS"]
31+
}
32+
33+
monitoring_config {
34+
enable_components = ["SYSTEM_COMPONENTS"]
35+
managed_prometheus {
36+
# Use a boolean literal rather than relying on string-to-bool conversion.
enabled = true
37+
}
38+
}
39+
ip_allocation_policy {
40+
cluster_ipv4_cidr_block = ""
41+
services_ipv4_cidr_block = ""
42+
}
43+
44+
enable_autopilot = true
45+
46+
release_channel {
47+
channel = "RAPID"
48+
}
49+
50+
min_master_version = "1.27"
51+
}
52+

platform/modules/gke_cluster/output.tf platform/modules/gke_autopilot/output.tf

+5-5
Original file line numberDiff line numberDiff line change
@@ -14,26 +14,26 @@
1414

1515
output "project_id" {
1616
description = "GCP project id"
17-
value = resource.google_container_cluster.ml_cluster.project
17+
value = var.enable_autopilot ? resource.google_container_cluster.ml_cluster[0].project : null
1818
}
1919

2020
output "region" {
2121
description = "GCP region"
22-
value = resource.google_container_cluster.ml_cluster.location
22+
value = var.enable_autopilot ? resource.google_container_cluster.ml_cluster[0].location : null
2323
}
2424

2525
output "cluster_name" {
2626
description = "The name of the GKE cluster"
27-
value = resource.google_container_cluster.ml_cluster.name
27+
value = var.enable_autopilot ? resource.google_container_cluster.ml_cluster[0].name : null
2828
}
2929

3030
output "kubernetes_host" {
3131
description = "Kubernetes cluster host"
32-
value = resource.google_container_cluster.ml_cluster.endpoint
32+
value = var.enable_autopilot ? resource.google_container_cluster.ml_cluster[0].endpoint : null
3333
}
3434

3535
output "cluster_certicicate" {
3636
description = "Kubernetes cluster ca certificate"
37-
value = base64decode(resource.google_container_cluster.ml_cluster.master_auth[0].cluster_ca_certificate)
37+
value = var.enable_autopilot ? base64decode(resource.google_container_cluster.ml_cluster[0].master_auth[0].cluster_ca_certificate) : null
3838
sensitive = true
3939
}

platform/modules/gke_cluster/variables.tf platform/modules/gke_autopilot/variables.tf

+7-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ variable "project_id" {
2121
variable "region" {
2222
type = string
2323
description = "GCP project region or zone"
24-
default = "us-central1-c"
24+
default = "us-central1"
2525
}
2626

2727
variable "cluster_name" {
@@ -40,3 +40,9 @@ variable "num_gpu_nodes" {
4040
description = "Number of GPU nodes in the cluster"
4141
default = 1
4242
}
43+
44+
variable "enable_autopilot" {
45+
type = bool
46+
description = "Set to true to enable GKE Autopilot clusters"
47+
default = false
48+
}

platform/modules/gke_cluster/main.tf platform/modules/gke_standard/main.tf

+3-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ provider "google" {
2222
resource "google_container_cluster" "ml_cluster" {
2323
name = var.cluster_name
2424
location = var.region
25+
# Create the Standard cluster only when Autopilot is not requested.
count = var.enable_autopilot ? 0 : 1
2526

2627
initial_node_count = 3
2728

@@ -44,8 +45,9 @@ resource "google_container_cluster" "ml_cluster" {
4445
resource "google_container_node_pool" "gpu_pool" {
4546
name = "gpu-pool"
4647
location = var.region
47-
cluster = google_container_cluster.ml_cluster.name
48+
# This resource's own count already limits it to the Standard case, so the cluster instance can be indexed directly;
# the previous null fallback was unreachable (and null is not a usable value for this argument).
cluster = google_container_cluster.ml_cluster[0].name
4849
node_count = var.num_gpu_nodes
50+
# Create the GPU node pool only when Autopilot is not requested.
count = var.enable_autopilot ? 0 : 1
4951

5052
autoscaling {
5153
min_node_count = "1"
+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
output "project_id" {
16+
description = "GCP project id"
17+
value = var.enable_autopilot ? null : resource.google_container_cluster.ml_cluster[0].project
18+
}
19+
20+
output "region" {
21+
description = "GCP region"
22+
value = var.enable_autopilot ? null : resource.google_container_cluster.ml_cluster[0].location
23+
}
24+
25+
output "cluster_name" {
26+
description = "The name of the GKE cluster"
27+
value = var.enable_autopilot ? null : resource.google_container_cluster.ml_cluster[0].name
28+
}
29+
30+
output "kubernetes_host" {
31+
description = "Kubernetes cluster host"
32+
value = var.enable_autopilot ? null : resource.google_container_cluster.ml_cluster[0].endpoint
33+
}
34+
35+
# Base64-decoded cluster CA certificate; evaluates to null when enable_autopilot is true.
# NOTE(review): the output name is misspelled ("certicicate"). Renaming it would break any
# caller that references it, so confirm usages before correcting the spelling.
output "cluster_certicicate" {
36+
description = "Kubernetes cluster ca certificate"
37+
value = var.enable_autopilot ? null : base64decode(resource.google_container_cluster.ml_cluster[0].master_auth[0].cluster_ca_certificate)
38+
sensitive = true
39+
}
+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# GCP project ID passed to the google provider in this module.
# NOTE(review): the default looks like a developer-specific project ID; callers should
# always override it. Confirm whether a default is wanted here at all.
variable "project_id" {
16+
type = string
17+
description = "GCP project id"
18+
default = "ricliu-gke-dev"
19+
}
20+
21+
variable "region" {
22+
type = string
23+
description = "GCP project region or zone"
24+
default = "us-central1"
25+
}
26+
27+
variable "cluster_name" {
28+
type = string
29+
description = "GKE cluster name"
30+
default = "ml-cluster"
31+
}
32+
33+
variable "namespace" {
34+
type = string
35+
description = "Kubernetes namespace where resources are deployed"
36+
default = "ray"
37+
}
38+
39+
variable "num_gpu_nodes" {
40+
description = "Number of GPU nodes in the cluster"
41+
default = 1
42+
}
43+
44+
variable "enable_autopilot" {
45+
type = bool
46+
description = "Set to true to enable GKE Autopilot clusters"
47+
default = false
48+
}
+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
terraform {
16+
required_providers {
17+
}
18+
provider_meta "google" {
19+
module_name = "blueprints/terraform/terraform-google-kubernetes-engine:kuberay/v0.1.0"
20+
}
21+
}

platform/modules/kubernetes/kubernetes.tf

+1
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,5 @@ data "http" "nvidia_driver_installer_manifest" {
1818

1919
resource "kubectl_manifest" "nvidia_driver_installer" {
2020
yaml_body = data.http.nvidia_driver_installer_manifest.response_body
21+
count = var.enable_autopilot == false ? 1 : 0
2122
}

platform/modules/kubernetes/variables.tf

+6
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,9 @@ variable "namespace" {
2929
description = "Kubernetes namespace where resources are deployed"
3030
default = "ray"
3131
}
32+
33+
variable "enable_autopilot" {
34+
type = bool
35+
description = "Set to true to enable GKE Autopilot clusters"
36+
default = false
37+
}

platform/variables.tf

+7-1
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,17 @@ variable "project_id" {
2121
variable "region" {
2222
type = string
2323
description = "GCP project region or zone"
24-
default = "us-central1-c"
24+
default = "us-central1"
2525
}
2626

2727
variable "cluster_name" {
2828
type = string
2929
description = "GKE cluster name"
3030
default = "ml-cluster"
3131
}
32+
33+
variable "enable_autopilot" {
34+
type = bool
35+
description = "Set to true to enable GKE Autopilot clusters"
36+
default = false
37+
}

user/modules/jupyterhub/jupyterhub-values.yaml

+22
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,24 @@ hub:
2626
password: password
2727
JupyterHub:
2828
authenticator_class: dummy
29+
networkPolicy:
30+
enabled: false
31+
32+
prePuller:
33+
hook:
34+
enabled: false
35+
36+
proxy:
37+
chp:
38+
networkPolicy:
39+
enabled: false
40+
traefik:
41+
networkPolicy:
42+
enabled: false
43+
44+
scheduling:
45+
userScheduler:
46+
enabled: false
2947

3048
singleuser:
3149
nodeSelector:
@@ -40,3 +58,7 @@ singleuser:
4058
# `cmd: null` allows the custom CMD of the Jupyter docker-stacks to be used
4159
# which performs further customization on startup.
4260
cmd: null
61+
cloudMetadata:
62+
blockWithIptables: false
63+
networkPolicy:
64+
enabled: false

user/modules/kuberay/kuberay-values.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ head:
9595
annotations: {}
9696
nodeSelector:
9797
iam.gke.io/gke-metadata-server-enabled: "true"
98+
cloud.google.com/gke-accelerator: "nvidia-tesla-a100"
9899
tolerations: []
99100
affinity: {}
100101
# Ray container security context.
@@ -179,6 +180,7 @@ worker:
179180
key: value
180181
nodeSelector:
181182
iam.gke.io/gke-metadata-server-enabled: "true"
183+
cloud.google.com/gke-accelerator: "nvidia-tesla-a100"
182184
tolerations: []
183185
affinity: {}
184186
# Ray container security context.

0 commit comments

Comments
 (0)