Skip to content

Commit c274552

Browse files
committed
add variables to control autoscaler
1 parent 4a4a8a4 commit c274552

6 files changed

+87
-16
lines changed

add_on_k8s_autoscaler.tf

+32-7
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ locals {
77
cluster_autoscaler_image_version = lookup(local.cluster_autoscaler_supported_k8s_versions, local.k8s_major_minor_version, reverse(values(local.cluster_autoscaler_supported_k8s_versions))[0])
88
cluster_autoscaler_image = "iad.ocir.io/oracle/oci-cluster-autoscaler:${local.cluster_autoscaler_image_version}"
99
cluster_autoscaler_log_level_verbosity = 4
10-
cluster_autoscaler_max_node_provision_time = "25m"
11-
cluster_autoscaler_scale_down_delay_after_add = "10m"
12-
cluster_autoscaler_scale_down_unneeded_time = "10m"
13-
cluster_autoscaler_unremovable_node_recheck_timeout = "5m"
10+
cluster_autoscaler_max_node_provision_time = "${var.cluster_autoscaler_max_node_provision_time}m"
11+
cluster_autoscaler_scale_down_delay_after_add = "${var.cluster_autoscaler_scale_down_delay_after_add}m"
12+
cluster_autoscaler_scale_down_unneeded_time = "${var.cluster_autoscaler_scale_down_unneeded_time}m"
13+
cluster_autoscaler_unremovable_node_recheck_timeout = "${var.cluster_autoscaler_unremovable_node_recheck_timeout}m"
1414
cluster_autoscaler_cloud_provider = tonumber(local.k8s_minor_version) <= 23 ? "oci" : "oci-oke"
1515
cluster_autoscaler_enabled = contains(keys(local.cluster_autoscaler_supported_k8s_versions), local.k8s_major_minor_version) ? (var.np1_enable_autoscaler || var.np2_enable_autoscaler || var.np3_enable_autoscaler) : false
1616
k8s_major_minor_version = regex("\\d+(?:\\.(?:\\d+|x)(?:))", local.kubernetes_version)
@@ -273,9 +273,9 @@ resource "kubernetes_deployment" "cluster_autoscaler_deployment" {
273273
"--stderrthreshold=info",
274274
"--cloud-provider=${local.cluster_autoscaler_cloud_provider}",
275275
"--max-node-provision-time=${local.cluster_autoscaler_max_node_provision_time}",
276-
"--nodes=${var.np1_autoscaler_min_nodes}:${var.np1_autoscaler_max_nodes}:${oci_containerengine_node_pool.oci_oke_node_pool[0].id}",
277-
var.node_pool_count >= 2 ? "--nodes=${var.np2_autoscaler_min_nodes}:${var.np2_autoscaler_max_nodes}:${oci_containerengine_node_pool.oci_oke_node_pool[1].id}" : "",
278-
var.node_pool_count >= 3 ? "--nodes=${var.np3_autoscaler_min_nodes}:${var.np3_autoscaler_max_nodes}:${oci_containerengine_node_pool.oci_oke_node_pool[2].id}" : "",
276+
var.np1_enable_autoscaler ? "--nodes=${var.np1_autoscaler_min_nodes}:${var.np1_autoscaler_max_nodes}:${oci_containerengine_node_pool.oci_oke_node_pool[0].id}" : "",
277+
var.node_pool_count >= 2 && var.np2_enable_autoscaler ? "--nodes=${var.np2_autoscaler_min_nodes}:${var.np2_autoscaler_max_nodes}:${oci_containerengine_node_pool.oci_oke_node_pool[1].id}" : "",
278+
var.node_pool_count >= 3 && var.np3_enable_autoscaler ? "--nodes=${var.np3_autoscaler_min_nodes}:${var.np3_autoscaler_max_nodes}:${oci_containerengine_node_pool.oci_oke_node_pool[2].id}" : "",
279279
"--scale-down-delay-after-add=${local.cluster_autoscaler_scale_down_delay_after_add}",
280280
"--scale-down-unneeded-time=${local.cluster_autoscaler_scale_down_unneeded_time}",
281281
"--unremovable-node-recheck-timeout=${local.cluster_autoscaler_unremovable_node_recheck_timeout}",
@@ -338,3 +338,28 @@ resource "kubernetes_pod_disruption_budget_v1" "core_dns_pod_disruption_budget"
338338
oci_containerengine_node_pool.oci_oke_node_pool
339339
]
340340
}
341+
342+
resource "kubernetes_pod_disruption_budget_v1" "cluster_autoscaler_pod_disruption_budget" {
343+
count = local.cluster_autoscaler_enabled ? 1 : 0
344+
345+
metadata {
346+
name = "cluster-autoscaler-pdb"
347+
namespace = "kube-system"
348+
labels = {
349+
k8s-app = "cluster-autoscaler"
350+
}
351+
}
352+
spec {
353+
max_unavailable = "1"
354+
selector {
355+
match_labels = {
356+
app = "cluster-autoscaler"
357+
}
358+
}
359+
}
360+
361+
depends_on = [
362+
data.oci_containerengine_cluster_kube_config.oke,
363+
oci_containerengine_node_pool.oci_oke_node_pool
364+
]
365+
}

oke_cluster.tf

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
## All rights reserved. The Universal Permissive License (UPL), Version 1.0 as shown at http://oss.oracle.com/licenses/upl
33

44
locals {
5-
kubernetes_version = var.kubernetes_version != "" ? var.kubernetes_version : reverse(data.oci_containerengine_cluster_option.cluster_options.kubernetes_versions)[0]
5+
kubernetes_version = (var.kubernetes_version != "" && var.kubernetes_version != null) ? var.kubernetes_version : reverse(data.oci_containerengine_cluster_option.cluster_options.kubernetes_versions)[0]
66
}
77

88
resource "oci_containerengine_cluster" "oci_oke_cluster" {

oke_node_pools.tf

+2-2
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ resource "oci_containerengine_node_pool" "oci_oke_node_pool" {
5757

5858
cluster_id = oci_containerengine_cluster.oci_oke_cluster.id
5959
compartment_id = var.cluster_compartment_id
60-
kubernetes_version = var.kubernetes_version != "" ? var.kubernetes_version : reverse(data.oci_containerengine_cluster_option.cluster_options.kubernetes_versions)[0]
61-
name = "${local.node_pools[count.index]["node_shape"]}_Node_Pool"
60+
kubernetes_version = (var.kubernetes_version != "" && var.kubernetes_version != null) ? var.kubernetes_version : reverse(data.oci_containerengine_cluster_option.cluster_options.kubernetes_versions)[0]
61+
name = "${replace(local.node_pools[count.index]["node_shape"], "Standard", "Std")}${length(regexall("Flex", local.node_pools[count.index]["node_shape"])) > 0 ? "-${local.node_pools[count.index]["ocpus"]}-${local.node_pools[count.index]["memory_gb"]}GB" : ""}"
6262
node_shape = local.node_pools[count.index]["node_shape"]
6363

6464
# initial_node_labels {

schema.yaml

+29-1
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,18 @@ variableGroups:
156156
variables:
157157
- enable_pod_admission_controller
158158

159+
- title: "Cluster Autoscaler Settings"
160+
variables:
161+
- cluster_autoscaler_max_node_provision_time
162+
- cluster_autoscaler_scale_down_delay_after_add
163+
- cluster_autoscaler_scale_down_unneeded_time
164+
- cluster_autoscaler_unremovable_node_recheck_timeout
165+
visible:
166+
or:
167+
- np1_enable_autoscaler
168+
- np2_enable_autoscaler
169+
- np3_enable_autoscaler
170+
159171
- title: "Deployments"
160172
variables:
161173
- enable_flink
@@ -1079,6 +1091,22 @@ variables:
10791091
description: |
10801092
Deploys Prometheus, Grafana and related datasources, plugins and dashboards.
10811093
1094+
cluster_autoscaler_max_node_provision_time:
1095+
type: number
1096+
title: Max node provisioning time before it is considered failed.
1097+
1098+
cluster_autoscaler_scale_down_delay_after_add:
1099+
type: number
1100+
title: Wait time before scaling a node down after it was added.
1101+
1102+
cluster_autoscaler_scale_down_unneeded_time:
1103+
type: number
1104+
title: Wait time before scaling a node down after it is unneeded.
1105+
1106+
cluster_autoscaler_unremovable_node_recheck_timeout:
1107+
type: number
1108+
title: Wait time between checks on unremovable nodes.
1109+
10821110
outputGroups:
10831111
- title: Access
10841112
outputs:
@@ -1117,4 +1145,4 @@ outputs:
11171145
type: copyableString
11181146
title: Grafana access
11191147
displayText: "Grafana password for 'admin' user"
1120-
visible:
1148+
visible: enable_monitoring_stack

terraform.tfvars.template

+5-4
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
## All rights reserved. The Universal Permissive License (UPL), Version 1.0 as shown at http://oss.oracle.com/licenses/upl
33

44
region = "us-ashburn-1"
5-
tenancy_ocid = "ocid1.tenancy.oc1..."
5+
tenancy_ocid = "ocid1.tenancy.oc1.."
66

77
## Compartments
8-
vcn_compartment_id = "ocid1.compartment.oc1..."
9-
cluster_compartment_id = "ocid1.compartment.oc1..."
8+
vcn_compartment_id = "ocid1.compartment.oc1.."
9+
cluster_compartment_id = "ocid1.compartment.oc1.."
1010

1111
## Network
1212
use_existing_vcn = false
@@ -17,7 +17,7 @@ is_endpoint_public = true
1717

1818
## Cluster
1919
cluster_name = "Flink Cluster"
20-
# kubernetes_version=
20+
kubernetes_version=""
2121
ssh_public_key = "ssh-rsa AAAA..."
2222
node_pool_count = 1
2323
# add_cluster_tag=
@@ -76,3 +76,4 @@ image_validation_key_id = null # "ocid1.key.oc1..."
7676
enable_cert_manager = true
7777
enable_flink = true
7878
enable_metrics_server = true
79+
enable_monitoring_stack = true

variables.tf

+18-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ variable "cluster_name" {
3838
}
3939

4040
variable "kubernetes_version" {
41-
default = ""
41+
default = null
4242
# defaults to the latest available Kubernetes version when null or empty ("")
4343
}
4444

@@ -277,4 +277,21 @@ variable "enable_flink" {
277277

278278
variable "enable_monitoring_stack" {
279279
default = true
280+
}
281+
282+
variable "cluster_autoscaler_max_node_provision_time" {
283+
default = 25
284+
description = "Maximum wait time (min) for nodes to provision before failure"
285+
}
286+
variable cluster_autoscaler_scale_down_delay_after_add {
287+
default = 10
288+
description = "Minimum delay (min) before scaling a node down after it was provisioned"
289+
}
290+
variable cluster_autoscaler_scale_down_unneeded_time {
291+
default = 10
292+
description = "Minimum delay (min) before scaling a node down once it is unneeded"
293+
}
294+
variable cluster_autoscaler_unremovable_node_recheck_timeout {
295+
default = 5
296+
description = "Time (min) between checks on status of unremovable nodes"
280297
}

0 commit comments

Comments
 (0)