Skip to content

Commit

Permalink
Initial transition from Helm releases to Applications for application products
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-k-nebius committed Oct 9, 2024
1 parent 15a7245 commit 153749e
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 75 deletions.
4 changes: 3 additions & 1 deletion k8s-inference/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ module "gpu-operator" {
nebius_mk8s_v1_node_group.gpu,
nebius_mk8s_v1_node_group.cpu-only,
]
source = "../modules/gpu-operator"
source = "../modules/gpu-operator"
parent_id = var.parent_id
cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id
}

module "o11y" {
Expand Down
64 changes: 34 additions & 30 deletions k8s-training/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@ module "network-operator" {
nebius_mk8s_v1_node_group.cpu-only,
nebius_mk8s_v1_node_group.gpu,
]
source = "../modules/network-operator"
source = "../modules/network-operator"
parent_id = var.parent_id
cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id
}

# Installs the GPU operator module into the training cluster.
# The explicit depends_on makes it wait for the network-operator module.
module "gpu-operator" {
  # `source` must be assigned exactly once per module block.
  source = "../modules/gpu-operator"

  depends_on = [
    module.network-operator,
  ]

  # Project and cluster the release is created in.
  parent_id  = var.parent_id
  cluster_id = nebius_mk8s_v1_cluster.k8s-cluster.id
}

module "o11y" {
Expand Down Expand Up @@ -44,34 +48,34 @@ module "o11y" {
}
test_mode = var.test_mode
}

module "nccl-test" {
depends_on = [
module.gpu-operator,
]

count = var.test_mode ? 1 : 0
source = "../modules/nccl-test"
number_of_hosts = nebius_mk8s_v1_node_group.gpu.fixed_node_count
}

module "kuberay" {
source = "../modules/kuberay"
count = var.enable_kuberay ? 1 : 0

depends_on = [
nebius_mk8s_v1_node_group.cpu-only,
nebius_mk8s_v1_node_group.gpu,
module.network-operator,
module.gpu-operator,
module.csi-mounted-fs-path,
]

gpu_platform = var.gpu_nodes_platform
cpu_platform = var.cpu_nodes_platform
min_gpu_replicas = var.kuberay_min_gpu_replicas
max_gpu_replicas = var.kuberay_max_gpu_replicas
}
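# NOTE(review): the nccl-test and kuberay modules below are commented out with
# no stated reason - presumably temporarily during the move from Helm releases
# to applications. Confirm, then re-enable or delete them.
#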
#module "nccl-test" {
# depends_on = [
# module.gpu-operator,
# ]
#
# count = var.test_mode ? 1 : 0
# source = "../modules/nccl-test"
# number_of_hosts = nebius_mk8s_v1_node_group.gpu.fixed_node_count
#}
#
#module "kuberay" {
# source = "../modules/kuberay"
# count = var.enable_kuberay ? 1 : 0
#
# depends_on = [
# nebius_mk8s_v1_node_group.cpu-only,
# nebius_mk8s_v1_node_group.gpu,
# module.network-operator,
# module.gpu-operator,
# module.csi-mounted-fs-path,
# ]
#
# gpu_platform = var.gpu_nodes_platform
# cpu_platform = var.cpu_nodes_platform
# min_gpu_replicas = var.kuberay_min_gpu_replicas
# max_gpu_replicas = var.kuberay_max_gpu_replicas
#}


module "csi-mounted-fs-path" {
Expand Down
30 changes: 16 additions & 14 deletions modules/gpu-operator/helm.tf
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
resource "helm_release" "gpu-operator" {
name = "gpu-operator"
# FIXME set to production product
repository = var.helm_repository
chart = "gpu-operator"
namespace = "gpu-operator"
create_namespace = true
version = var.helm_version
atomic = true
timeout = 600

set {
name = "driver.version"
value = var.driver_version
# Deploys the "gpu-operator" application into the given cluster through a
# nebius_applications_v1alpha1_k8s_release resource.
resource "nebius_applications_v1alpha1_k8s_release" "gpu-operator" {
  # NOTE(review): ignore_changes = all means Terraform will not update this
  # release after it is created, so later edits to driver_version or
  # product_slug are not applied. Confirm this is intentional.
  lifecycle {
    ignore_changes = all
  }

  cluster_id = var.cluster_id
  parent_id  = var.parent_id

  application_name = "gpu-operator"
  namespace        = "gpu-operator"
  # Use the module variable (default "nebius/nvidia-gpu-operator") instead of
  # a hard-coded copy, matching the network-operator module.
  product_slug = var.product_slug

  # set = {
  #   "driver.version" : var.driver_version
  # }

  # Build the values document with yamlencode so nesting and quoting do not
  # depend on heredoc whitespace or on the shape of the interpolated value.
  values = yamlencode({
    driver = {
      version = var.driver_version
    }
  })
}
16 changes: 10 additions & 6 deletions modules/gpu-operator/variables.tf
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
variable "helm_repository" {
description = "GPU Operator Helm chart source repository."
variable "cluster_id" {
description = "K8s cluster id."
type = string
default = "oci://cr.eu-north1.nebius.cloud/marketplace/nebius/nvidia-gpu-operator/chart"
}

variable "helm_version" {
description = "Version of GPU Operator Helm chart."
variable "parent_id" {
description = "Project id."
type = string
default = "v24.6.2"
}

# Marketplace product slug; defaults to the NVIDIA GPU operator product.
variable "product_slug" {
  type        = string
  default     = "nebius/nvidia-gpu-operator"
  description = "Marketplace product slug."
}

variable "driver_version" {
Expand Down
7 changes: 7 additions & 0 deletions modules/gpu-operator/versions.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Declares the source address of the "nebius" provider required by this module.
terraform {
  required_providers {
    nebius = { source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" }
  }
}
38 changes: 20 additions & 18 deletions modules/network-operator/helm.tf
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
resource "helm_release" "network_operator" {
name = "network-operator"
repository = var.helm_repository
chart = "network-operator"
namespace = "network-operator"
atomic = true
timeout = 600

create_namespace = true
version = var.helm_version

set {
name = "operator.resources.limits.cpu"
value = var.limit_cpu
# Deploys the "network-operator" application into the given cluster through a
# nebius_applications_v1alpha1_k8s_release resource.
resource "nebius_applications_v1alpha1_k8s_release" "network-operator" {
  # NOTE(review): ignore_changes = all means Terraform will not update this
  # release after it is created, so later edits to limit_cpu, limit_memory or
  # product_slug are not applied. Confirm this is intentional.
  lifecycle {
    ignore_changes = all
  }

  cluster_id = var.cluster_id
  parent_id  = var.parent_id

  application_name = "network-operator"
  namespace        = "network-operator"
  product_slug     = var.product_slug

  # set = {
  #   "operator.resources.limits.cpu" : var.limit_cpu,
  #   "operator.resources.limits.memory" : var.limit_memory
  # }

  # Operator resource limits are passed only through `values`; the Helm-style
  # `set { ... }` block is gone. yamlencode keeps nesting and quoting
  # independent of heredoc whitespace and of the interpolated values.
  values = yamlencode({
    operator = {
      resources = {
        limits = {
          cpu    = var.limit_cpu
          memory = var.limit_memory
        }
      }
    }
  })
}
16 changes: 10 additions & 6 deletions modules/network-operator/variables.tf
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
variable "helm_repository" {
description = "Network Operator Helm chart source repository."
variable "cluster_id" {
description = "K8s cluster id."
type = string
default = "oci://cr.eu-north1.nebius.cloud/marketplace/nebius/nvidia-network-operator/chart"
}

variable "helm_version" {
description = "Version of Network Operator Helm chart."
variable "parent_id" {
description = "Project id."
type = string
default = "24.4.0"
}

# Marketplace product slug; defaults to the NVIDIA network operator product.
variable "product_slug" {
  type        = string
  default     = "nebius/nvidia-network-operator"
  description = "Marketplace product slug."
}

variable "limit_cpu" {
Expand Down
7 changes: 7 additions & 0 deletions modules/network-operator/versions.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Declares the source address of the "nebius" provider required by this module.
terraform {
  required_providers {
    nebius = { source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" }
  }
}

0 comments on commit 153749e

Please sign in to comment.