Skip to content

Commit

Permalink
Merge pull request #47 from nebius/release/soperator
Browse files Browse the repository at this point in the history
Soperator Release v1.14.11
  • Loading branch information
dstaroff authored Oct 22, 2024
2 parents 6253b42 + b165086 commit 71a23ab
Show file tree
Hide file tree
Showing 16 changed files with 487 additions and 425 deletions.
516 changes: 317 additions & 199 deletions soperator/README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion soperator/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.14.10
1.14.11
55 changes: 36 additions & 19 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Values derived from input variables and reused across this installation.
locals {
  # Resource descriptor (platform + preset) configured for the GPU node group.
  worker_node_shape = var.k8s_cluster_node_group_gpu.resource

  # Entry of the available-resources lookup table that matches the configured
  # platform and preset.
  worker_resources = module.resources.this[local.worker_node_shape.platform][local.worker_node_shape.preset]

  # True when the Slurm login service type is "NodePort".
  create_nlb = var.slurm_login_service_type == "NodePort"
}

module "filestore" {
Expand Down Expand Up @@ -115,21 +117,45 @@ module "k8s" {
}
}

module "nvidia_operators" {
module "nvidia_operator_network" {
count = local.worker_resources.gpus > 0 ? 1 : 0

depends_on = [
module.k8s
]

source = "../../modules/nvidia_operators"
source = "../../../modules/network-operator"

cluster_id = module.k8s.cluster_id
parent_id = data.nebius_iam_v1_project.this.id

providers = {
helm = helm
nebius = nebius
}
}

# GPU operator module. Instantiated only when the selected worker preset
# reports at least one GPU, and ordered after the network operator module.
module "nvidia_operator_gpu" {
  source = "../../../modules/gpu-operator"

  # Zero instances for GPU-less presets, exactly one otherwise.
  count = local.worker_resources.gpus <= 0 ? 0 : 1

  providers = {
    nebius = nebius
  }

  depends_on = [
    module.nvidia_operator_network,
  ]

  cluster_id = module.k8s.cluster_id
  parent_id  = data.nebius_iam_v1_project.this.id

  # The DCGM service monitor follows the telemetry switch.
  enable_dcgm_service_monitor = var.telemetry_enabled
}

module "slurm" {
depends_on = [
module.k8s
module.k8s,
]

source = "../../modules/slurm"
Expand All @@ -139,20 +165,12 @@ module "slurm" {

node_count = var.slurm_node_count

worker_resources = tomap({
"8gpu-128vcpu-1600gb" = {
cpu_cores = 128 - 48
memory_gibibytes = 1600 - 400
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = 8
}
"1gpu-20vcpu-200gb" = {
cpu_cores = 20 - 4
memory_gibibytes = 200 - 50
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = 1
}
})[var.k8s_cluster_node_group_gpu.resource.preset]
worker_resources = {
cpu_cores = local.worker_resources.cpu_cores
memory_gibibytes = local.worker_resources.memory_gibibytes
ephemeral_storage_gibibytes = ceil(var.k8s_cluster_node_group_gpu.boot_disk.size_gibibytes / 2)
gpus = local.worker_resources.gpus
}

login_service_type = var.slurm_login_service_type
login_node_port = var.slurm_login_node_port
Expand All @@ -164,7 +182,6 @@ module "slurm" {
slurmdbd_config = var.slurmdbd_config
slurm_accounting_config = var.slurm_accounting_config

# TODO: MSP-2817 - use computed values of filestore sizes
filestores = {
controller_spool = {
size_gibibytes = module.filestore.controller_spool.size_gibibytes
Expand Down
6 changes: 5 additions & 1 deletion soperator/installations/example/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ terraform {
required_providers {
nebius = {
source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius"
version = "0.3.22"
version = "0.4.4"
}

units = {
Expand Down Expand Up @@ -41,3 +41,7 @@ provider "helm" {
token = var.iam_token
}
}

# Shared lookup table of node resources, indexed as
# module.resources.this[platform][preset] by this installation.
module "resources" {
source = "../../modules/available_resources"
}
2 changes: 1 addition & 1 deletion soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ slurm_cluster_name = "my-amazing-slurm"

# Version of soperator.
# ---
slurm_operator_version = "1.14.10"
slurm_operator_version = "1.14.11"

#----------------------------------------------------------------------------------------------------------------------#
# #
Expand Down
92 changes: 92 additions & 0 deletions soperator/modules/available_resources/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
locals {
  # TODO: Get to know exact amount of allocatable resources

  # Subtracted from the nominal vCPU count of every preset below.
  reserved_cpu_cores = 2

  # Subtracted from the nominal memory of every preset except the two
  # "gpu-h100-sxm" ones, which carry their own reservations.
  reserved_memory_gibibytes = 10

  # [vCPUs, memory GiB] of the supported "cpu-e2" presets.
  # Insufficient resource presets (deliberately absent):
  #   2vcpu-8gb
  #   4vcpu-16gb
  cpu_e2_shapes = [
    [8, 32],
    [16, 64],
    [32, 128],
    [48, 192],
    [64, 256],
    [80, 320],
  ]

  # [vCPUs, memory GiB] of the single-GPU "gpu-l40s-a" presets.
  gpu_l40s_a_shapes = [
    [8, 32],
    [16, 64],
    [24, 96],
    [32, 128],
    [40, 160],
  ]

  # platform -> preset -> resources of a node with that shape.
  resources = tomap({
    "cpu-e2" = tomap({
      for shape in local.cpu_e2_shapes :
      "${shape[0]}vcpu-${shape[1]}gb" => {
        cpu_cores              = shape[0] - local.reserved_cpu_cores
        memory_gibibytes       = shape[1] - local.reserved_memory_gibibytes
        gpus                   = 0
        gpu_cluster_compatible = false
      }
    })

    "gpu-h100-sxm" = tomap({
      "1gpu-16vcpu-200gb" = {
        cpu_cores              = 16 - local.reserved_cpu_cores
        memory_gibibytes       = 200 - 15
        gpus                   = 1
        gpu_cluster_compatible = false
      }
      # The only preset in this table flagged as GPU-cluster compatible.
      "8gpu-128vcpu-1600gb" = {
        cpu_cores              = 128 - local.reserved_cpu_cores
        memory_gibibytes       = 1600 - 350
        gpus                   = 8
        gpu_cluster_compatible = true
      }
    })

    "gpu-l40s-a" = tomap({
      for shape in local.gpu_l40s_a_shapes :
      "1gpu-${shape[0]}vcpu-${shape[1]}gb" => {
        cpu_cores              = shape[0] - local.reserved_cpu_cores
        memory_gibibytes       = shape[1] - local.reserved_memory_gibibytes
        gpus                   = 1
        gpu_cluster_compatible = false
      }
    })
  })
}
4 changes: 4 additions & 0 deletions soperator/modules/available_resources/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Publishes the complete resource lookup table of this module.
output "this" {
  value       = local.resources
  description = "Map of available node resources grouped by platform -> preset."
}
25 changes: 12 additions & 13 deletions soperator/modules/k8s/k8s_ng_gpu.tf
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
locals {
gpu = {
cluster = {
create = tomap({
"8gpu-128vcpu-1600gb" = true
"1gpu-20vcpu-200gb" = false
})[var.node_group_gpu.resource.preset]

create = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpu_cluster_compatible
name = join("-", [
trimsuffix(
substr(
Expand All @@ -18,11 +14,6 @@ locals {
var.node_group_gpu.gpu_cluster.infiniband_fabric
])
}

count = tomap({
"8gpu-128vcpu-1600gb" = 8
"1gpu-20vcpu-200gb" = 1
})[var.node_group_gpu.resource.preset]
}
}

Expand Down Expand Up @@ -62,11 +53,11 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
metadata = {
labels = module.labels.label_group_name_gpu
}
taints = [{
taints = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpus > 0 ? [{
key = "nvidia.com/gpu",
value = local.gpu.count
value = module.resources.this[var.node_group_gpu.resource.platform][var.node_group_gpu.resource.preset].gpus
effect = "NO_SCHEDULE"
}]
}] : null

resources = {
platform = var.node_group_gpu.resource.platform
Expand Down Expand Up @@ -105,5 +96,13 @@ resource "nebius_mk8s_v1_node_group" "gpu" {
ignore_changes = [
labels,
]

precondition {
condition = (var.node_group_gpu.resource.platform == "cpu-e2"
? !contains(["2vcpu-8gb", "4vcpu-16gb"], var.node_group_gpu.resource.preset)
: true
)
error_message = "Worker resource preset '${var.node_group_gpu.resource.preset}' is insufficient."
}
}
}
5 changes: 5 additions & 0 deletions soperator/modules/k8s/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ output "control_plane" {
}
}

# Identifier of the nebius_mk8s_v1_cluster resource managed by this module.
output "cluster_id" {
  value       = nebius_mk8s_v1_cluster.this.id
  description = "K8s cluster ID."
}

output "allocation_id" {
description = "ID of the VPC allocation used for SSH connection into Slurm cluster."
value = local.allocation_id
Expand Down
4 changes: 4 additions & 0 deletions soperator/modules/k8s/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ terraform {
module "labels" {
source = "../labels"
}

# Shared lookup table of node resources, indexed as
# module.resources.this[platform][preset] by this module.
module "resources" {
source = "../available_resources"
}
2 changes: 1 addition & 1 deletion soperator/modules/login/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ resource "local_file" "this" {
terraform_data.connection_ip,
]

filename = "${path.root}/login.sh"
filename = "${path.root}/${var.script_name}.sh"
file_permission = "0774"
content = templatefile("${path.module}/templates/login.sh.tftpl", {
address = terraform_data.connection_ip.output
Expand Down
6 changes: 6 additions & 0 deletions soperator/modules/login/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ variable "slurm_cluster_name" {
type = string
nullable = false
}

# Base name of the generated login script. The ".sh" extension is appended
# where the file is written, so it must not be part of this value.
variable "script_name" {
  description = "Name of the script file (without the `.sh` extension)."
  type        = string
  default     = "login"

  # Matches the other variables of this module; an explicit null now falls
  # back to the default instead of failing the file name interpolation.
  nullable = false

  # An empty name would produce a file called just ".sh".
  validation {
    condition     = length(trimspace(var.script_name)) > 0
    error_message = "Script name must not be empty."
  }
}
17 changes: 0 additions & 17 deletions soperator/modules/nvidia_operators/locals.tf

This file was deleted.

60 changes: 0 additions & 60 deletions soperator/modules/nvidia_operators/main.tf

This file was deleted.

Loading

0 comments on commit 71a23ab

Please sign in to comment.