From 7840fa12380f72e219c98d838a5c2c5e2b8dbc6a Mon Sep 17 00:00:00 2001 From: David Grove Date: Thu, 19 Dec 2024 13:48:31 -0500 Subject: [PATCH] Sakkara support (#118) * Add support for Sakkara Co-authored-by: Lixiang "Eric" Luo * drop bypassCoscheduler; add test for setting schedulerName --------- Co-authored-by: Lixiang "Eric" Luo --- tools/pytorchjob-generator/chart/README.md | 2 +- .../chart/templates/_helpers.tpl | 16 +- .../__snapshot__/helloworld_test.yaml.snap | 150 ++++++++++++++++++ .../chart/tests/helloworld_test.yaml | 7 + .../chart/values.schema.json | 5 +- tools/pytorchjob-generator/chart/values.yaml | 8 +- .../examples/helloworld-sakkara.settings.yaml | 26 +++ 7 files changed, 205 insertions(+), 9 deletions(-) create mode 100644 tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml diff --git a/tools/pytorchjob-generator/chart/README.md b/tools/pytorchjob-generator/chart/README.md index e4820ca..589850c 100644 --- a/tools/pytorchjob-generator/chart/README.md +++ b/tools/pytorchjob-generator/chart/README.md @@ -59,7 +59,7 @@ customize the Jobs generated by the tool. | initContainers | array | `nil` | List of "(name, image, command[])" specifying an init containers to be run before the main job. The 'command' field is a list of commands to run in the container, see the Kubernetes entry on initContainers for reference. | | autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. | | hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). | -| bypassCoscheduler | boolean | `false` | If true, use the default Kubernetes scheduler instead of the co-scheduler. ***Setting this to true will result in GPU fragmentation on the cluster. 
It should only be set to true when explicitly directed to do so by a cluster admin!*** | +| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** | | serviceAccountName | string | the default service account for the namespace will be used. | Service account to be used for running the Job | ### Fault Tolerance diff --git a/tools/pytorchjob-generator/chart/templates/_helpers.tpl b/tools/pytorchjob-generator/chart/templates/_helpers.tpl index f5dc4f2..458589e 100644 --- a/tools/pytorchjob-generator/chart/templates/_helpers.tpl +++ b/tools/pytorchjob-generator/chart/templates/_helpers.tpl @@ -34,8 +34,8 @@ metadata: {{- if ne .Values.terminationGracePeriodSeconds nil }} terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }} {{- end }} -{{- if .Values.bypassCoscheduler }} -schedulerName: default-scheduler +{{- if .Values.schedulerName }} +schedulerName: {{ .Values.schedulerName }} {{- end }} priorityClassName: {{ .Values.priority }} affinity: @@ -81,8 +81,14 @@ envFrom: - configMapRef: name: {{ .Values.ncclGdrEnvConfigMap }} {{- end }} -{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap }} +{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) }} env: + {{- if eq .Values.schedulerName "sakkara" }} + - name: SAKKARA_RANK + valueFrom: + fieldRef: + fieldPath: metadata.labels['sakkara.member.rank'] + {{- end }} {{- if .Values.topologyFileConfigMap }} - name: NCCL_TOPO_FILE value: /var/run/nvidia-topologyd/virtualTopology.xml @@ -146,6 +152,10 @@ command: # # User commands # + {{- if eq .Values.schedulerName "sakkara" }} + echo "Sakkara is enabled: using Sakkara-assigned 
rank instead of the default PyTorchJob rank" + export RANK=$SAKKARA_RANK + {{- end }} {{- range $command := .Values.setupCommands }} {{ $command }} {{- end }} diff --git a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap index fe516e9..29ba19e 100644 --- a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap +++ b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap @@ -1362,3 +1362,153 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts: - emptyDir: medium: Memory name: dshm +scheduler can be set: + 1: | + apiVersion: workload.codeflare.dev/v1beta2 + kind: AppWrapper + metadata: + annotations: + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.6 + labels: + kueue.x-k8s.io/queue-name: default-queue + name: my-job + namespace: my-namespace + spec: + components: + - template: + apiVersion: kubeflow.org/v1 + kind: PyTorchJob + metadata: + name: my-job + spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: Never + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: autopilot.ibm.com/gpuhealth + operator: NotIn + values: + - ERR + - TESTING + - EVICT + containers: + - command: + - sh + - -c + - | + echo "Environment variables set by the kubeflow training operator:" + echo ${MASTER_ADDR}:${MASTER_PORT} + echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} + echo My global rank is ${RANK} / ${WORLD_SIZE} + echo "Other injected environment variables:" + echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} + # + # User commands + # + echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank" + export RANK=$SAKKARA_RANK + git clone https://github.com/dbarnett/python-helloworld + cd python-helloworld + echo executing: torchrun --nnodes=${WORLD_SIZE} 
--node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py + torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py + env: + - name: SAKKARA_RANK + valueFrom: + fieldRef: + fieldPath: metadata.labels['sakkara.member.rank'] + image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 + imagePullPolicy: IfNotPresent + name: pytorch + resources: + limits: + cpu: 500m + memory: 1Gi + nvidia.com/gpu: 8 + nvidia.com/roce_gdr: 0 + requests: + cpu: 500m + memory: 1Gi + nvidia.com/gpu: 8 + nvidia.com/roce_gdr: 0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + imagePullSecrets: [] + priorityClassName: default-priority + schedulerName: sakkara + volumes: + - emptyDir: + medium: Memory + name: dshm + Worker: + replicas: 3 + restartPolicy: Never + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: autopilot.ibm.com/gpuhealth + operator: NotIn + values: + - ERR + - TESTING + - EVICT + containers: + - command: + - sh + - -c + - | + echo "Environment variables set by the kubeflow training operator:" + echo ${MASTER_ADDR}:${MASTER_PORT} + echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} + echo My global rank is ${RANK} / ${WORLD_SIZE} + echo "Other injected environment variables:" + echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} + # + # User commands + # + echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank" + export RANK=$SAKKARA_RANK + git clone https://github.com/dbarnett/python-helloworld + cd python-helloworld + echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py + torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 
--rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py + env: + - name: SAKKARA_RANK + valueFrom: + fieldRef: + fieldPath: metadata.labels['sakkara.member.rank'] + image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 + imagePullPolicy: IfNotPresent + name: pytorch + resources: + limits: + cpu: 500m + memory: 1Gi + nvidia.com/gpu: 8 + nvidia.com/roce_gdr: 0 + requests: + cpu: 500m + memory: 1Gi + nvidia.com/gpu: 8 + nvidia.com/roce_gdr: 0 + volumeMounts: + - mountPath: /dev/shm + name: dshm + imagePullSecrets: [] + priorityClassName: default-priority + schedulerName: sakkara + volumes: + - emptyDir: + medium: Memory + name: dshm diff --git a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml index 01b7fb5..1d8c607 100644 --- a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml +++ b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml @@ -86,6 +86,13 @@ tests: path: metadata.namespace value: testing-ns +- it: scheduler can be set + set: + schedulerName: sakkara + asserts: + - matchSnapshot: + path: spec.components[0].template + - it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts set: sshGitCloneConfig.secretName: my-git-secret diff --git a/tools/pytorchjob-generator/chart/values.schema.json b/tools/pytorchjob-generator/chart/values.schema.json index 241ac06..246a198 100644 --- a/tools/pytorchjob-generator/chart/values.schema.json +++ b/tools/pytorchjob-generator/chart/values.schema.json @@ -114,7 +114,10 @@ { "type": "null" }, { "type": "array" } ]}, - "bypassCoscheduler": { "type": "boolean" }, + "schedulerName": { "oneOf": [ + { "type": "null" }, + { "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] } + ]}, "serviceAccountName": { "oneOf" : [ { "type": "null" }, { "$ref": "#/$defs/rfc1123Label" } diff --git a/tools/pytorchjob-generator/chart/values.yaml 
b/tools/pytorchjob-generator/chart/values.yaml index d5aca09..b947546 100644 --- a/tools/pytorchjob-generator/chart/values.yaml +++ b/tools/pytorchjob-generator/chart/values.yaml @@ -211,11 +211,11 @@ hostIgnoreList: # - a100-large-drlfv-worker-3-with-secondary-nw5qh # - a100-large-drlfv-worker-3-with-secondary-lb7ch -# -- (boolean) If true, use the default Kubernetes scheduler instead of the co-scheduler. -# ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set -# to true when explicitly directed to do so by a cluster admin!*** +# -- (string) If non-nil, use the specified Kubernetes scheduler. +# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this +# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** # @section -- Advanced Options -bypassCoscheduler: false +schedulerName: # -- (string) Service account to be used for running the Job # @section -- Advanced Options diff --git a/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml b/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml new file mode 100644 index 0000000..67c83cc --- /dev/null +++ b/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml @@ -0,0 +1,26 @@ +jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required) +queueName: default-queue # local queue to submit to (default: default-queue) + +schedulerName: sakkara +# If additional constraints are used, uncomment customLabels below to specify the configmap: +#customLabels: +# - key: sakkara.group.name +# value: my-topogrp-0 + +numPods: 4 # total pod count including master and worker pods (default: 1) +numCpusPerPod: 500m # requested number of cpus per pod (default: 1) +numGpusPerPod: 8 # requested number of gpus per pod (default: 0) +totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi) + +priority: default-priority # default-priority (default),
low-priority, or high-priority + +# container image for the pods (required) +containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 + +# setup commands to run in each pod (optional) +setupCommands: +- git clone https://github.com/dbarnett/python-helloworld +- cd python-helloworld + +# main program to invoke via torchrun (optional) +mainProgram: helloworld.py