
Commit 01050db

drop bypassCoscheduler; add test for setting schedulerName

1 parent 02213ef commit 01050db

7 files changed: +164 -10 lines changed

tools/pytorchjob-generator/chart/README.md (+1 -1)

@@ -59,7 +59,7 @@ customize the Jobs generated by the tool.
 | initContainers | array | `nil` | List of "(name, image, command[])" specifying an init containers to be run before the main job. The 'command' field is a list of commands to run in the container, see the Kubernetes entry on initContainers for reference. |
 | autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. |
 | hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). |
-| bypassCoscheduler | boolean | `false` | If true, use the default Kubernetes scheduler instead of the co-scheduler. ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set to true when explicitly directed to do so by a cluster admin!*** |
+| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** |
 | serviceAccountName | string | the default service account for the namespace will be used. | Service account to be used for running the Job |

 ### Fault Tolerance
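For users of the generator, the new knob is just another entry in a settings file. A minimal sketch (the file name is illustrative; `jobName`, `queueName`, and `schedulerName` are the fields used in the chart's own examples):

```yaml
# my-job.settings.yaml -- hypothetical settings file for the generator chart
jobName: my-job              # name of the generated AppWrapper and PyTorchJob objects
queueName: default-queue     # local queue to submit to
# Only set schedulerName when a cluster admin directs you to;
# leaving it unset keeps the cluster's default (the co-scheduler).
schedulerName: sakkara
```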

tools/pytorchjob-generator/chart/templates/_helpers.tpl (+1 -1)

@@ -34,7 +34,7 @@ metadata:
 {{- if ne .Values.terminationGracePeriodSeconds nil }}
 terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
 {{- end }}
-{{- if .Values.bypassCoscheduler }}
+{{- if .Values.schedulerName }}
 schedulerName: {{ .Values.schedulerName }}
 {{- end }}
 priorityClassName: {{ .Values.priority }}
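The guard's effect is easiest to see in the rendered pod spec. A sketch of both cases, trimmed to the relevant fields (`default-priority` is the value appearing in this commit's snapshots):

```yaml
# With schedulerName: sakkara set in the values, the pod spec gains the field:
spec:
  schedulerName: sakkara
  priorityClassName: default-priority

# With schedulerName left nil, the if-guard suppresses the line entirely,
# so pods fall through to whatever scheduler the cluster defaults to.
```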

tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap (+150)

@@ -1362,3 +1362,153 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
                     - emptyDir:
                         medium: Memory
                       name: dshm
+scheduler can be set:
+  1: |
+    apiVersion: workload.codeflare.dev/v1beta2
+    kind: AppWrapper
+    metadata:
+      annotations:
+        workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.6
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+      name: my-job
+      namespace: my-namespace
+    spec:
+      components:
+      - template:
+          apiVersion: kubeflow.org/v1
+          kind: PyTorchJob
+          metadata:
+            name: my-job
+          spec:
+            pytorchReplicaSpecs:
+              Master:
+                replicas: 1
+                restartPolicy: Never
+                template:
+                  spec:
+                    affinity:
+                      nodeAffinity:
+                        requiredDuringSchedulingIgnoredDuringExecution:
+                          nodeSelectorTerms:
+                          - matchExpressions:
+                            - key: autopilot.ibm.com/gpuhealth
+                              operator: NotIn
+                              values:
+                              - ERR
+                              - TESTING
+                              - EVICT
+                    containers:
+                    - command:
+                      - sh
+                      - -c
+                      - |
+                        echo "Environment variables set by the kubeflow training operator:"
+                        echo ${MASTER_ADDR}:${MASTER_PORT}
+                        echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
+                        echo My global rank is ${RANK} / ${WORLD_SIZE}
+                        echo "Other injected environment variables:"
+                        echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
+                        #
+                        # User commands
+                        #
+                        echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+                        export RANK=$SAKKARA_RANK
+                        git clone https://github.com/dbarnett/python-helloworld
+                        cd python-helloworld
+                        echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                        torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                      env:
+                      - name: SAKKARA_RANK
+                        valueFrom:
+                          fieldRef:
+                            fieldPath: metadata.labels['sakkara.member.rank']
+                      image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+                      imagePullPolicy: IfNotPresent
+                      name: pytorch
+                      resources:
+                        limits:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                        requests:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                      volumeMounts:
+                      - mountPath: /dev/shm
+                        name: dshm
+                    imagePullSecrets: []
+                    priorityClassName: default-priority
+                    schedulerName: sakkara
+                    volumes:
+                    - emptyDir:
+                        medium: Memory
+                      name: dshm
+              Worker:
+                replicas: 3
+                restartPolicy: Never
+                template:
+                  spec:
+                    affinity:
+                      nodeAffinity:
+                        requiredDuringSchedulingIgnoredDuringExecution:
+                          nodeSelectorTerms:
+                          - matchExpressions:
+                            - key: autopilot.ibm.com/gpuhealth
+                              operator: NotIn
+                              values:
+                              - ERR
+                              - TESTING
+                              - EVICT
+                    containers:
+                    - command:
+                      - sh
+                      - -c
+                      - |
+                        echo "Environment variables set by the kubeflow training operator:"
+                        echo ${MASTER_ADDR}:${MASTER_PORT}
+                        echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
+                        echo My global rank is ${RANK} / ${WORLD_SIZE}
+                        echo "Other injected environment variables:"
+                        echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
+                        #
+                        # User commands
+                        #
+                        echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+                        export RANK=$SAKKARA_RANK
+                        git clone https://github.com/dbarnett/python-helloworld
+                        cd python-helloworld
+                        echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                        torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                      env:
+                      - name: SAKKARA_RANK
+                        valueFrom:
+                          fieldRef:
+                            fieldPath: metadata.labels['sakkara.member.rank']
+                      image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+                      imagePullPolicy: IfNotPresent
+                      name: pytorch
+                      resources:
+                        limits:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                        requests:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                      volumeMounts:
+                      - mountPath: /dev/shm
+                        name: dshm
+                    imagePullSecrets: []
+                    priorityClassName: default-priority
+                    schedulerName: sakkara
+                    volumes:
+                    - emptyDir:
+                        medium: Memory
+                      name: dshm
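The 150 added lines are the full hello-world rendering; what the new test actually exercises reduces to `schedulerName: sakkara` appearing in both replica pod specs. Trimmed to just those fields (an illustrative excerpt, not part of the snapshot file):

```yaml
spec:
  pytorchReplicaSpecs:
    Master:
      template:
        spec:
          schedulerName: sakkara   # was gated on bypassCoscheduler before this commit
    Worker:
      template:
        spec:
          schedulerName: sakkara
```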

tools/pytorchjob-generator/chart/tests/helloworld_test.yaml (+7)

@@ -86,6 +86,13 @@ tests:
       path: metadata.namespace
       value: testing-ns
 
+- it: scheduler can be set
+  set:
+    schedulerName: sakkara
+  asserts:
+    - matchSnapshot:
+        path: spec.components[0].template
+
 - it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts
   set:
     sshGitCloneConfig.secretName: my-git-secret
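A natural companion test (not included in this commit) would pin down the nil case as well, asserting that the field stays absent by default. A sketch, assuming helm-unittest's `notExists` assertion and the same component path:

```yaml
# Hypothetical follow-up test: no schedulerName is emitted unless the value is set.
- it: scheduler is omitted when schedulerName is nil
  asserts:
    - notExists:
        path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.schedulerName
```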

tools/pytorchjob-generator/chart/values.schema.json (+1 -2)

@@ -114,10 +114,9 @@
         { "type": "null" },
         { "type": "array" }
     ]},
-    "bypassCoscheduler": { "type": "boolean" },
     "schedulerName": { "oneOf": [
         { "type": "null" },
-        { "type": "string", "enum": ["sakkara", "default-scheduler" ] }
+        { "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] }
     ]},
     "serviceAccountName": { "oneOf" : [
         { "type": "null" },

tools/pytorchjob-generator/chart/values.yaml (+4 -4)

@@ -211,11 +211,11 @@ hostIgnoreList:
 # - a100-large-drlfv-worker-3-with-secondary-nw5qh
 # - a100-large-drlfv-worker-3-with-secondary-lb7ch
 
-# -- (boolean) If true, use the default Kubernetes scheduler instead of the co-scheduler.
-# ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set
-# to true when explicitly directed to do so by a cluster admin!***
+# -- (string) If non-nil, use the specified Kubernetes scheduler.
+# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this
+# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!***
 # @section -- Advanced Options
-bypassCoscheduler: false
+schedulerName:
 
 # -- (string) Service account to be used for running the Job
 # @section -- Advanced Options

tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml (-2)

@@ -1,8 +1,6 @@
-namespace: my-namespace # namespace to deploy to (required)
 jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
 queueName: default-queue # local queue to submit to (default: default-queue)
 
-bypassCoscheduler: true
 schedulerName: sakkara
 # If additional constraints are used, specify the configmap here:
 #customLabels:
