Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit d75e9ff

Browse files
committed Nov 20, 2024
generate explicit autopilot anti-affinities
Works around project-codeflare/appwrapper#259 as a stopgap until RHOAI 2.16 is released.
1 parent b9010d8 commit d75e9ff

File tree

2 files changed

+183
-1
lines changed

2 files changed

+183
-1
lines changed
 

‎tools/pytorchjob-generator/chart/templates/_helpers.tpl

+7 -1
Original file line number | Diff line number | Diff line change
@@ -36,12 +36,18 @@ terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
3636
schedulerName: default-scheduler
3737
{{- end }}
3838
priorityClassName: {{ .Values.priority }}
39-
{{- if .Values.hostIgnoreList }}
4039
affinity:
4140
nodeAffinity:
4241
requiredDuringSchedulingIgnoredDuringExecution:
4342
nodeSelectorTerms:
4443
- matchExpressions:
44+
- key: autopilot.ibm.com/gpuhealth
45+
operator: NotIn
46+
values:
47+
- ERR
48+
- TESTING
49+
- EVICT
50+
{{- if .Values.hostIgnoreList }}
4551
- key: kubernetes.io/hostname
4652
operator: NotIn
4753
values:

‎tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap

+176
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,17 @@ Adding Volume Mounts:
2626
metadata:
2727
namespace: my-namespace
2828
spec:
29+
affinity:
30+
nodeAffinity:
31+
requiredDuringSchedulingIgnoredDuringExecution:
32+
nodeSelectorTerms:
33+
- matchExpressions:
34+
- key: autopilot.ibm.com/gpuhealth
35+
operator: NotIn
36+
values:
37+
- ERR
38+
- TESTING
39+
- EVICT
2940
containers:
3041
- command:
3142
- sh
@@ -85,6 +96,17 @@ Adding Volume Mounts:
8596
metadata:
8697
namespace: my-namespace
8798
spec:
99+
affinity:
100+
nodeAffinity:
101+
requiredDuringSchedulingIgnoredDuringExecution:
102+
nodeSelectorTerms:
103+
- matchExpressions:
104+
- key: autopilot.ibm.com/gpuhealth
105+
operator: NotIn
106+
values:
107+
- ERR
108+
- TESTING
109+
- EVICT
88110
containers:
89111
- command:
90112
- sh
@@ -165,6 +187,17 @@ Adding initContainers:
165187
metadata:
166188
namespace: my-namespace
167189
spec:
190+
affinity:
191+
nodeAffinity:
192+
requiredDuringSchedulingIgnoredDuringExecution:
193+
nodeSelectorTerms:
194+
- matchExpressions:
195+
- key: autopilot.ibm.com/gpuhealth
196+
operator: NotIn
197+
values:
198+
- ERR
199+
- TESTING
200+
- EVICT
168201
containers:
169202
- command:
170203
- sh
@@ -227,6 +260,17 @@ Adding initContainers:
227260
metadata:
228261
namespace: my-namespace
229262
spec:
263+
affinity:
264+
nodeAffinity:
265+
requiredDuringSchedulingIgnoredDuringExecution:
266+
nodeSelectorTerms:
267+
- matchExpressions:
268+
- key: autopilot.ibm.com/gpuhealth
269+
operator: NotIn
270+
values:
271+
- ERR
272+
- TESTING
273+
- EVICT
230274
containers:
231275
- command:
232276
- sh
@@ -310,6 +354,17 @@ AppWrapper metadata should match snapshot:
310354
metadata:
311355
namespace: my-namespace
312356
spec:
357+
affinity:
358+
nodeAffinity:
359+
requiredDuringSchedulingIgnoredDuringExecution:
360+
nodeSelectorTerms:
361+
- matchExpressions:
362+
- key: autopilot.ibm.com/gpuhealth
363+
operator: NotIn
364+
values:
365+
- ERR
366+
- TESTING
367+
- EVICT
313368
containers:
314369
- command:
315370
- sh
@@ -359,6 +414,17 @@ AppWrapper metadata should match snapshot:
359414
metadata:
360415
namespace: my-namespace
361416
spec:
417+
affinity:
418+
nodeAffinity:
419+
requiredDuringSchedulingIgnoredDuringExecution:
420+
nodeSelectorTerms:
421+
- matchExpressions:
422+
- key: autopilot.ibm.com/gpuhealth
423+
operator: NotIn
424+
values:
425+
- ERR
426+
- TESTING
427+
- EVICT
362428
containers:
363429
- command:
364430
- sh
@@ -429,6 +495,17 @@ AppWrapper spec should match snapshot:
429495
metadata:
430496
namespace: my-namespace
431497
spec:
498+
affinity:
499+
nodeAffinity:
500+
requiredDuringSchedulingIgnoredDuringExecution:
501+
nodeSelectorTerms:
502+
- matchExpressions:
503+
- key: autopilot.ibm.com/gpuhealth
504+
operator: NotIn
505+
values:
506+
- ERR
507+
- TESTING
508+
- EVICT
432509
containers:
433510
- command:
434511
- sh
@@ -478,6 +555,17 @@ AppWrapper spec should match snapshot:
478555
metadata:
479556
namespace: my-namespace
480557
spec:
558+
affinity:
559+
nodeAffinity:
560+
requiredDuringSchedulingIgnoredDuringExecution:
561+
nodeSelectorTerms:
562+
- matchExpressions:
563+
- key: autopilot.ibm.com/gpuhealth
564+
operator: NotIn
565+
values:
566+
- ERR
567+
- TESTING
568+
- EVICT
481569
containers:
482570
- command:
483571
- sh
@@ -548,6 +636,17 @@ Enabling NVMe:
548636
metadata:
549637
namespace: my-namespace
550638
spec:
639+
affinity:
640+
nodeAffinity:
641+
requiredDuringSchedulingIgnoredDuringExecution:
642+
nodeSelectorTerms:
643+
- matchExpressions:
644+
- key: autopilot.ibm.com/gpuhealth
645+
operator: NotIn
646+
values:
647+
- ERR
648+
- TESTING
649+
- EVICT
551650
containers:
552651
- command:
553652
- sh
@@ -612,6 +711,17 @@ Enabling NVMe:
612711
metadata:
613712
namespace: my-namespace
614713
spec:
714+
affinity:
715+
nodeAffinity:
716+
requiredDuringSchedulingIgnoredDuringExecution:
717+
nodeSelectorTerms:
718+
- matchExpressions:
719+
- key: autopilot.ibm.com/gpuhealth
720+
operator: NotIn
721+
values:
722+
- ERR
723+
- TESTING
724+
- EVICT
615725
containers:
616726
- command:
617727
- sh
@@ -699,6 +809,17 @@ Enabling RoCE GDR:
699809
k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3
700810
namespace: my-namespace
701811
spec:
812+
affinity:
813+
nodeAffinity:
814+
requiredDuringSchedulingIgnoredDuringExecution:
815+
nodeSelectorTerms:
816+
- matchExpressions:
817+
- key: autopilot.ibm.com/gpuhealth
818+
operator: NotIn
819+
values:
820+
- ERR
821+
- TESTING
822+
- EVICT
702823
containers:
703824
- command:
704825
- sh
@@ -764,6 +885,17 @@ Enabling RoCE GDR:
764885
k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3
765886
namespace: my-namespace
766887
spec:
888+
affinity:
889+
nodeAffinity:
890+
requiredDuringSchedulingIgnoredDuringExecution:
891+
nodeSelectorTerms:
892+
- matchExpressions:
893+
- key: autopilot.ibm.com/gpuhealth
894+
operator: NotIn
895+
values:
896+
- ERR
897+
- TESTING
898+
- EVICT
767899
containers:
768900
- command:
769901
- sh
@@ -850,6 +982,17 @@ Enabling all advanced features at once:
850982
k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3
851983
namespace: my-namespace
852984
spec:
985+
affinity:
986+
nodeAffinity:
987+
requiredDuringSchedulingIgnoredDuringExecution:
988+
nodeSelectorTerms:
989+
- matchExpressions:
990+
- key: autopilot.ibm.com/gpuhealth
991+
operator: NotIn
992+
values:
993+
- ERR
994+
- TESTING
995+
- EVICT
853996
containers:
854997
- command:
855998
- sh
@@ -967,6 +1110,17 @@ Enabling all advanced features at once:
9671110
k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3
9681111
namespace: my-namespace
9691112
spec:
1113+
affinity:
1114+
nodeAffinity:
1115+
requiredDuringSchedulingIgnoredDuringExecution:
1116+
nodeSelectorTerms:
1117+
- matchExpressions:
1118+
- key: autopilot.ibm.com/gpuhealth
1119+
operator: NotIn
1120+
values:
1121+
- ERR
1122+
- TESTING
1123+
- EVICT
9701124
containers:
9711125
- command:
9721126
- sh
@@ -1103,6 +1257,17 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
11031257
metadata:
11041258
namespace: my-namespace
11051259
spec:
1260+
affinity:
1261+
nodeAffinity:
1262+
requiredDuringSchedulingIgnoredDuringExecution:
1263+
nodeSelectorTerms:
1264+
- matchExpressions:
1265+
- key: autopilot.ibm.com/gpuhealth
1266+
operator: NotIn
1267+
values:
1268+
- ERR
1269+
- TESTING
1270+
- EVICT
11061271
containers:
11071272
- command:
11081273
- sh
@@ -1166,6 +1331,17 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
11661331
metadata:
11671332
namespace: my-namespace
11681333
spec:
1334+
affinity:
1335+
nodeAffinity:
1336+
requiredDuringSchedulingIgnoredDuringExecution:
1337+
nodeSelectorTerms:
1338+
- matchExpressions:
1339+
- key: autopilot.ibm.com/gpuhealth
1340+
operator: NotIn
1341+
values:
1342+
- ERR
1343+
- TESTING
1344+
- EVICT
11691345
containers:
11701346
- command:
11711347
- sh

0 commit comments

Comments
 (0)
Please sign in to comment.