Skip to content

Commit ad361c0

Browse files
authored
pytorchjob-generator: add successTTLDuration (#129)
1 parent 86c0e24 commit ad361c0

File tree

5 files changed

+19
-4
lines changed

5 files changed

+19
-4
lines changed

tools/pytorchjob-generator/chart/README.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ customize the Jobs generated by the tool.
7171
| failureGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the failureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ |
7272
| retryPausePeriodDuration | string | The AppWrapper defaults will be used | Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ |
7373
| retryLimit | integer | The AppWrapper defaults will be used | Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ |
74-
| forcefulDeletionGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the forcefulDelectionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ |
74+
| forcefulDeletionGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the forcefulDeletionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ |
7575
| deletionOnFailureGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ |
76-
| restartPolicy | string | `"Never"` | Set Kubernertes policy for restarting failed containers "in place" (without restarting the Pod). |
76+
| successTTLDuration | string | The AppWrapper defaults will be used | Customize the successTTL; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ |
77+
| restartPolicy | string | `"Never"` | Set Kubernetes policy for restarting failed containers "in place" (without restarting the Pod). |
7778
| terminationGracePeriodSeconds | integer | Kubernetes's default value is used | Set a non-default pod termination grace period (in seconds). |

tools/pytorchjob-generator/chart/templates/appwrapper.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ metadata:
7878
{{- if .Values.deletionOnFailureGracePeriodDuration }}
7979
workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "{{ .Values.deletionOnFailureGracePeriodDuration }}"
8080
{{- end }}
81+
{{- if .Values.successTTLDuration }}
82+
workload.codeflare.dev.appwrapper/successTTLDuration: "{{ .Values.successTTLDuration }}"
83+
{{- end }}
8184
{{- if or .Values.queueName .Values.customLabels }}
8285
labels:
8386
{{- if .Values.queueName }}

tools/pytorchjob-generator/chart/tests/helloworld_test.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ tests:
186186
retryLimit: 42
187187
forcefulDeletionGracePeriodDuration: "19s"
188188
deletionOnFailureGracePeriodDuration: "2s"
189+
successTTLDuration: "600s"
189190
asserts:
190191
- isSubset:
191192
path: metadata.annotations
@@ -197,6 +198,7 @@ tests:
197198
workload.codeflare.dev.appwrapper/retryLimit: "42"
198199
workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: "19s"
199200
workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "2s"
201+
workload.codeflare.dev.appwrapper/successTTLDuration: "600s"
200202

201203
- it: Setting integer fault tolerance annotation to 0
202204
set:

tools/pytorchjob-generator/chart/values.schema.json

+4
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,10 @@
153153
"deletionOnFailureGracePeriodDuration" : { "oneOf" : [
154154
{ "type": "null" },
155155
{ "$ref": "#/$defs/duration" }
156+
]},
157+
"successTTLDuration" : { "oneOf" : [
158+
{ "type": "null" },
159+
{ "$ref": "#/$defs/duration" }
156160
]}
157161
},
158162

tools/pytorchjob-generator/chart/values.yaml

+7-2
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ retryPausePeriodDuration:
255255
# @default -- The AppWrapper defaults will be used
256256
retryLimit:
257257

258-
# -- (string) Customize the forcefulDelectionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/
258+
# -- (string) Customize the forcefulDeletionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/
259259
# @section -- Fault Tolerance
260260
# @default -- The AppWrapper defaults will be used
261261
forcefulDeletionGracePeriodDuration:
@@ -265,7 +265,12 @@ forcefulDeletionGracePeriodDuration:
265265
# @default -- The AppWrapper defaults will be used
266266
deletionOnFailureGracePeriodDuration:
267267

268-
# -- (string) Set Kubernertes policy for restarting failed containers "in place" (without restarting the Pod).
268+
# -- (string) Customize the successTTL; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/
269+
# @section -- Fault Tolerance
270+
# @default -- The AppWrapper defaults will be used
271+
successTTLDuration:
272+
273+
# -- (string) Set Kubernetes policy for restarting failed containers "in place" (without restarting the Pod).
269274
# @section -- Fault Tolerance
270275
restartPolicy: "Never"
271276

0 commit comments

Comments
 (0)