@@ -320,6 +320,8 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
320320 f"--resource_manager_address=localhost:{ _PATHWAYS_RESOURCE_MANAGER_PORT } " ,
321321 f"--server_port={ _PATHWAYS_PROXY_PORT } " ,
322322 f"--gcs_scratch_location={ staging_location } " ,
323+ # TODO: Make the number of elastic slices configurable; it is currently tied to cfg.accelerator.num_replicas.
324+ f"--num_elastic_slices={ cfg .accelerator .num_replicas } "
323325 ]
324326 cmd_args .extend (xla_flags_from_options (self ._xla_options ).split ())
325327
@@ -581,14 +583,19 @@ def _build_pathways_worker_job(
581583 annotations .update (
582584 {"alpha.jobset.sigs.k8s.io/exclusive-topology" : "cloud.google.com/gke-nodepool" }
583585 )
586+ # Default value for suspend and resume.
587+ # References:
588+ # https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
589+ # backoffLimit = system.vms_per_slice * 4
590+
591+ # NOTE: This backoffLimit is the default formula above scaled by 1000; it is only for verifying elastic fast-resume.
592+ large_number = 1000
593+ backoffLimit = system .vms_per_slice * 4 * large_number
584594
585595 spec = dict (
586596 parallelism = system .vms_per_slice ,
587597 completions = system .vms_per_slice ,
588- # Default value for suspend and resume.
589- # References:
590- # https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
591- backoffLimit = system .vms_per_slice * 4 ,
598+ backoffLimit = backoffLimit ,
592599 template = self ._build_pathways_worker_pod (pathways_worker_replicated_job_index ),
593600 )
594601 worker_job = dict (
0 commit comments