From 3bee975d22a04adbf712d490d1e618ddbe31056f Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Sat, 14 Aug 2021 14:31:59 -0700 Subject: [PATCH] fix incorrect torch env population --- pkg/controller.v1/pytorch/pytorch.go | 4 ++-- pkg/controller.v1/pytorch/pytorchjob_controller.go | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/controller.v1/pytorch/pytorch.go b/pkg/controller.v1/pytorch/pytorch.go index 1826fd9674..fff2716620 100644 --- a/pkg/controller.v1/pytorch/pytorch.go +++ b/pkg/controller.v1/pytorch/pytorch.go @@ -13,7 +13,7 @@ import ( func SetPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype, index string) error { pytorchjob, ok := obj.(*pytorchv1.PyTorchJob) if !ok { - return fmt.Errorf("%+v is not a type of XGBoostJob", obj) + return fmt.Errorf("%+v is not a type of PyTorchJob", obj) } rank, err := strconv.Atoi(index) @@ -29,7 +29,7 @@ func SetPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype, } masterAddr := genGeneralName(pytorchjob.Name, strings.ToLower(string(pytorchv1.PyTorchReplicaTypeMaster)), strconv.Itoa(0)) - if rtype == string(pytorchv1.PyTorchReplicaTypeMaster) { + if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeMaster)) { if rank != 0 { return fmt.Errorf("invalid config: There should be only a single master with index=0") } diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index bd952fde55..04b3d7a4d0 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -253,7 +253,7 @@ func (r *PyTorchJobReconciler) GetJobFromAPIClient(namespace, name string) (meta err = clientReader.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job) if err != nil { if errors.IsNotFound(err) { - logrus.Error(err, "xgboost job not found", "namespace", namespace, "name", name) + logrus.Error(err, "pytorch job not found", "namespace", namespace, "name", name) } else { logrus.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name) } @@ -327,12 +327,12 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[com running := status.Active failed := status.Failed - logrus.Infof("XGBoostJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d , failed=%d", + logrus.Infof("PyTorchJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d , failed=%d", pytorchjob.Name, rtype, expected, running, succeeded, failed) if rtype == commonv1.ReplicaType(pytorchv1.PyTorchReplicaTypeMaster) { if running > 0 { - msg := fmt.Sprintf("XGBoostJob %s is running.", pytorchjob.Name) + msg := fmt.Sprintf("PyTorchJob %s is running.", pytorchjob.Name) err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg) if err != nil { commonutil.LoggerForJob(pytorchjob).Infof("Append job condition error: %v", err) @@ -341,7 +341,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[com } // when master is succeed, the job is finished. if expected == 0 { - msg := fmt.Sprintf("XGBoostJob %s is successfully completed.", pytorchjob.Name) + msg := fmt.Sprintf("PyTorchJob %s is successfully completed.", pytorchjob.Name) logrus.Info(msg) r.Recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.JobSucceededReason, msg) if jobStatus.CompletionTime == nil { @@ -358,7 +358,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[com } if failed > 0 { if spec.RestartPolicy == commonv1.RestartPolicyExitCode { - msg := fmt.Sprintf("XGBoostJob %s is restarting because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype) + msg := fmt.Sprintf("PyTorchJob %s is restarting because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype) r.Recorder.Event(pytorchjob, corev1.EventTypeWarning, commonutil.JobRestartingReason, msg) err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, commonutil.JobRestartingReason, msg) if err != nil { @@ -366,7 +366,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[com return err } } else { - msg := fmt.Sprintf("XGBoostJob %s is failed because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype) + msg := fmt.Sprintf("PyTorchJob %s is failed because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype) r.Recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.JobFailedReason, msg) if pytorchjob.Status.CompletionTime == nil { now := metav1.Now() @@ -386,7 +386,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[com commonutil.LoggerForJob(pytorchjob).Infof(msg) if err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg); err != nil { - commonutil.LoggerForJob(pytorchjob).Error(err, "failed to update XGBoost Job conditions") + commonutil.LoggerForJob(pytorchjob).Error(err, "failed to update PyTorch Job conditions") return err }