Skip to content

Commit 13eb7b2

Browse files
authored
[Feature] Display reconcile failures as events (ServiceAccount) (#2290)
* [Feature] Display reconcile failures as events on ray clusters
* reproduce
* clean up
* add more eventf for sa specific
* add eventf for failed to createHeadPod, this can happen when service account not available in namespace
* add worker pod create fail event
* reuse actionable msg
* add AutoscalerServiceAccountNotFound new K8sEventType
1 parent 5231dbf commit 13eb7b2

File tree

2 files changed

+10
-6
lines changed

2 files changed

+10
-6
lines changed

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,10 +1375,13 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(ctx context.Con
13751375
// zero-downtime rolling updates when RayService is performed. See https://github.com/ray-project/kuberay/issues/1123
13761376
// for more details.
13771377
if instance.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName == namespacedName.Name {
1378-
logger.Error(err, fmt.Sprintf(
1379-
"If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+
1380-
"However, ServiceAccount %s is not found. Please create one. "+
1381-
"See the PR description of https://github.com/ray-project/kuberay/pull/1128 for more details.", namespacedName.Name), "ServiceAccount", namespacedName)
1378+
actionableMessage := fmt.Sprintf("If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+
1379+
"However, ServiceAccount %s is not found. Please create one. See the PR description of https://github.com/ray-project/kuberay/pull/1128 for more details.", namespacedName.Name)
1380+
1381+
logger.Error(
1382+
err,
1383+
actionableMessage)
1384+
r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.AutoscalerServiceAccountNotFound), "Failed to reconcile RayCluster %s/%s. %s", instance.Namespace, instance.Name, actionableMessage)
13821385
return err
13831386
}
13841387

ray-operator/controllers/ray/utils/constant.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,9 @@ const (
256256
FailedToCreateService K8sEventType = "FailedToCreateService"
257257

258258
// ServiceAccount event list
259-
CreatedServiceAccount K8sEventType = "CreatedServiceAccount"
260-
FailedToCreateServiceAccount K8sEventType = "FailedToCreateServiceAccount"
259+
CreatedServiceAccount K8sEventType = "CreatedServiceAccount"
260+
FailedToCreateServiceAccount K8sEventType = "FailedToCreateServiceAccount"
261+
AutoscalerServiceAccountNotFound K8sEventType = "AutoscalerServiceAccountNotFound"
261262

262263
// Role event list
263264
CreatedRole K8sEventType = "CreatedRole"

0 commit comments

Comments
 (0)