Skip to content

Commit

Permalink
[Feature] Display reconcile failures as events (ServiceAccount) (#2290)
Browse files Browse the repository at this point in the history
* [Feature] Display reconcile failures as events on ray clusters

* reproduce

* clean up

* add more ServiceAccount-specific eventf calls

* add eventf for failure to createHeadPod; this can happen when the service account is not available in the namespace

* add worker pod create fail event

* reuse actionable msg

* add AutoscalerServiceAccountNotFound new K8sEventType
  • Loading branch information
cchen777 authored Sep 5, 2024
1 parent 5231dbf commit 13eb7b2
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 6 deletions.
11 changes: 7 additions & 4 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1375,10 +1375,13 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(ctx context.Con
// zero-downtime rolling updates when RayService is performed. See https://github.com/ray-project/kuberay/issues/1123
// for more details.
if instance.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName == namespacedName.Name {
logger.Error(err, fmt.Sprintf(
"If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+
"However, ServiceAccount %s is not found. Please create one. "+
"See the PR description of https://github.com/ray-project/kuberay/pull/1128 for more details.", namespacedName.Name), "ServiceAccount", namespacedName)
actionableMessage := fmt.Sprintf("If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+
"However, ServiceAccount %s is not found. Please create one. See the PR description of https://github.com/ray-project/kuberay/pull/1128 for more details.", namespacedName.Name)

logger.Error(
err,
actionableMessage)
r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.AutoscalerServiceAccountNotFound), "Failed to reconcile RayCluster %s/%s. %s", instance.Namespace, instance.Name, actionableMessage)
return err
}

Expand Down
5 changes: 3 additions & 2 deletions ray-operator/controllers/ray/utils/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,9 @@ const (
FailedToCreateService K8sEventType = "FailedToCreateService"

// ServiceAccount event list
CreatedServiceAccount K8sEventType = "CreatedServiceAccount"
FailedToCreateServiceAccount K8sEventType = "FailedToCreateServiceAccount"
CreatedServiceAccount K8sEventType = "CreatedServiceAccount"
FailedToCreateServiceAccount K8sEventType = "FailedToCreateServiceAccount"
AutoscalerServiceAccountNotFound K8sEventType = "AutoscalerServiceAccountNotFound"

// Role event list
CreatedRole K8sEventType = "CreatedRole"
Expand Down

0 comments on commit 13eb7b2

Please sign in to comment.