From 13eb7b21714002bd42f8c6a57642d81651f3ed44 Mon Sep 17 00:00:00 2001 From: Chia-Wei <56123090+cchen777@users.noreply.github.com> Date: Fri, 6 Sep 2024 00:35:49 +0800 Subject: [PATCH] [Feature] Display reconcile failures as events (ServiceAccount) (#2290) * [Feature] Display reconcile failures as events on ray clusters * reproduce * clean up * add more eventf for sa specific * add eventf for failed to createHeadPod, this can happen when service account not available in namespace * add worker pod create fail event * reuse actionable msg * add AutoscalerServiceAccountNotFound new K8sEventType --- ray-operator/controllers/ray/raycluster_controller.go | 11 +++++++---- ray-operator/controllers/ray/utils/constant.go | 5 +++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index c6df3b6262..6138befdc9 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -1375,10 +1375,13 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(ctx context.Con // zero-downtime rolling updates when RayService is performed. See https://github.com/ray-project/kuberay/issues/1123 // for more details. if instance.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName == namespacedName.Name { - logger.Error(err, fmt.Sprintf( - "If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+ - "However, ServiceAccount %s is not found. Please create one. "+ - "See the PR description of https://github.com/ray-project/kuberay/pull/1128 for more details.", namespacedName.Name), "ServiceAccount", namespacedName) + actionableMessage := fmt.Sprintf("If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+ + "However, ServiceAccount %s is not found. Please create one. 
See the PR description of https://github.com/ray-project/kuberay/pull/1128 for more details.", namespacedName.Name) + + logger.Error( + err, + actionableMessage) + r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.AutoscalerServiceAccountNotFound), "Failed to reconcile RayCluster %s/%s. %s", instance.Namespace, instance.Name, actionableMessage) return err } diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index b0375968e4..a6fd2806c2 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -256,8 +256,9 @@ const ( FailedToCreateService K8sEventType = "FailedToCreateService" // ServiceAccount event list - CreatedServiceAccount K8sEventType = "CreatedServiceAccount" - FailedToCreateServiceAccount K8sEventType = "FailedToCreateServiceAccount" + CreatedServiceAccount K8sEventType = "CreatedServiceAccount" + FailedToCreateServiceAccount K8sEventType = "FailedToCreateServiceAccount" + AutoscalerServiceAccountNotFound K8sEventType = "AutoscalerServiceAccountNotFound" // Role event list CreatedRole K8sEventType = "CreatedRole"