
Commit a68ca2e

[RayCluster][Feature] Skip suspending worker groups when the in-tree autoscaler is enabled, to prevent the RayCluster from malfunctioning
Signed-off-by: Rueian <[email protected]>
1 parent 42f299a commit a68ca2e

File tree

2 files changed: +57 −10 lines changed


ray-operator/controllers/ray/raycluster_controller.go

+16 −10

@@ -772,16 +772,24 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
       return err
     }
 
-    // Delete all workers if worker group is suspended and skip reconcile
+    // Delete all workers if worker group is suspended and skip reconcile if enableInTreeAutoscaling is not enabled.
+    enableInTreeAutoscaling := (instance.Spec.EnableInTreeAutoscaling != nil) && (*instance.Spec.EnableInTreeAutoscaling)
     if worker.Suspend != nil && *worker.Suspend {
-      if _, err := r.deleteAllPods(ctx, common.RayClusterGroupPodsAssociationOptions(instance, worker.GroupName)); err != nil {
-        r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.FailedToDeleteWorkerPodCollection),
-          "Failed deleting worker Pods for suspended group %s in RayCluster %s/%s, %v", worker.GroupName, instance.Namespace, instance.Name, err)
-        return errstd.Join(utils.ErrFailedDeleteWorkerPod, err)
+      if enableInTreeAutoscaling {
+        // TODO: This can be supported in future Ray. We should check the RayVersion on the CR once we know the future version.
+        r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.InvalidRayClusterStatus),
+          "Suspending the worker group %s is not supported in RayCluster %s/%s because its Autoscaler is enabled", worker.GroupName, instance.Namespace, instance.Name)
+        continue
+      } else {
+        if _, err := r.deleteAllPods(ctx, common.RayClusterGroupPodsAssociationOptions(instance, worker.GroupName)); err != nil {
+          r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.FailedToDeleteWorkerPodCollection),
+            "Failed deleting worker Pods for suspended group %s in RayCluster %s/%s, %v", worker.GroupName, instance.Namespace, instance.Name, err)
+          return errstd.Join(utils.ErrFailedDeleteWorkerPod, err)
+        }
+        r.Recorder.Eventf(instance, corev1.EventTypeNormal, string(utils.DeletedWorkerPod),
+          "Deleted all pods for suspended worker group %s in RayCluster %s/%s", worker.GroupName, instance.Namespace, instance.Name)
+        continue
       }
-      r.Recorder.Eventf(instance, corev1.EventTypeNormal, string(utils.DeletedWorkerPod),
-        "Deleted all pods for suspended worker group %s in RayCluster %s/%s", worker.GroupName, instance.Namespace, instance.Name)
-      continue
     }
 
     // Delete unhealthy worker Pods.

@@ -869,8 +877,6 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
     // diff < 0 indicates the need to delete some Pods to match the desired number of replicas. However,
     // randomly deleting Pods is certainly not ideal. So, if autoscaling is enabled for the cluster, we
     // will disable random Pod deletion, making Autoscaler the sole decision-maker for Pod deletions.
-    enableInTreeAutoscaling := (instance.Spec.EnableInTreeAutoscaling != nil) && (*instance.Spec.EnableInTreeAutoscaling)
-
     // TODO (kevin85421): `enableRandomPodDelete` is a feature flag for KubeRay v0.6.0. If users want to use
     // the old behavior, they can set the environment variable `ENABLE_RANDOM_POD_DELETE` to `true`. When the
     // default behavior is stable enough, we can remove this feature flag.
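The hunk above only changes the guard around suspended worker groups; the surrounding reconcile logic is untouched. Below is a minimal, standalone sketch of that guard written against the rayv1 API types. It is an illustration only, not the operator's reconcile path (which lists and deletes the group's Pods through the Kubernetes client); the cluster object and group name are made up for the example.

package main

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	"k8s.io/utils/ptr"
)

func main() {
	// Hypothetical RayCluster: autoscaler enabled, first worker group suspended.
	cluster := rayv1.RayCluster{
		Spec: rayv1.RayClusterSpec{
			EnableInTreeAutoscaling: ptr.To(true),
			WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
				{GroupName: "small-group", Suspend: ptr.To(true)},
			},
		},
	}

	// Same nil-safe check the controller uses before acting on Suspend.
	enableInTreeAutoscaling := cluster.Spec.EnableInTreeAutoscaling != nil && *cluster.Spec.EnableInTreeAutoscaling

	for _, worker := range cluster.Spec.WorkerGroupSpecs {
		if worker.Suspend == nil || !*worker.Suspend {
			continue
		}
		if enableInTreeAutoscaling {
			// The controller now records a Warning event and leaves the group's Pods alone.
			fmt.Printf("group %q: suspend ignored because the in-tree autoscaler is enabled\n", worker.GroupName)
			continue
		}
		// Without the autoscaler, the controller deletes all Pods of the suspended group.
		fmt.Printf("group %q: worker Pods would be deleted\n", worker.GroupName)
	}
}

In short, with the autoscaler enabled the controller emits a Warning event and skips the group instead of tearing its Pods down.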

ray-operator/controllers/ray/raycluster_controller_test.go

+41 −0

@@ -861,6 +861,47 @@ var _ = Context("Inside the default namespace", func() {
     })
   })
 
+  Describe("Suspend RayCluster worker group with Autoscaler enabled", Ordered, func() {
+    ctx := context.Background()
+    namespace := "default"
+    rayCluster := rayClusterTemplate("raycluster-suspend-workergroup-autoscaler", namespace)
+    rayCluster.Spec.EnableInTreeAutoscaling = ptr.To(true)
+    allPods := corev1.PodList{}
+    allFilters := common.RayClusterAllPodsAssociationOptions(rayCluster).ToListOptions()
+    workerFilters := common.RayClusterGroupPodsAssociationOptions(rayCluster, rayCluster.Spec.WorkerGroupSpecs[0].GroupName).ToListOptions()
+    headFilters := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions()
+
+    It("Create a RayCluster custom resource", func() {
+      err := k8sClient.Create(ctx, rayCluster)
+      Expect(err).NotTo(HaveOccurred(), "Failed to create RayCluster")
+      Eventually(getResourceFunc(ctx, client.ObjectKey{Name: rayCluster.Name, Namespace: namespace}, rayCluster),
+        time.Second*3, time.Millisecond*500).Should(BeNil(), "Should be able to see RayCluster: %v", rayCluster.Name)
+    })
+
+    It("Check the number of Pods and add finalizers", func() {
+      Eventually(listResourceFunc(ctx, &allPods, allFilters...), time.Second*3, time.Millisecond*500).
+        Should(Equal(4), fmt.Sprintf("all pods %v", allPods.Items))
+    })
+
+    It("Setting suspend=true in first worker group should not fail", func() {
+      // suspend the Raycluster worker group
+      err := updateRayClusterWorkerGroupSuspendField(ctx, rayCluster, true)
+      Expect(err).NotTo(HaveOccurred(), "Failed to update RayCluster")
+    })
+
+    It("Worker pods should be not deleted and head pod should still be running", func() {
+      Consistently(listResourceFunc(ctx, &allPods, workerFilters...), time.Second*5, time.Millisecond*500).
+        Should(Equal(3), fmt.Sprintf("all pods %v", allPods.Items))
+      Consistently(listResourceFunc(ctx, &allPods, headFilters...), time.Second*5, time.Millisecond*500).
+        Should(Equal(1), fmt.Sprintf("all pods %v", allPods.Items))
+    })
+
+    It("Delete the cluster", func() {
+      err := k8sClient.Delete(ctx, rayCluster)
+      Expect(err).NotTo(HaveOccurred())
+    })
+  })
+
   Describe("RayCluster with a multi-host worker group", Ordered, func() {
     ctx := context.Background()
     namespace := "default"
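The new spec drives the controller branch through a suite helper, updateRayClusterWorkerGroupSuspendField, whose implementation is outside this diff. The following is only a hedged sketch of the kind of update such a helper performs; the function name, the explicit client parameter, and the retry-on-conflict pattern are illustrative assumptions, not this repository's code.

package raytest

import (
	"context"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	"k8s.io/client-go/util/retry"
	"k8s.io/utils/ptr"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// updateWorkerGroupSuspend is a hypothetical helper: it flips the Suspend field
// of the first worker group, assuming the cluster has at least one group.
func updateWorkerGroupSuspend(ctx context.Context, c client.Client, rayCluster *rayv1.RayCluster, suspend bool) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		// Re-fetch before updating so a stale resourceVersion is refreshed on each attempt.
		key := client.ObjectKey{Name: rayCluster.Name, Namespace: rayCluster.Namespace}
		if err := c.Get(ctx, key, rayCluster); err != nil {
			return err
		}
		rayCluster.Spec.WorkerGroupSpecs[0].Suspend = ptr.To(suspend)
		return c.Update(ctx, rayCluster)
	})
}

Retrying on conflict matters in this kind of test because the RayCluster controller may bump the object's resourceVersion between the test's Get and Update.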
