Skip to content

Commit 7430be3

Browse files
rueianwin5923
authored and committed
[RayCluster][Feature] skip suspending worker groups if the in-tree autoscaler is enabled (ray-project#2748)
1 parent 26b7dbe commit 7430be3

File tree

3 files changed

+112
-0
lines changed

3 files changed

+112
-0
lines changed

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,16 @@ func validateRayClusterSpec(instance *rayv1.RayCluster) error {
261261
// TODO (kevin85421): If GcsFaultToleranceOptions is set, users should use `GcsFaultToleranceOptions.RedisAddress` instead of `RAY_REDIS_ADDRESS`.
262262
// TODO (kevin85421): If GcsFaultToleranceOptions is set, users should use `GcsFaultToleranceOptions.ExternalStorageNamespace` instead of
263263
// the annotation `ray.io/external-storage-namespace`.
264+
265+
enableInTreeAutoscaling := (instance.Spec.EnableInTreeAutoscaling != nil) && (*instance.Spec.EnableInTreeAutoscaling)
266+
if enableInTreeAutoscaling {
267+
for _, workerGroup := range instance.Spec.WorkerGroupSpecs {
268+
if workerGroup.Suspend != nil && *workerGroup.Suspend {
269+
// TODO (rueian): This can be supported in future Ray. We should check the RayVersion once we know the version.
270+
return fmt.Errorf("suspending worker groups is not currently supported with Autoscaler enabled")
271+
}
272+
}
273+
}
264274
return nil
265275
}
266276

ray-operator/controllers/ray/raycluster_controller_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -861,6 +861,47 @@ var _ = Context("Inside the default namespace", func() {
861861
})
862862
})
863863

864+
Describe("Suspend RayCluster worker group with Autoscaler enabled", Ordered, func() {
865+
ctx := context.Background()
866+
namespace := "default"
867+
rayCluster := rayClusterTemplate("raycluster-suspend-workergroup-autoscaler", namespace)
868+
rayCluster.Spec.EnableInTreeAutoscaling = ptr.To(true)
869+
allPods := corev1.PodList{}
870+
allFilters := common.RayClusterAllPodsAssociationOptions(rayCluster).ToListOptions()
871+
workerFilters := common.RayClusterGroupPodsAssociationOptions(rayCluster, rayCluster.Spec.WorkerGroupSpecs[0].GroupName).ToListOptions()
872+
headFilters := common.RayClusterHeadPodsAssociationOptions(rayCluster).ToListOptions()
873+
874+
It("Create a RayCluster custom resource", func() {
875+
err := k8sClient.Create(ctx, rayCluster)
876+
Expect(err).NotTo(HaveOccurred(), "Failed to create RayCluster")
877+
Eventually(getResourceFunc(ctx, client.ObjectKey{Name: rayCluster.Name, Namespace: namespace}, rayCluster),
878+
time.Second*3, time.Millisecond*500).Should(BeNil(), "Should be able to see RayCluster: %v", rayCluster.Name)
879+
})
880+
881+
It("Check the number of Pods", func() {
882+
Eventually(listResourceFunc(ctx, &allPods, allFilters...), time.Second*3, time.Millisecond*500).
883+
Should(Equal(4), fmt.Sprintf("all pods %v", allPods.Items))
884+
})
885+
886+
It("Setting suspend=true in first worker group should not fail", func() {
887+
// suspend the Raycluster worker group
888+
err := updateRayClusterWorkerGroupSuspendField(ctx, rayCluster, true)
889+
Expect(err).NotTo(HaveOccurred(), "Failed to update RayCluster")
890+
})
891+
892+
It("Worker pods should not be deleted and head pod should still be running", func() {
893+
Consistently(listResourceFunc(ctx, &allPods, workerFilters...), time.Second*5, time.Millisecond*500).
894+
Should(Equal(3), fmt.Sprintf("all pods %v", allPods.Items))
895+
Consistently(listResourceFunc(ctx, &allPods, headFilters...), time.Second*5, time.Millisecond*500).
896+
Should(Equal(1), fmt.Sprintf("all pods %v", allPods.Items))
897+
})
898+
899+
It("Delete the cluster", func() {
900+
err := k8sClient.Delete(ctx, rayCluster)
901+
Expect(err).NotTo(HaveOccurred())
902+
})
903+
})
904+
864905
Describe("RayCluster with a multi-host worker group", Ordered, func() {
865906
ctx := context.Background()
866907
namespace := "default"

ray-operator/controllers/ray/raycluster_controller_unit_test.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3729,6 +3729,67 @@ func TestValidateRayClusterSpecEmptyContainers(t *testing.T) {
37293729
}
37303730
}
37313731

3732+
func TestValidateRayClusterSpecSuspendingWorkerGroup(t *testing.T) {
3733+
headGroupSpec := rayv1.HeadGroupSpec{
3734+
Template: corev1.PodTemplateSpec{
3735+
Spec: corev1.PodSpec{
3736+
Containers: []corev1.Container{{Name: "ray-head"}},
3737+
},
3738+
},
3739+
}
3740+
workerGroupSpecSuspended := rayv1.WorkerGroupSpec{
3741+
Template: corev1.PodTemplateSpec{
3742+
Spec: corev1.PodSpec{
3743+
Containers: []corev1.Container{{Name: "ray-worker"}},
3744+
},
3745+
},
3746+
}
3747+
workerGroupSpecSuspended.Suspend = ptr.To[bool](true)
3748+
3749+
tests := []struct {
3750+
rayCluster *rayv1.RayCluster
3751+
name string
3752+
errorMessage string
3753+
expectError bool
3754+
}{
3755+
{
3756+
name: "suspend without autoscaler",
3757+
rayCluster: &rayv1.RayCluster{
3758+
Spec: rayv1.RayClusterSpec{
3759+
HeadGroupSpec: headGroupSpec,
3760+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{workerGroupSpecSuspended},
3761+
},
3762+
},
3763+
expectError: false,
3764+
},
3765+
{
3766+
// TODO (rueian): This can be supported in future Ray. We should check the RayVersion once we know the version.
3767+
name: "suspend with autoscaler",
3768+
rayCluster: &rayv1.RayCluster{
3769+
Spec: rayv1.RayClusterSpec{
3770+
HeadGroupSpec: headGroupSpec,
3771+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{workerGroupSpecSuspended},
3772+
EnableInTreeAutoscaling: ptr.To[bool](true),
3773+
},
3774+
},
3775+
expectError: true,
3776+
errorMessage: "suspending worker groups is not currently supported with Autoscaler enabled",
3777+
},
3778+
}
3779+
3780+
for _, tt := range tests {
3781+
t.Run(tt.name, func(t *testing.T) {
3782+
err := validateRayClusterSpec(tt.rayCluster)
3783+
if tt.expectError {
3784+
assert.Error(t, err)
3785+
assert.EqualError(t, err, tt.errorMessage)
3786+
} else {
3787+
assert.Nil(t, err)
3788+
}
3789+
})
3790+
}
3791+
}
3792+
37323793
func TestValidateRayClusterStatus(t *testing.T) {
37333794
tests := []struct {
37343795
name string

0 commit comments

Comments
 (0)