From 14887c6c1efaa860081126ff9257650b5a5ab390 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 10 Jan 2025 13:19:02 +0000 Subject: [PATCH 1/4] Add test for custom idle timeout seconds Signed-off-by: Ryan O'Leary --- ray-operator/Makefile | 3 + .../raycluster_autoscaler_test.go | 70 +++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 2ca043467d9..39c0aafa8db 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -72,6 +72,9 @@ test-e2e: WHAT ?= ./test/e2e test-e2e: manifests fmt vet ## Run e2e tests. go test -timeout 30m -v $(WHAT) +test-e2e-autoscaler: WHAT ?= ./test/e2eautoscaler +test-e2e-autoscaler: manifests fmt vet ## Run e2e tests. + go test -timeout 30m -v $(WHAT) test-sampleyaml: WHAT ?= ./test/sampleyaml test-sampleyaml: manifests fmt vet diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index 403a380d5b5..1d702d44408 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -2,7 +2,9 @@ package e2eautoscaler import ( "fmt" + "os" "testing" + "time" "github.com/onsi/gomega" corev1ac "k8s.io/client-go/applyconfigurations/core/v1" @@ -355,3 +357,71 @@ func TestRayClusterAutoscalerMinReplicasUpdate(t *testing.T) { }) } } + +func TestRayClusterAutoscalerV2IdleTimeout(t *testing.T) { + // Only test with the V2 Autoscaler + name := "Create a RayCluster with autoscaler v2 enabled" + tc := tests["Create a RayCluster with autoscaler v2 enabled"] + + test := With(t) + g := gomega.NewWithT(t) + + // Create a namespace + namespace := test.NewTestNamespace() + + // Minimum Ray Version for custom idleTimeoutSeconds + IDLE_TIMEOUT_MIN_RAY_VERSION := "2.40.0" + os.Setenv(KuberayTestRayImage, IDLE_TIMEOUT_MIN_RAY_VERSION) + + customIdleTimeoutSeconds := 30 + defaultIdleTimeoutSeconds := 60 + + test.T().Run(name, func(_ *testing.T) { + rayClusterSpecAC := rayv1ac.RayClusterSpec(). + WithEnableInTreeAutoscaling(true). + WithRayVersion(IDLE_TIMEOUT_MIN_RAY_VERSION). + WithHeadGroupSpec(rayv1ac.HeadGroupSpec(). + WithRayStartParams(map[string]string{"num-cpus": "0"}). + WithTemplate(tc.HeadPodTemplateGetter())). + WithWorkerGroupSpecs( + rayv1ac.WorkerGroupSpec(). + WithReplicas(2). + WithMinReplicas(0). + WithMaxReplicas(4). + WithGroupName("no-idle-timeout-group"). + WithRayStartParams(map[string]string{"num-cpus": "1"}). + WithTemplate(tc.WorkerPodTemplateGetter()), + rayv1ac.WorkerGroupSpec(). + WithReplicas(2). + WithMinReplicas(0). + WithMaxReplicas(4). + WithIdleTimeoutSeconds(int32(customIdleTimeoutSeconds)). + WithGroupName("custom-idle-timeout-group"). + WithRayStartParams(map[string]string{"num-cpus": "1"}). + WithTemplate(tc.WorkerPodTemplateGetter()), + ) + rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name).WithSpec(apply(rayClusterSpecAC)) + rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Apply(test.Ctx(), rayClusterAC, TestApplyOptions) + g.Expect(err).NotTo(gomega.HaveOccurred()) + test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name) + + // Wait for RayCluster to become ready and verify the number of available worker replicas. + // Each worker group should scale up two initial replicas before idle timeout. + g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). + Should(gomega.WithTransform(RayClusterState, gomega.Equal(rayv1.Ready))) + g.Expect(GetRayCluster(test, rayCluster.Namespace, rayCluster.Name)).To(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(4)))) + + headPod, err := GetHeadPod(test, rayCluster) + g.Expect(err).NotTo(gomega.HaveOccurred()) + test.T().Logf("Found head pod %s/%s", headPod.Namespace, headPod.Name) + + // After customIdleTimeoutSeconds, both replicas in the worker group with custom idleTimeoutSeconds set + // should be scaled down. + g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), time.Duration(customIdleTimeoutSeconds)*time.Second). + Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(2)))) + + // After the default idleTimeoutSeconds applied by Ray, all worker replicas should be scaled down. + g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), time.Duration(defaultIdleTimeoutSeconds)*time.Second). + Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0)))) + }) +} From 7750a0d5a6603bad90d2c1f08a33d50b48c8d510 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 10 Jan 2025 13:32:08 +0000 Subject: [PATCH 2/4] Fix comment Signed-off-by: Ryan O'Leary --- ray-operator/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 39c0aafa8db..06aec9de897 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -73,7 +73,7 @@ test-e2e: manifests fmt vet ## Run e2e tests. go test -timeout 30m -v $(WHAT) test-e2e-autoscaler: WHAT ?= ./test/e2eautoscaler -test-e2e-autoscaler: manifests fmt vet ## Run e2e tests. +test-e2e-autoscaler: manifests fmt vet ## Run e2e autoscaler tests. go test -timeout 30m -v $(WHAT) test-sampleyaml: WHAT ?= ./test/sampleyaml From a5b5e8d9de7b974139638d99811cacc14f2c586f Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 10 Jan 2025 20:42:50 +0000 Subject: [PATCH 3/4] Fix lint Signed-off-by: Ryan O'Leary --- .../test/e2eautoscaler/raycluster_autoscaler_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index 1d702d44408..a5638404ba4 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -370,16 +370,16 @@ func TestRayClusterAutoscalerV2IdleTimeout(t *testing.T) { namespace := test.NewTestNamespace() // Minimum Ray Version for custom idleTimeoutSeconds - IDLE_TIMEOUT_MIN_RAY_VERSION := "2.40.0" - os.Setenv(KuberayTestRayImage, IDLE_TIMEOUT_MIN_RAY_VERSION) + idleTimeoutMinRayVersion := "2.40.0" + os.Setenv(KuberayTestRayImage, idleTimeoutMinRayVersion) - customIdleTimeoutSeconds := 30 - defaultIdleTimeoutSeconds := 60 + customIdleTimeoutSeconds := int32(30) + defaultIdleTimeoutSeconds := int32(60) test.T().Run(name, func(_ *testing.T) { rayClusterSpecAC := rayv1ac.RayClusterSpec(). WithEnableInTreeAutoscaling(true). - WithRayVersion(IDLE_TIMEOUT_MIN_RAY_VERSION). + WithRayVersion(idleTimeoutMinRayVersion). WithHeadGroupSpec(rayv1ac.HeadGroupSpec(). WithRayStartParams(map[string]string{"num-cpus": "0"}). WithTemplate(tc.HeadPodTemplateGetter())). @@ -395,7 +395,7 @@ func TestRayClusterAutoscalerV2IdleTimeout(t *testing.T) { WithReplicas(2). WithMinReplicas(0). WithMaxReplicas(4). - WithIdleTimeoutSeconds(int32(customIdleTimeoutSeconds)). + WithIdleTimeoutSeconds(customIdleTimeoutSeconds). WithGroupName("custom-idle-timeout-group"). WithRayStartParams(map[string]string{"num-cpus": "1"}). WithTemplate(tc.WorkerPodTemplateGetter()), From e4225995005064c4b030b1312260227b5792fcbb Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Tue, 14 Jan 2025 09:40:11 +0000 Subject: [PATCH 4/4] Fix failing test Signed-off-by: Ryan O'Leary --- .../raycluster_autoscaler_test.go | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index a5638404ba4..1f7797d6174 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -2,7 +2,6 @@ package e2eautoscaler import ( "fmt" - "os" "testing" "time" @@ -371,7 +370,6 @@ func TestRayClusterAutoscalerV2IdleTimeout(t *testing.T) { // Minimum Ray Version for custom idleTimeoutSeconds idleTimeoutMinRayVersion := "2.40.0" - os.Setenv(KuberayTestRayImage, idleTimeoutMinRayVersion) customIdleTimeoutSeconds := int32(30) defaultIdleTimeoutSeconds := int32(60) @@ -385,14 +383,14 @@ func TestRayClusterAutoscalerV2IdleTimeout(t *testing.T) { WithTemplate(tc.HeadPodTemplateGetter())). WithWorkerGroupSpecs( rayv1ac.WorkerGroupSpec(). - WithReplicas(2). + WithReplicas(1). WithMinReplicas(0). WithMaxReplicas(4). WithGroupName("no-idle-timeout-group"). WithRayStartParams(map[string]string{"num-cpus": "1"}). WithTemplate(tc.WorkerPodTemplateGetter()), rayv1ac.WorkerGroupSpec(). - WithReplicas(2). + WithReplicas(1). WithMinReplicas(0). WithMaxReplicas(4). WithIdleTimeoutSeconds(customIdleTimeoutSeconds). @@ -400,27 +398,26 @@ func TestRayClusterAutoscalerV2IdleTimeout(t *testing.T) { WithRayStartParams(map[string]string{"num-cpus": "1"}). WithTemplate(tc.WorkerPodTemplateGetter()), ) - rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name).WithSpec(apply(rayClusterSpecAC)) + rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name).WithSpec((rayClusterSpecAC)) + rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Apply(test.Ctx(), rayClusterAC, TestApplyOptions) g.Expect(err).NotTo(gomega.HaveOccurred()) test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name) // Wait for RayCluster to become ready and verify the number of available worker replicas. - // Each worker group should scale up two initial replicas before idle timeout. g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). Should(gomega.WithTransform(RayClusterState, gomega.Equal(rayv1.Ready))) - g.Expect(GetRayCluster(test, rayCluster.Namespace, rayCluster.Name)).To(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(4)))) + g.Expect(GetRayCluster(test, rayCluster.Namespace, rayCluster.Name)).To(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(2)))) headPod, err := GetHeadPod(test, rayCluster) g.Expect(err).NotTo(gomega.HaveOccurred()) test.T().Logf("Found head pod %s/%s", headPod.Namespace, headPod.Name) - // After customIdleTimeoutSeconds, both replicas in the worker group with custom idleTimeoutSeconds set - // should be scaled down. + // After customIdleTimeoutSeconds, the replica in the worker group with custom idleTimeoutSeconds set should be scaled down. g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), time.Duration(customIdleTimeoutSeconds)*time.Second). - Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(2)))) + Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(1)))) - // After the default idleTimeoutSeconds applied by Ray, all worker replicas should be scaled down. + // After the default idleTimeoutSeconds, all worker replicas should be scaled down. g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), time.Duration(defaultIdleTimeoutSeconds)*time.Second). Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0)))) })