finish GCS fault tolerant test
Signed-off-by: owenowenisme <[email protected]>
owenowenisme committed Jan 9, 2025
1 parent a101002 commit e621ad6
Showing 1 changed file with 17 additions and 13 deletions.
30 changes: 17 additions & 13 deletions ray-operator/test/e2e/raycluster_gcs_ft_test.go
@@ -11,6 +11,7 @@ import (

// k8serrors "k8s.io/apimachinery/pkg/api/errors"
corev1 "k8s.io/api/core/v1"

// "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
@@ -24,14 +25,15 @@ func TestRayClusterGCSFaultTolerence(t *testing.T) {
// Create a namespace
namespace := test.NewTestNamespace()

test.T().Log("Creating Cluster")
test.T().Log("Creating Cluster for GCSFaultTolerence testing.")
yamlFilePath := "testdata/ray-cluster.ray-ft.yaml"
rayClusterFromYaml := DeserializeRayClusterYAML(test, yamlFilePath)
KubectlApplyYAML(test, yamlFilePath, namespace.Name)

rayCluster, err := GetRayCluster(test, namespace.Name, rayClusterFromYaml.Name)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(rayCluster).NotTo(BeNil())

test.T().Logf("Waiting for RayCluster %s/%s to become ready", rayCluster.Namespace, rayCluster.Name)
g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
@@ -50,38 +52,40 @@ func TestRayClusterGCSFaultTolerence(t *testing.T) {
// [Test 1: Kill GCS process to "restart" the head Pod]
// become running and ready, the RayCluster still needs tens of seconds
// Hence, `test_detached_actor_2.py` will retry until a Ray client
- //connection succeeds.
+ // connection succeeds.
// Assertions are implemented in Python, so no further handling is needed here; the same applies to the other ExecPodCmd calls.
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"pkill", "gcs_server"})

- // Restart count should eventually become 1
+ // Restart count should eventually become 1: the head container restarts in place rather than a new Pod being created (see the sketch after the diff)
HeadPodRestartCount := func(p *corev1.Pod) int32 { return p.Status.ContainerStatuses[0].RestartCount }
g.Eventually(HeadPod(test, rayCluster)).
Should(WithTransform(HeadPodRestartCount, Equal(int32(1))))

// Pod status should eventually become Running
- HeadPodState := func(p *corev1.Pod) string { return string(p.Status.Phase) }
+ PodState := func(p *corev1.Pod) string { return string(p.Status.Phase) }
g.Eventually(HeadPod(test, rayCluster)).
- Should(WithTransform(HeadPodState, Equal("Running")))
+ Should(WithTransform(PodState, Equal("Running")))

headPod, err = GetHeadPod(test, rayClusterFromYaml)
g.Expect(err).NotTo(HaveOccurred())

expectedOutput := "3"
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "samples/test_detached_actor_2.py", rayNamespace, expectedOutput})

- // [Test 2: Delete the head Pod and wait for a new head Pod]
- // Delete the head Pod. The `kubectl delete pod` command has a default flag `--wait=true`,
- // which waits for resources to be gone before returning.
+ // Test 2: Delete the head Pod and wait for a new head Pod to be created
kubectlCmd := exec.CommandContext(test.Ctx(), "kubectl", "delete", "pod", headPod.Name, "-n", namespace.Name)
g.Expect(kubectlCmd.Run()).NotTo(HaveOccurred())
- // Restart count should eventually become 1
- g.Eventually(HeadPod(test, rayCluster)).
- Should(WithTransform(HeadPodRestartCount, Equal(int32(1))))

+ // Two head Pods may exist while one terminates and its replacement is created, so wait until only one is left (see the sketch after the diff)
+ g.Eventually(func() error {
+ _, err := GetHeadPod(test, rayClusterFromYaml)
+ return err
+ }, TestTimeoutMedium).ShouldNot(HaveOccurred())

headPod, err = GetHeadPod(test, rayClusterFromYaml)
g.Expect(err).NotTo(HaveOccurred())
expectedOutput = "4"
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "samples/test_detached_actor_2.py", rayNamespace, expectedOutput})

KubectlDeleteAllPods(test, rayNamespace)
})

}
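
Side note on Test 1: the "restart count becomes 1" assertion is what distinguishes the kubelet restarting the Ray head container in place from the operator recreating the Pod. A minimal Go sketch of that check, written against plain corev1.Pod values rather than the e2e helpers used in the diff (the package and function names are made up for illustration):

package e2esketch // hypothetical package, not part of the commit

import corev1 "k8s.io/api/core/v1"

// headRestartedInPlace reports whether the head Pod was restarted by the kubelet
// (same Pod UID, container restart count bumped by one) rather than recreated by
// the operator, which would yield a fresh Pod with a different UID and a restart
// count of zero.
func headRestartedInPlace(before, after *corev1.Pod) bool {
	if len(before.Status.ContainerStatuses) == 0 || len(after.Status.ContainerStatuses) == 0 {
		return false
	}
	return after.UID == before.UID &&
		after.Status.ContainerStatuses[0].RestartCount == before.Status.ContainerStatuses[0].RestartCount+1
}

With Pods captured before and after the `pkill gcs_server` step, this helper returning true would confirm an in-place container restart.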

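Similarly for Test 2: the Eventually loop in the diff relies on GetHeadPod failing while two head Pods coexist. A rough stand-alone equivalent using client-go, assuming the standard KubeRay labels ray.io/cluster=<name> and ray.io/node-type=head (function name, polling interval, and timeout are illustrative, not part of the commit):

package e2esketch // hypothetical package, not part of the commit

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// waitForSingleHeadPod polls until exactly one head Pod exists for the cluster and
// it is Running and not terminating, covering the window in which the deleted head
// Pod is still terminating while its replacement is being created.
func waitForSingleHeadPod(ctx context.Context, k8s kubernetes.Interface, namespace, clusterName string) error {
	selector := fmt.Sprintf("ray.io/cluster=%s,ray.io/node-type=head", clusterName)
	return wait.PollUntilContextTimeout(ctx, 2*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) {
		pods, err := k8s.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector})
		if err != nil {
			return false, err
		}
		if len(pods.Items) != 1 {
			return false, nil
		}
		pod := pods.Items[0]
		return pod.DeletionTimestamp == nil && pod.Status.Phase == corev1.PodRunning, nil
	})
}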