Skip to content

Commit a7fe266

Browse files
committed
Create a Ray Cluster SDK upgrade scenarios
1 parent 0feab0f commit a7fe266

7 files changed

+548
-42
lines changed

Diff for: go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ go 1.20
44

55
require (
66
github.com/onsi/gomega v1.27.10
7-
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb
7+
github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069
88
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0
99
github.com/ray-project/kuberay/ray-operator v1.0.0
1010
k8s.io/api v0.26.3

Diff for: go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -369,8 +369,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
369369
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
370370
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
371371
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
372-
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb h1:L2Gdr2SlvshDKZY2KK6507AwzQ1NSfRbMQuz5dOsYNM=
373-
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
372+
github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069 h1:81+ma1mchF/LtAGsf+poAt50kJ/fLYjoTAcZOxci1Yc=
373+
github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
374374
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0 h1:oyhdLdc4BgA4zcH1zlRrSrYpzuVxV5QLDbyIXrwnQqs=
375375
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0/go.mod h1:Yge6GRNpO9YIDfeL+XOcCE9xbmfCTD5C1h5dlW87mxQ=
376376
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=

Diff for: tests/e2e/mnist_raycluster_sdk_test.go

+3-37
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@ limitations under the License.
1717
package e2e
1818

1919
import (
20-
"strings"
2120
"testing"
22-
"time"
2321

2422
. "github.com/onsi/gomega"
2523
. "github.com/project-codeflare/codeflare-common/support"
@@ -137,7 +135,7 @@ func TestMNISTRayClusterSDK(t *testing.T) {
137135
Command: []string{
138136
"/bin/sh", "-c",
139137
"while [ ! -f /codeflare-sdk/pyproject.toml ]; do sleep 1; done; " +
140-
"cp /test/* . && chmod +x install-codeflare-sdk.sh && ./install-codeflare-sdk.sh && python mnist_raycluster_sdk.py " + namespace.Name,
138+
"cp /test/* . && chmod +x install-codeflare-sdk.sh && ./install-codeflare-sdk.sh && python mnist_raycluster_sdk.py " + namespace.Name,
141139
},
142140
VolumeMounts: []corev1.VolumeMount{
143141
{
@@ -194,40 +192,8 @@ func TestMNISTRayClusterSDK(t *testing.T) {
194192
test.Expect(err).NotTo(HaveOccurred())
195193
test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name)
196194

197-
go func() {
198-
// Checking if pod is found and running
199-
podName := ""
200-
foundPod := false
201-
for !foundPod {
202-
pods, _ := test.Client().Core().CoreV1().Pods(namespace.Name).List(test.Ctx(), metav1.ListOptions{
203-
LabelSelector: "job-name=sdk",
204-
})
205-
for _, pod := range pods.Items {
206-
if strings.HasPrefix(pod.Name, "sdk-") && pod.Status.Phase == corev1.PodRunning {
207-
podName = pod.Name
208-
foundPod = true
209-
test.T().Logf("Pod is running!")
210-
break
211-
}
212-
}
213-
if !foundPod {
214-
test.T().Logf("Waiting for pod to start...")
215-
time.Sleep(5 * time.Second)
216-
}
217-
}
218-
219-
// Get rest config
220-
restConfig, err := GetRestConfig(test); if err != nil {
221-
test.T().Errorf("Error getting rest config: %v", err)
222-
}
223-
224-
// Copy codeflare-sdk to the pod
225-
srcDir := "../.././"
226-
dstDir := "/codeflare-sdk"
227-
if err := CopyToPod(test, namespace.Name, podName, restConfig, srcDir, dstDir); err != nil {
228-
test.T().Errorf("Error copying codeflare-sdk to pod: %v", err)
229-
}
230-
}()
195+
// Setup the codeflare-sdk inside the pod associated to the created job
196+
SetupCodeflareSDKInsidePod(test, namespace, job.Name)
231197

232198
test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name)
233199
test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should(

Diff for: tests/e2e/mnist_rayjob.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import sys
2+
3+
from time import sleep
4+
5+
from torchx.specs.api import AppState, is_terminal
6+
7+
from codeflare_sdk.cluster.cluster import get_cluster
8+
from codeflare_sdk.job.jobs import DDPJobDefinition
9+
10+
namespace = sys.argv[1]
11+
12+
cluster = get_cluster("mnist", namespace)
13+
14+
cluster.details()
15+
16+
jobdef = DDPJobDefinition(
17+
name="mnist",
18+
script="mnist.py",
19+
scheduler_args={"requirements": "requirements.txt"},
20+
)
21+
job = jobdef.submit(cluster)
22+
23+
done = False
24+
time = 0
25+
timeout = 900
26+
while not done:
27+
status = job.status()
28+
if is_terminal(status.state):
29+
break
30+
if not done:
31+
print(status)
32+
if timeout and time >= timeout:
33+
raise TimeoutError(f"job has timed out after waiting {timeout}s")
34+
sleep(5)
35+
time += 5
36+
37+
print(f"Job has completed: {status.state}")
38+
39+
print(job.logs())
40+
41+
cluster.down()
42+
43+
if not status.state == AppState.SUCCEEDED:
44+
exit(1)
45+
else:
46+
exit(0)

Diff for: tests/e2e/start_ray_cluster.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import sys
2+
import os
3+
4+
from time import sleep
5+
6+
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
7+
8+
namespace = sys.argv[1]
9+
ray_image = os.getenv("RAY_IMAGE")
10+
host = os.getenv("CLUSTER_HOSTNAME")
11+
12+
ingress_options = {}
13+
if host is not None:
14+
ingress_options = {
15+
"ingresses": [
16+
{
17+
"ingressName": "ray-dashboard",
18+
"port": 8265,
19+
"pathType": "Prefix",
20+
"path": "/",
21+
"host": host,
22+
},
23+
]
24+
}
25+
26+
cluster = Cluster(
27+
ClusterConfiguration(
28+
name="mnist",
29+
namespace=namespace,
30+
num_workers=1,
31+
head_cpus="500m",
32+
head_memory=2,
33+
min_cpus="500m",
34+
max_cpus=1,
35+
min_memory=1,
36+
max_memory=2,
37+
num_gpus=0,
38+
instascale=False,
39+
image=ray_image,
40+
ingress_options=ingress_options,
41+
)
42+
)
43+
44+
cluster.up()
45+
46+
cluster.status()
47+
48+
cluster.wait_ready()
49+
50+
cluster.status()
51+
52+
cluster.details()

Diff for: tests/e2e/support.go

+48-2
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,14 @@ import (
2020
"embed"
2121
"os"
2222
"path/filepath"
23+
"strings"
24+
"time"
2325

2426
"github.com/onsi/gomega"
27+
"github.com/project-codeflare/codeflare-common/support"
28+
corev1 "k8s.io/api/core/v1"
2529
"k8s.io/apimachinery/pkg/api/meta"
30+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2631
"k8s.io/apimachinery/pkg/runtime/schema"
2732
"k8s.io/apimachinery/pkg/runtime/serializer"
2833
"k8s.io/cli-runtime/pkg/genericclioptions"
@@ -33,8 +38,6 @@ import (
3338
"k8s.io/kubectl/pkg/cmd/cp"
3439
"k8s.io/kubectl/pkg/cmd/util"
3540
"k8s.io/kubectl/pkg/scheme"
36-
37-
"github.com/project-codeflare/codeflare-common/support"
3841
)
3942

4043
//go:embed *.py *.txt *.sh
@@ -109,3 +112,46 @@ func (r restClientGetter) ToDiscoveryClient() (discovery.CachedDiscoveryInterfac
109112
func (r restClientGetter) ToRESTMapper() (meta.RESTMapper, error) {
110113
return nil, nil
111114
}
115+
116+
func SetupCodeflareSDKInsidePod(test support.Test, namespace *corev1.Namespace, labelName string) {
117+
118+
// Get pod name
119+
podName := GetPodName(test, namespace, labelName)
120+
121+
// Get rest config
122+
restConfig, err := GetRestConfig(test)
123+
if err != nil {
124+
test.T().Errorf("Error getting rest config: %v", err)
125+
}
126+
127+
// Copy codeflare-sdk to the pod
128+
srcDir := "../.././"
129+
dstDir := "/codeflare-sdk"
130+
if err := CopyToPod(test, namespace.Name, podName, restConfig, srcDir, dstDir); err != nil {
131+
test.T().Errorf("Error copying codeflare-sdk to pod: %v", err)
132+
}
133+
}
134+
135+
func GetPodName(test support.Test, namespace *corev1.Namespace, labelName string) string {
136+
podName := ""
137+
foundPod := false
138+
for !foundPod {
139+
pods, _ := test.Client().Core().CoreV1().Pods(namespace.Name).List(test.Ctx(), metav1.ListOptions{
140+
LabelSelector: "job-name=" + labelName,
141+
})
142+
for _, pod := range pods.Items {
143+
144+
if strings.HasPrefix(pod.Name, labelName+"-") && pod.Status.Phase == corev1.PodRunning {
145+
podName = pod.Name
146+
foundPod = true
147+
test.T().Logf("Pod is running!")
148+
break
149+
}
150+
}
151+
if !foundPod {
152+
test.T().Logf("Waiting for pod to start...")
153+
time.Sleep(5 * time.Second)
154+
}
155+
}
156+
return podName
157+
}

0 commit comments

Comments
 (0)