Skip to content

Commit 9442fe0

Browse files
committed
Create Ray Cluster SDK upgrade scenarios
1 parent 0feab0f commit 9442fe0

File tree

5 files changed

+593
-3
lines changed

5 files changed

+593
-3
lines changed

Diff for: go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ go 1.20
44

55
require (
66
github.com/onsi/gomega v1.27.10
7-
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb
7+
github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069
88
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0
99
github.com/ray-project/kuberay/ray-operator v1.0.0
1010
k8s.io/api v0.26.3

Diff for: go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -369,8 +369,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
369369
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
370370
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
371371
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
372-
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb h1:L2Gdr2SlvshDKZY2KK6507AwzQ1NSfRbMQuz5dOsYNM=
373-
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
372+
github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069 h1:81+ma1mchF/LtAGsf+poAt50kJ/fLYjoTAcZOxci1Yc=
373+
github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
374374
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0 h1:oyhdLdc4BgA4zcH1zlRrSrYpzuVxV5QLDbyIXrwnQqs=
375375
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0/go.mod h1:Yge6GRNpO9YIDfeL+XOcCE9xbmfCTD5C1h5dlW87mxQ=
376376
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=

Diff for: tests/e2e/mnist_rayjob.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import sys
2+
3+
from time import sleep
4+
5+
from torchx.specs.api import AppState, is_terminal
6+
7+
from codeflare_sdk.cluster.cluster import get_cluster
8+
from codeflare_sdk.job.jobs import DDPJobDefinition
9+
10+
# Submit the MNIST DDP job to the already-running "mnist" Ray cluster, poll
# until the job reaches a terminal state, print its logs, tear the cluster
# down, and report success or failure through the process exit code.
#
# Usage: the target namespace is read from the first command-line argument.
namespace = sys.argv[1]

# Look up the existing cluster by name; this script does not create it.
cluster = get_cluster("mnist", namespace)
cluster.details()

jobdef = DDPJobDefinition(
    name="mnist",
    script="mnist.py",
    scheduler_args={"requirements": "requirements.txt"},
)
job = jobdef.submit(cluster)

poll_interval = 5  # seconds slept between two status checks
timeout = 900  # seconds of accumulated sleep before giving up; 0 disables it
waited = 0

status = job.status()
while not is_terminal(status.state):
    print(status)
    if timeout and waited >= timeout:
        # NOTE(review): raising here skips cluster.down() below, so the
        # cluster is left running on timeout -- confirm that is intended.
        raise TimeoutError(f"job has timed out after waiting {timeout}s")
    sleep(poll_interval)
    waited += poll_interval
    status = job.status()

print(f"Job has completed: {status.state}")

print(job.logs())

cluster.down()

# sys.exit rather than the site-provided exit(): the latter is an interactive
# convenience and is not guaranteed to exist (e.g. under `python -S`).
sys.exit(0 if status.state == AppState.SUCCEEDED else 1)

Diff for: tests/e2e/start_ray_cluster.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import sys
2+
import os
3+
4+
from time import sleep
5+
6+
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
7+
8+
# Create the "mnist" Ray cluster in the namespace given as the first
# command-line argument and wait for wait_ready() to return.
namespace = sys.argv[1]

# Both settings come from the environment and are None when unset.
ray_image = os.environ.get("RAY_IMAGE")
host = os.environ.get("CLUSTER_HOSTNAME")

# An ingress entry is only configured when a cluster hostname was supplied;
# otherwise the options stay empty.
if host is None:
    ingress_options = {}
else:
    dashboard_ingress = {
        "ingressName": "ray-dashboard",
        "port": 8265,
        "pathType": "Prefix",
        "path": "/",
        "host": host,
    }
    ingress_options = {"ingresses": [dashboard_ingress]}

cluster_config = ClusterConfiguration(
    name="mnist",
    namespace=namespace,
    num_workers=1,
    head_cpus="500m",
    head_memory=2,
    min_cpus="500m",
    max_cpus=1,
    min_memory=1,
    max_memory=2,
    num_gpus=0,
    instascale=False,
    image=ray_image,
    ingress_options=ingress_options,
)
cluster = Cluster(cluster_config)

cluster.up()

# Status is queried once right after up() and once more after wait_ready().
cluster.status()

cluster.wait_ready()

cluster.status()

cluster.details()

0 commit comments

Comments
 (0)