File tree 5 files changed +593
-3
lines changed
5 files changed +593
-3
lines changed Original file line number Diff line number Diff line change 4
4
5
5
require (
6
6
github.com/onsi/gomega v1.27.10
7
- github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb
7
+ github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069
8
8
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0
9
9
github.com/ray-project/kuberay/ray-operator v1.0.0
10
10
k8s.io/api v0.26.3
Original file line number Diff line number Diff line change @@ -369,8 +369,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
369
369
github.com/pkg/errors v0.9.1 /go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0 =
370
370
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM =
371
371
github.com/pmezard/go-difflib v1.0.0 /go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4 =
372
- github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb h1:L2Gdr2SlvshDKZY2KK6507AwzQ1NSfRbMQuz5dOsYNM =
373
- github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb /go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk =
372
+ github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069 h1:81+ma1mchF/LtAGsf+poAt50kJ/fLYjoTAcZOxci1Yc =
373
+ github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069 /go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk =
374
374
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0 h1:oyhdLdc4BgA4zcH1zlRrSrYpzuVxV5QLDbyIXrwnQqs =
375
375
github.com/project-codeflare/multi-cluster-app-dispatcher v1.37.0 /go.mod h1:Yge6GRNpO9YIDfeL+XOcCE9xbmfCTD5C1h5dlW87mxQ =
376
376
github.com/prometheus/client_golang v0.9.1 /go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw =
Original file line number Diff line number Diff line change
1
# Submits the "mnist" DDP job to an existing "mnist" cluster, waits for the job
# to reach a terminal state, prints its logs, tears the cluster down, and exits
# with status 0 only when the job SUCCEEDED.
#
# Usage: <script> <namespace>
import sys

from time import sleep

from torchx.specs.api import AppState, is_terminal

from codeflare_sdk.cluster.cluster import get_cluster
from codeflare_sdk.job.jobs import DDPJobDefinition

# Namespace in which the "mnist" cluster is looked up (first CLI argument).
namespace = sys.argv[1]

cluster = get_cluster("mnist", namespace)

cluster.details()

jobdef = DDPJobDefinition(
    name="mnist",
    script="mnist.py",
    scheduler_args={"requirements": "requirements.txt"},
)
job = jobdef.submit(cluster)

# Polling budget, in seconds of sleep time. Only the sleeps are counted, so the
# wall-clock wait is slightly longer than `timeout`.
poll_interval = 5
timeout = 900
elapsed = 0

try:
    # Poll until the job reports a terminal state; the loop is left either via
    # `break` or via the TimeoutError below.
    while True:
        status = job.status()
        if is_terminal(status.state):
            break
        print(status)
        if timeout and elapsed >= timeout:
            raise TimeoutError(f"job has timed out after waiting {timeout} s")
        sleep(poll_interval)
        elapsed += poll_interval

    print(f"Job has completed: {status.state} ")

    print(job.logs())
finally:
    # Always release the cluster, including when the wait timed out or fetching
    # the logs failed; otherwise the cluster would be left running.
    cluster.down()

# sys.exit is used rather than the interactive-shell `exit` helper, which is
# only installed by the `site` module.
sys.exit(0 if status.state == AppState.SUCCEEDED else 1)
Original file line number Diff line number Diff line change
1
# Creates the "mnist" cluster in the namespace given as the first CLI argument,
# brings it up, waits until it is ready, and prints its status and details.
#
# Environment:
#   RAY_IMAGE         passed through as the cluster image (None when unset).
#   CLUSTER_HOSTNAME  when set, a "ray-dashboard" ingress for port 8265 is
#                     requested on that host; when unset no ingress options
#                     are supplied.
import sys
import os

from time import sleep

from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration

namespace = sys.argv[1]
ray_image = os.environ.get("RAY_IMAGE")
host = os.environ.get("CLUSTER_HOSTNAME")

# Without a hostname the cluster is configured with an empty ingress mapping.
ingress_options = (
    {}
    if host is None
    else {
        "ingresses": [
            {
                "ingressName": "ray-dashboard",
                "port": 8265,
                "pathType": "Prefix",
                "path": "/",
                "host": host,
            },
        ]
    }
)

config = ClusterConfiguration(
    name="mnist",
    namespace=namespace,
    num_workers=1,
    head_cpus="500m",
    head_memory=2,
    min_cpus="500m",
    max_cpus=1,
    min_memory=1,
    max_memory=2,
    num_gpus=0,
    instascale=False,
    image=ray_image,
    ingress_options=ingress_options,
)
cluster = Cluster(config)

cluster.up()

# Report status once right after the request and once after readiness.
cluster.status()
cluster.wait_ready()
cluster.status()

cluster.details()
You can’t perform that action at this time.
0 commit comments