diff --git a/docs/cluster/cluster.html b/docs/cluster/cluster.html index 21950889a..3b3962215 100644 --- a/docs/cluster/cluster.html +++ b/docs/cluster/cluster.html @@ -141,6 +141,9 @@
codeflare_sdk.cluster.cluster
codeflare_sdk.cluster.cluster
codeflare_sdk.cluster.cluster
codeflare_sdk.cluster.cluster
codeflare_sdk.cluster.cluster
codeflare_sdk.cluster.cluster
-def wait_ready(self, timeout: Optional[int] = None)
+def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True)
Waits for requested cluster to be ready, up to an optional timeout (s). @@ -1662,7 +1718,7 @@
def wait_ready(self, timeout: Optional[int] = None):
+def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True):
"""
Waits for requested cluster to be ready, up to an optional timeout (s).
Checks every five seconds.
@@ -1672,19 +1728,32 @@ Methods
dashboard_ready = False
status = None
time = 0
- while not ready or not dashboard_ready:
+ while not ready:
status, ready = self.status(print_to_console=False)
- dashboard_ready = self.is_dashboard_ready()
if status == CodeFlareClusterStatus.UNKNOWN:
print(
"WARNING: Current cluster status is unknown, have you run cluster.up yet?"
)
- if not ready or not dashboard_ready:
+ if not ready:
+ if timeout and time >= timeout:
+ raise TimeoutError(
+ f"wait() timed out after waiting {timeout}s for cluster to be ready"
+ )
+ sleep(5)
+ time += 5
+ print("Requested cluster is up and running!")
+
+ while dashboard_check and not dashboard_ready:
+ dashboard_ready = self.is_dashboard_ready()
+ if not dashboard_ready:
if timeout and time >= timeout:
- raise TimeoutError(f"wait() timed out after waiting {timeout}s")
+ raise TimeoutError(
+ f"wait() timed out after waiting {timeout}s for dashboard to be ready"
+ )
sleep(5)
time += 5
- print("Requested cluster and dashboard are up and running!")
+ if dashboard_ready:
+ print("Dashboard is ready!")
diff --git a/docs/cluster/config.html b/docs/cluster/config.html
index 0575c01c8..2edf0a3c8 100644
--- a/docs/cluster/config.html
+++ b/docs/cluster/config.html
@@ -66,6 +66,9 @@ codeflare_sdk.cluster.config
codeflare_sdk.cluster.config
class ClusterConfiguration
-(name: str, namespace: str = None, head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, num_workers: int = 1, min_memory: int = 2, max_memory: int = 2, num_gpus: int = 0, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'quay.io/project-codeflare/ray:2.5.0-py38-cu116', local_interactive: bool = False, image_pull_secrets: list = <factory>, dispatch_priority: str = None)
+(name: str, namespace: str = None, head_info: list = <factory>, head_cpus: int = 2, head_memory: int = 8, head_gpus: int = 0, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, num_workers: int = 1, min_memory: int = 2, max_memory: int = 2, num_gpus: int = 0, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'quay.io/project-codeflare/ray:latest-py39-cu118', local_interactive: bool = False, image_pull_secrets: list = <factory>, dispatch_priority: str = None)
This dataclass is used to specify resource requirements and other details, and @@ -111,6 +114,9 @@
var head_cpus : int
var head_gpus : int
var head_info : list
var head_memory : int
var image : str
dispatch_priority
envs
+head_cpus
+head_gpus
head_info
+head_memory
image
image_pull_secrets
instascale
diff --git a/docs/cluster/model.html b/docs/cluster/model.html
index 7d911255b..3832494eb 100644
--- a/docs/cluster/model.html
+++ b/docs/cluster/model.html
@@ -102,6 +102,9 @@ Module codeflare_sdk.cluster.model
name: str
status: RayClusterStatus
+ head_cpus: int
+ head_mem: str
+ head_gpu: int
workers: int
worker_mem_min: str
worker_mem_max: str
@@ -287,7 +290,7 @@ Class variables
class RayCluster
-(name: str, status: RayClusterStatus, workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, worker_gpu: int, namespace: str, dashboard: str)
+(name: str, status: RayClusterStatus, head_cpus: int, head_mem: str, head_gpu: int, workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, worker_gpu: int, namespace: str, dashboard: str)
For storing information about a Ray cluster.
var head_cpus : int
var head_gpu : int
var head_mem : str
var name : str
RayCluster
dashboard
head_cpus
head_gpu
head_mem
name
namespace
status
codeflare_sdk.utils.generate_yaml
codeflare_sdk.utils.generate_yaml
codeflare_sdk.utils.generate_yaml
codeflare_sdk.utils.generate_yaml
codeflare_sdk.utils.generate_yaml
codeflare_sdk.utils.generate_yaml
-def generate_appwrapper(name: str, namespace: str, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, instance_types: list, env, local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int)
+def generate_appwrapper(name: str, namespace: str, head_cpus: int, head_memory: int, head_gpus: int, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, instance_types: list, env, local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int)
def generate_appwrapper(
name: str,
namespace: str,
+ head_cpus: int,
+ head_memory: int,
+ head_gpus: int,
min_cpu: int,
max_cpu: int,
min_memory: int,
@@ -613,7 +653,16 @@ Functions
update_labels(user_yaml, instascale, instance_types)
update_priority(user_yaml, item, dispatch_priority, priority_val)
update_custompodresources(
- item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
+ item,
+ min_cpu,
+ max_cpu,
+ min_memory,
+ max_memory,
+ gpu,
+ workers,
+ head_cpus,
+ head_memory,
+ head_gpus,
)
update_nodes(
item,
@@ -628,6 +677,9 @@ Functions
instascale,
env,
image_pull_secrets,
+ head_cpus,
+ head_memory,
+ head_gpus,
)
update_dashboard_route(route_item, cluster_name, namespace)
if local_interactive:
@@ -700,7 +752,7 @@ Functions
-def update_custompodresources(item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers)
+def update_custompodresources(item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus)
def update_custompodresources(
- item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
+ item,
+ min_cpu,
+ max_cpu,
+ min_memory,
+ max_memory,
+ gpu,
+ workers,
+ head_cpus,
+ head_memory,
+ head_gpus,
):
if "custompodresources" in item.keys():
custompodresources = item.get("custompodresources")
for i in range(len(custompodresources)):
+ resource = custompodresources[i]
if i == 0:
# Leave head node resources as template default
- continue
- resource = custompodresources[i]
- for k, v in resource.items():
- if k == "replicas" and i == 1:
- resource[k] = workers
- if k == "requests" or k == "limits":
- for spec, _ in v.items():
- if spec == "cpu":
- if k == "limits":
- resource[k][spec] = max_cpu
- else:
- resource[k][spec] = min_cpu
- if spec == "memory":
- if k == "limits":
- resource[k][spec] = str(max_memory) + "G"
- else:
- resource[k][spec] = str(min_memory) + "G"
- if spec == "nvidia.com/gpu":
- if i == 0:
- resource[k][spec] = 0
- else:
- resource[k][spec] = gpu
+ resource["requests"]["cpu"] = head_cpus
+ resource["limits"]["cpu"] = head_cpus
+ resource["requests"]["memory"] = str(head_memory) + "G"
+ resource["limits"]["memory"] = str(head_memory) + "G"
+ resource["requests"]["nvidia.com/gpu"] = head_gpus
+ resource["limits"]["nvidia.com/gpu"] = head_gpus
+
+ else:
+ for k, v in resource.items():
+ if k == "replicas" and i == 1:
+ resource[k] = workers
+ if k == "requests" or k == "limits":
+ for spec, _ in v.items():
+ if spec == "cpu":
+ if k == "limits":
+ resource[k][spec] = max_cpu
+ else:
+ resource[k][spec] = min_cpu
+ if spec == "memory":
+ if k == "limits":
+ resource[k][spec] = str(max_memory) + "G"
+ else:
+ resource[k][spec] = str(min_memory) + "G"
+ if spec == "nvidia.com/gpu":
+ if i == 0:
+ resource[k][spec] = 0
+ else:
+ resource[k][spec] = gpu
else:
sys.exit("Error: malformed template")
@@ -855,7 +923,7 @@
-def update_nodes(item, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, instascale, env, image_pull_secrets)
+def update_nodes(item, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, instascale, env, image_pull_secrets, head_cpus, head_memory, head_gpus)