diff --git a/docs/cluster/cluster.html b/docs/cluster/cluster.html index 21950889a..3b3962215 100644 --- a/docs/cluster/cluster.html +++ b/docs/cluster/cluster.html @@ -141,6 +141,9 @@

Module codeflare_sdk.cluster.cluster

name = self.config.name namespace = self.config.namespace + head_cpus = self.config.head_cpus + head_memory = self.config.head_memory + head_gpus = self.config.head_gpus min_cpu = self.config.min_cpus max_cpu = self.config.max_cpus min_memory = self.config.min_memory @@ -158,6 +161,9 @@

Module codeflare_sdk.cluster.cluster

return generate_appwrapper( name=name, namespace=namespace, + head_cpus=head_cpus, + head_memory=head_memory, + head_gpus=head_gpus, min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory, @@ -290,7 +296,7 @@

Module codeflare_sdk.cluster.cluster

else: return False - def wait_ready(self, timeout: Optional[int] = None): + def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True): """ Waits for requested cluster to be ready, up to an optional timeout (s). Checks every five seconds. @@ -300,19 +306,32 @@

Module codeflare_sdk.cluster.cluster

dashboard_ready = False status = None time = 0 - while not ready or not dashboard_ready: + while not ready: status, ready = self.status(print_to_console=False) - dashboard_ready = self.is_dashboard_ready() if status == CodeFlareClusterStatus.UNKNOWN: print( "WARNING: Current cluster status is unknown, have you run cluster.up yet?" ) - if not ready or not dashboard_ready: + if not ready: + if timeout and time >= timeout: + raise TimeoutError( + f"wait() timed out after waiting {timeout}s for cluster to be ready" + ) + sleep(5) + time += 5 + print("Requested cluster is up and running!") + + while dashboard_check and not dashboard_ready: + dashboard_ready = self.is_dashboard_ready() + if not dashboard_ready: if timeout and time >= timeout: - raise TimeoutError(f"wait() timed out after waiting {timeout}s") + raise TimeoutError( + f"wait() timed out after waiting {timeout}s for dashboard to be ready" + ) sleep(5) time += 5 - print("Requested cluster and dashboard are up and running!") + if dashboard_ready: + print("Dashboard is ready!") def details(self, print_to_console: bool = True) -> RayCluster: cluster = _copy_to_ray(self) @@ -640,6 +659,15 @@

Module codeflare_sdk.cluster.cluster

worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for namespace=rc["metadata"]["namespace"], dashboard=ray_route, + head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["cpu"], + head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["memory"], + head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["nvidia.com/gpu"], ) @@ -670,6 +698,9 @@

Module codeflare_sdk.cluster.cluster

worker_gpu=cluster.config.num_gpus, namespace=cluster.config.namespace, dashboard=cluster.cluster_dashboard_uri(), + head_cpus=cluster.config.head_cpus, + head_mem=cluster.config.head_memory, + head_gpu=cluster.config.head_gpus, ) if ray.status == CodeFlareClusterStatus.READY: ray.status = RayClusterStatus.READY @@ -879,6 +910,9 @@

Classes

name = self.config.name namespace = self.config.namespace + head_cpus = self.config.head_cpus + head_memory = self.config.head_memory + head_gpus = self.config.head_gpus min_cpu = self.config.min_cpus max_cpu = self.config.max_cpus min_memory = self.config.min_memory @@ -896,6 +930,9 @@

Classes

return generate_appwrapper( name=name, namespace=namespace, + head_cpus=head_cpus, + head_memory=head_memory, + head_gpus=head_gpus, min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory, @@ -1028,7 +1065,7 @@

Classes

else: return False - def wait_ready(self, timeout: Optional[int] = None): + def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True): """ Waits for requested cluster to be ready, up to an optional timeout (s). Checks every five seconds. @@ -1038,19 +1075,32 @@

Classes

dashboard_ready = False status = None time = 0 - while not ready or not dashboard_ready: + while not ready: status, ready = self.status(print_to_console=False) - dashboard_ready = self.is_dashboard_ready() if status == CodeFlareClusterStatus.UNKNOWN: print( "WARNING: Current cluster status is unknown, have you run cluster.up yet?" ) - if not ready or not dashboard_ready: + if not ready: + if timeout and time >= timeout: + raise TimeoutError( + f"wait() timed out after waiting {timeout}s for cluster to be ready" + ) + sleep(5) + time += 5 + print("Requested cluster is up and running!") + + while dashboard_check and not dashboard_ready: + dashboard_ready = self.is_dashboard_ready() + if not dashboard_ready: if timeout and time >= timeout: - raise TimeoutError(f"wait() timed out after waiting {timeout}s") + raise TimeoutError( + f"wait() timed out after waiting {timeout}s for dashboard to be ready" + ) sleep(5) time += 5 - print("Requested cluster and dashboard are up and running!") + if dashboard_ready: + print("Dashboard is ready!") def details(self, print_to_console: bool = True) -> RayCluster: cluster = _copy_to_ray(self) @@ -1267,6 +1317,9 @@

Methods

name = self.config.name namespace = self.config.namespace + head_cpus = self.config.head_cpus + head_memory = self.config.head_memory + head_gpus = self.config.head_gpus min_cpu = self.config.min_cpus max_cpu = self.config.max_cpus min_memory = self.config.min_memory @@ -1284,6 +1337,9 @@

Methods

return generate_appwrapper( name=name, namespace=namespace, + head_cpus=head_cpus, + head_memory=head_memory, + head_gpus=head_gpus, min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory, @@ -1653,7 +1709,7 @@

Methods

-def wait_ready(self, timeout: Optional[int] = None) +def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True)

Waits for requested cluster to be ready, up to an optional timeout (s). @@ -1662,7 +1718,7 @@

Methods

Expand source code -
def wait_ready(self, timeout: Optional[int] = None):
+
def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True):
     """
     Waits for requested cluster to be ready, up to an optional timeout (s).
     Checks every five seconds.
@@ -1672,19 +1728,32 @@ 

Methods

dashboard_ready = False status = None time = 0 - while not ready or not dashboard_ready: + while not ready: status, ready = self.status(print_to_console=False) - dashboard_ready = self.is_dashboard_ready() if status == CodeFlareClusterStatus.UNKNOWN: print( "WARNING: Current cluster status is unknown, have you run cluster.up yet?" ) - if not ready or not dashboard_ready: + if not ready: + if timeout and time >= timeout: + raise TimeoutError( + f"wait() timed out after waiting {timeout}s for cluster to be ready" + ) + sleep(5) + time += 5 + print("Requested cluster is up and running!") + + while dashboard_check and not dashboard_ready: + dashboard_ready = self.is_dashboard_ready() + if not dashboard_ready: if timeout and time >= timeout: - raise TimeoutError(f"wait() timed out after waiting {timeout}s") + raise TimeoutError( + f"wait() timed out after waiting {timeout}s for dashboard to be ready" + ) sleep(5) time += 5 - print("Requested cluster and dashboard are up and running!")
+ if dashboard_ready: + print("Dashboard is ready!")
diff --git a/docs/cluster/config.html b/docs/cluster/config.html index 0575c01c8..2edf0a3c8 100644 --- a/docs/cluster/config.html +++ b/docs/cluster/config.html @@ -66,6 +66,9 @@

Module codeflare_sdk.cluster.config

name: str namespace: str = None head_info: list = field(default_factory=list) + head_cpus: int = 2 + head_memory: int = 8 + head_gpus: int = 0 machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"] min_cpus: int = 1 max_cpus: int = 1 @@ -76,7 +79,7 @@

Module codeflare_sdk.cluster.config

template: str = f"{dir}/templates/base-template.yaml" instascale: bool = False envs: dict = field(default_factory=dict) - image: str = "quay.io/project-codeflare/ray:2.5.0-py38-cu116" + image: str = "quay.io/project-codeflare/ray:latest-py39-cu118" local_interactive: bool = False image_pull_secrets: list = field(default_factory=list) dispatch_priority: str = None @@ -93,7 +96,7 @@

Classes

class ClusterConfiguration -(name: str, namespace: str = None, head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, num_workers: int = 1, min_memory: int = 2, max_memory: int = 2, num_gpus: int = 0, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'quay.io/project-codeflare/ray:2.5.0-py38-cu116', local_interactive: bool = False, image_pull_secrets: list = <factory>, dispatch_priority: str = None) +(name: str, namespace: str = None, head_info: list = <factory>, head_cpus: int = 2, head_memory: int = 8, head_gpus: int = 0, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, num_workers: int = 1, min_memory: int = 2, max_memory: int = 2, num_gpus: int = 0, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'quay.io/project-codeflare/ray:latest-py39-cu118', local_interactive: bool = False, image_pull_secrets: list = <factory>, dispatch_priority: str = None)

This dataclass is used to specify resource requirements and other details, and @@ -111,6 +114,9 @@

Classes

name: str namespace: str = None head_info: list = field(default_factory=list) + head_cpus: int = 2 + head_memory: int = 8 + head_gpus: int = 0 machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"] min_cpus: int = 1 max_cpus: int = 1 @@ -121,7 +127,7 @@

Classes

template: str = f"{dir}/templates/base-template.yaml" instascale: bool = False envs: dict = field(default_factory=dict) - image: str = "quay.io/project-codeflare/ray:2.5.0-py38-cu116" + image: str = "quay.io/project-codeflare/ray:latest-py39-cu118" local_interactive: bool = False image_pull_secrets: list = field(default_factory=list) dispatch_priority: str = None @@ -136,10 +142,22 @@

Class variables

+
var head_cpus : int
+
+
+
+
var head_gpus : int
+
+
+
var head_info : list
+
var head_memory : int
+
+
+
var image : str
@@ -219,7 +237,10 @@

  • dispatch_priority
  • envs
  • +
  • head_cpus
  • +
  • head_gpus
  • head_info
  • +
  • head_memory
  • image
  • image_pull_secrets
  • instascale
  • diff --git a/docs/cluster/model.html b/docs/cluster/model.html index 7d911255b..3832494eb 100644 --- a/docs/cluster/model.html +++ b/docs/cluster/model.html @@ -102,6 +102,9 @@

    Module codeflare_sdk.cluster.model

    name: str status: RayClusterStatus + head_cpus: int + head_mem: str + head_gpu: int workers: int worker_mem_min: str worker_mem_max: str @@ -287,7 +290,7 @@

    Class variables

    class RayCluster -(name: str, status: RayClusterStatus, workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, worker_gpu: int, namespace: str, dashboard: str) +(name: str, status: RayClusterStatus, head_cpus: int, head_mem: str, head_gpu: int, workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, worker_gpu: int, namespace: str, dashboard: str)

    For storing information about a Ray cluster.

    @@ -302,6 +305,9 @@

    Class variables

    name: str status: RayClusterStatus + head_cpus: int + head_mem: str + head_gpu: int workers: int worker_mem_min: str worker_mem_max: str @@ -316,6 +322,18 @@

    Class variables

    +
    var head_cpus : int
    +
    +
    +
    +
    var head_gpu : int
    +
    +
    +
    +
    var head_mem : str
    +
    +
    +
    var name : str
    @@ -447,6 +465,9 @@

    RayCluster

    • dashboard
    • +
    • head_cpus
    • +
    • head_gpu
    • +
    • head_mem
    • name
    • namespace
    • status
    • diff --git a/docs/utils/generate_yaml.html b/docs/utils/generate_yaml.html index 6d94e4c71..791898a3b 100644 --- a/docs/utils/generate_yaml.html +++ b/docs/utils/generate_yaml.html @@ -138,35 +138,51 @@

      Module codeflare_sdk.utils.generate_yaml

      def update_custompodresources( - item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers + item, + min_cpu, + max_cpu, + min_memory, + max_memory, + gpu, + workers, + head_cpus, + head_memory, + head_gpus, ): if "custompodresources" in item.keys(): custompodresources = item.get("custompodresources") for i in range(len(custompodresources)): + resource = custompodresources[i] if i == 0: # Leave head node resources as template default - continue - resource = custompodresources[i] - for k, v in resource.items(): - if k == "replicas" and i == 1: - resource[k] = workers - if k == "requests" or k == "limits": - for spec, _ in v.items(): - if spec == "cpu": - if k == "limits": - resource[k][spec] = max_cpu - else: - resource[k][spec] = min_cpu - if spec == "memory": - if k == "limits": - resource[k][spec] = str(max_memory) + "G" - else: - resource[k][spec] = str(min_memory) + "G" - if spec == "nvidia.com/gpu": - if i == 0: - resource[k][spec] = 0 - else: - resource[k][spec] = gpu + resource["requests"]["cpu"] = head_cpus + resource["limits"]["cpu"] = head_cpus + resource["requests"]["memory"] = str(head_memory) + "G" + resource["limits"]["memory"] = str(head_memory) + "G" + resource["requests"]["nvidia.com/gpu"] = head_gpus + resource["limits"]["nvidia.com/gpu"] = head_gpus + + else: + for k, v in resource.items(): + if k == "replicas" and i == 1: + resource[k] = workers + if k == "requests" or k == "limits": + for spec, _ in v.items(): + if spec == "cpu": + if k == "limits": + resource[k][spec] = max_cpu + else: + resource[k][spec] = min_cpu + if spec == "memory": + if k == "limits": + resource[k][spec] = str(max_memory) + "G" + else: + resource[k][spec] = str(min_memory) + "G" + if spec == "nvidia.com/gpu": + if i == 0: + resource[k][spec] = 0 + else: + resource[k][spec] = gpu else: sys.exit("Error: malformed template") @@ -236,11 +252,15 @@

      Module codeflare_sdk.utils.generate_yaml

      instascale, env, image_pull_secrets, + head_cpus, + head_memory, + head_gpus, ): if "generictemplate" in item.keys(): head = item.get("generictemplate").get("spec").get("headGroupSpec") - worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] + head["rayStartParams"]["num-gpus"] = str(int(head_gpus)) + worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] # Head counts as first worker worker["replicas"] = workers worker["minReplicas"] = workers @@ -256,7 +276,9 @@

      Module codeflare_sdk.utils.generate_yaml

      update_env(spec, env) if comp == head: # TODO: Eventually add head node configuration outside of template - continue + update_resources( + spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus + ) else: update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu) @@ -381,6 +403,9 @@

      Module codeflare_sdk.utils.generate_yaml

      def generate_appwrapper( name: str, namespace: str, + head_cpus: int, + head_memory: int, + head_gpus: int, min_cpu: int, max_cpu: int, min_memory: int, @@ -406,7 +431,16 @@

      Module codeflare_sdk.utils.generate_yaml

      update_labels(user_yaml, instascale, instance_types) update_priority(user_yaml, item, dispatch_priority, priority_val) update_custompodresources( - item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers + item, + min_cpu, + max_cpu, + min_memory, + max_memory, + gpu, + workers, + head_cpus, + head_memory, + head_gpus, ) update_nodes( item, @@ -421,6 +455,9 @@

      Module codeflare_sdk.utils.generate_yaml

      instascale, env, image_pull_secrets, + head_cpus, + head_memory, + head_gpus, ) update_dashboard_route(route_item, cluster_name, namespace) if local_interactive: @@ -577,7 +614,7 @@

      Functions

    -def generate_appwrapper(name: str, namespace: str, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, instance_types: list, env, local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int) +def generate_appwrapper(name: str, namespace: str, head_cpus: int, head_memory: int, head_gpus: int, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, instance_types: list, env, local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int)
    @@ -588,6 +625,9 @@

    Functions

    def generate_appwrapper(
         name: str,
         namespace: str,
    +    head_cpus: int,
    +    head_memory: int,
    +    head_gpus: int,
         min_cpu: int,
         max_cpu: int,
         min_memory: int,
    @@ -613,7 +653,16 @@ 

    Functions

    update_labels(user_yaml, instascale, instance_types) update_priority(user_yaml, item, dispatch_priority, priority_val) update_custompodresources( - item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers + item, + min_cpu, + max_cpu, + min_memory, + max_memory, + gpu, + workers, + head_cpus, + head_memory, + head_gpus, ) update_nodes( item, @@ -628,6 +677,9 @@

    Functions

    instascale, env, image_pull_secrets, + head_cpus, + head_memory, + head_gpus, ) update_dashboard_route(route_item, cluster_name, namespace) if local_interactive: @@ -700,7 +752,7 @@

    Functions

    -def update_custompodresources(item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers) +def update_custompodresources(item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus)
    @@ -709,35 +761,51 @@

    Functions

    Expand source code
    def update_custompodresources(
    -    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
    +    item,
    +    min_cpu,
    +    max_cpu,
    +    min_memory,
    +    max_memory,
    +    gpu,
    +    workers,
    +    head_cpus,
    +    head_memory,
    +    head_gpus,
     ):
         if "custompodresources" in item.keys():
             custompodresources = item.get("custompodresources")
             for i in range(len(custompodresources)):
    +            resource = custompodresources[i]
                 if i == 0:
                     # Leave head node resources as template default
    -                continue
    -            resource = custompodresources[i]
    -            for k, v in resource.items():
    -                if k == "replicas" and i == 1:
    -                    resource[k] = workers
    -                if k == "requests" or k == "limits":
    -                    for spec, _ in v.items():
    -                        if spec == "cpu":
    -                            if k == "limits":
    -                                resource[k][spec] = max_cpu
    -                            else:
    -                                resource[k][spec] = min_cpu
    -                        if spec == "memory":
    -                            if k == "limits":
    -                                resource[k][spec] = str(max_memory) + "G"
    -                            else:
    -                                resource[k][spec] = str(min_memory) + "G"
    -                        if spec == "nvidia.com/gpu":
    -                            if i == 0:
    -                                resource[k][spec] = 0
    -                            else:
    -                                resource[k][spec] = gpu
    +                resource["requests"]["cpu"] = head_cpus
    +                resource["limits"]["cpu"] = head_cpus
    +                resource["requests"]["memory"] = str(head_memory) + "G"
    +                resource["limits"]["memory"] = str(head_memory) + "G"
    +                resource["requests"]["nvidia.com/gpu"] = head_gpus
    +                resource["limits"]["nvidia.com/gpu"] = head_gpus
    +
    +            else:
    +                for k, v in resource.items():
    +                    if k == "replicas" and i == 1:
    +                        resource[k] = workers
    +                    if k == "requests" or k == "limits":
    +                        for spec, _ in v.items():
    +                            if spec == "cpu":
    +                                if k == "limits":
    +                                    resource[k][spec] = max_cpu
    +                                else:
    +                                    resource[k][spec] = min_cpu
    +                            if spec == "memory":
    +                                if k == "limits":
    +                                    resource[k][spec] = str(max_memory) + "G"
    +                                else:
    +                                    resource[k][spec] = str(min_memory) + "G"
    +                            if spec == "nvidia.com/gpu":
    +                                if i == 0:
    +                                    resource[k][spec] = 0
    +                                else:
    +                                    resource[k][spec] = gpu
         else:
             sys.exit("Error: malformed template")
    @@ -855,7 +923,7 @@

    Functions

    -def update_nodes(item, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, instascale, env, image_pull_secrets) +def update_nodes(item, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, instascale, env, image_pull_secrets, head_cpus, head_memory, head_gpus)
    @@ -876,11 +944,15 @@

    Functions

    instascale, env, image_pull_secrets, + head_cpus, + head_memory, + head_gpus, ): if "generictemplate" in item.keys(): head = item.get("generictemplate").get("spec").get("headGroupSpec") - worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] + head["rayStartParams"]["num-gpus"] = str(int(head_gpus)) + worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] # Head counts as first worker worker["replicas"] = workers worker["minReplicas"] = workers @@ -896,7 +968,9 @@

    Functions

    update_env(spec, env) if comp == head: # TODO: Eventually add head node configuration outside of template - continue + update_resources( + spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus + ) else: update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)