Skip to content

Commit 5fa92d1

Browse files
committed
docs: enhance ray module code documentation
1 parent acd29a3 commit 5fa92d1

File tree

3 files changed

+261
-58
lines changed

3 files changed

+261
-58
lines changed

src/codeflare_sdk/ray/client/ray_jobs.py

+115-30
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,28 @@
2424

2525
class RayJobClient:
2626
"""
27-
A class that functions as a wrapper for the Ray Job Submission Client.
28-
29-
parameters:
30-
address -- Either (1) the address of the Ray cluster, or (2) the HTTP address of the dashboard server on the head node, e.g. “http://<head-node-ip>:8265”. In case (1) it must be specified as an address that can be passed to ray.init(),
31-
e.g. a Ray Client address (ray://<head_node_host>:10001), or “auto”, or “localhost:<port>”. If unspecified, will try to connect to a running local Ray cluster. This argument is always overridden by the RAY_ADDRESS environment variable.
32-
create_cluster_if_needed -- Indicates whether the cluster at the specified address needs to already be running. Ray doesn't start a cluster before interacting with jobs, but third-party job managers may do so.
33-
cookies -- Cookies to use when sending requests to the HTTP job server.
34-
metadata -- Arbitrary metadata to store along with all jobs. New metadata specified per job will be merged with the global metadata provided here via a simple dict update.
35-
headers -- Headers to use when sending requests to the HTTP job server, used for cases like authentication to a remote cluster.
36-
verify -- Boolean indication to verify the server's TLS certificate or a path to a file or directory of trusted certificates. Default: True.
27+
A wrapper class for the Ray Job Submission Client, used for interacting with Ray clusters to manage job
28+
submissions, deletions, and other job-related information.
29+
30+
Args:
31+
address (Optional[str]):
32+
The Ray cluster's address, which may be either the Ray Client address, HTTP address
33+
of the dashboard server on the head node, or "auto" / "localhost:<port>" for a local cluster.
34+
This is overridden by the RAY_ADDRESS environment variable if set.
35+
create_cluster_if_needed (bool):
36+
If True, a new cluster will be created if not already running at the
37+
specified address. By default, Ray requires an existing cluster.
38+
cookies (Optional[Dict[str, Any]]):
39+
HTTP cookies to send with requests to the job server.
40+
metadata (Optional[Dict[str, Any]]):
41+
Global metadata to store with all jobs, merged with job-specific
42+
metadata during job submission.
43+
headers (Optional[Dict[str, Any]]):
44+
HTTP headers to send with requests to the job server, used for
45+
authentication.
46+
verify (Optional[Union[str, bool]]):
47+
If True, verifies the server's TLS certificate. Can also be a path
48+
to trusted certificates. Default is True.
3749
"""
3850

3951
def __init__(
@@ -67,18 +79,35 @@ def submit_job(
6779
entrypoint_resources: Optional[Dict[str, float]] = None,
6880
) -> str:
6981
"""
70-
Method for submitting jobs to a Ray Cluster and returning the job id with entrypoint being a mandatory field.
71-
72-
Parameters:
73-
entrypoint -- The shell command to run for this job.
74-
submission_id -- A unique ID for this job.
75-
runtime_env -- The runtime environment to install and run this job in.
76-
metadata -- Arbitrary data to store along with this job.
77-
job_id -- DEPRECATED. This has been renamed to submission_id
78-
entrypoint_num_cpus -- The quantity of CPU cores to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0.
79-
entrypoint_num_gpus -- The quantity of GPUs to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0.
80-
entrypoint_memory –- The quantity of memory to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0.
81-
entrypoint_resources -- The quantity of custom resources to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it.
82+
Submits a job to the Ray cluster with specified resources and returns the job ID.
83+
84+
Args:
85+
entrypoint (str):
86+
The command to execute for this job.
87+
job_id (Optional[str]):
88+
Deprecated, use `submission_id`. A unique job identifier.
89+
runtime_env (Optional[Dict[str, Any]]):
90+
The runtime environment for this job.
91+
metadata (Optional[Dict[str, str]]):
92+
Metadata associated with the job, merged with global metadata.
93+
submission_id (Optional[str]):
94+
Unique ID for the job submission.
95+
entrypoint_num_cpus (Optional[Union[int, float]]):
96+
The quantity of CPU cores to reserve for the execution of the entrypoint command,
97+
separately from any tasks or actors launched by it. Defaults to 0.
98+
entrypoint_num_gpus (Optional[Union[int, float]]):
99+
The quantity of GPUs to reserve for the execution of the entrypoint command,
100+
separately from any tasks or actors launched by it. Defaults to 0.
101+
entrypoint_memory (Optional[int]):
102+
The quantity of memory to reserve for the execution of the entrypoint command,
103+
separately from any tasks or actors launched by it. Defaults to 0.
104+
entrypoint_resources (Optional[Dict[str, float]]):
105+
The quantity of custom resources to reserve for the execution of the entrypoint command,
106+
separately from any tasks or actors launched by it.
107+
108+
Returns:
109+
str:
110+
The unique identifier for the submitted job.
82111
"""
83112
return self.rayJobClient.submit_job(
84113
entrypoint=entrypoint,
@@ -94,7 +123,15 @@ def submit_job(
94123

95124
def delete_job(self, job_id: str) -> (bool, str):
96125
"""
97-
Method for deleting jobs with the job id being a mandatory field.
126+
Deletes a job by job ID.
127+
128+
Args:
129+
job_id (str):
130+
The unique identifier of the job to delete.
131+
132+
Returns:
133+
tuple(bool, str):
134+
A tuple with deletion status and a message.
98135
"""
99136
deletion_status = self.rayJobClient.delete_job(job_id=job_id)
100137

@@ -107,37 +144,77 @@ def delete_job(self, job_id: str) -> (bool, str):
107144

108145
def get_address(self) -> str:
109146
"""
110-
Method for getting the address from the RayJobClient
147+
Retrieves the address of the connected Ray cluster.
148+
149+
Returns:
150+
str:
151+
The Ray cluster's address.
111152
"""
112153
return self.rayJobClient.get_address()
113154

114155
def get_job_info(self, job_id: str):
115156
"""
116-
Method for getting the job info with the job id being a mandatory field.
157+
Fetches information about a job by job ID.
158+
159+
Args:
160+
job_id (str):
161+
The unique identifier of the job.
162+
163+
Returns:
164+
JobInfo:
165+
Information about the job's status, progress, and other details.
117166
"""
118167
return self.rayJobClient.get_job_info(job_id=job_id)
119168

120169
def get_job_logs(self, job_id: str) -> str:
121170
"""
122-
Method for getting the job logs with the job id being a mandatory field.
171+
Retrieves the logs for a specific job by job ID.
172+
173+
Args:
174+
job_id (str):
175+
The unique identifier of the job.
176+
177+
Returns:
178+
str:
179+
Logs output from the job.
123180
"""
124181
return self.rayJobClient.get_job_logs(job_id=job_id)
125182

126183
def get_job_status(self, job_id: str) -> str:
127184
"""
128-
Method for getting the job's status with the job id being a mandatory field.
185+
Fetches the current status of a job by job ID.
186+
187+
Args:
188+
job_id (str):
189+
The unique identifier of the job.
190+
191+
Returns:
192+
str:
193+
The job's status.
129194
"""
130195
return self.rayJobClient.get_job_status(job_id=job_id)
131196

132197
def list_jobs(self) -> List[JobDetails]:
133198
"""
134-
Method for getting a list of current jobs in the Ray Cluster.
199+
Lists all current jobs in the Ray cluster.
200+
201+
Returns:
202+
List[JobDetails]:
203+
A list of job details for each current job in the cluster.
135204
"""
136205
return self.rayJobClient.list_jobs()
137206

138207
def stop_job(self, job_id: str) -> (bool, str):
139208
"""
140-
Method for stopping a job with the job id being a mandatory field.
209+
Stops a running job by job ID.
210+
211+
Args:
212+
job_id (str):
213+
The unique identifier of the job to stop.
214+
215+
Returns:
216+
tuple(bool, str):
217+
A tuple with the stop status and a message.
141218
"""
142219
stop_job_status = self.rayJobClient.stop_job(job_id=job_id)
143220
if stop_job_status:
@@ -148,6 +225,14 @@ def stop_job(self, job_id: str) -> (bool, str):
148225

149226
def tail_job_logs(self, job_id: str) -> Iterator[str]:
150227
"""
151-
Method for getting an iterator that follows the logs of a job with the job id being a mandatory field.
228+
Continuously streams the logs of a job.
229+
230+
Args:
231+
job_id (str):
232+
The unique identifier of the job.
233+
234+
Returns:
235+
Iterator[str]:
236+
An iterator that yields log entries in real-time.
152237
"""
153238
return self.rayJobClient.tail_job_logs(job_id=job_id)

src/codeflare_sdk/ray/cluster/cluster.py

+95-2
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,17 @@ def status(
292292
return status, ready
293293

294294
def is_dashboard_ready(self) -> bool:
295+
"""
296+
Checks if the cluster's dashboard is ready and accessible.
297+
298+
This method attempts to send a GET request to the cluster dashboard URI.
299+
If the request is successful (HTTP status code 200), it returns True.
300+
If an SSL error occurs, it returns False, indicating the dashboard is not ready.
301+
302+
Returns:
303+
bool:
304+
True if the dashboard is ready, False otherwise.
305+
"""
295306
try:
296307
response = requests.get(
297308
self.cluster_dashboard_uri(),
@@ -309,8 +320,22 @@ def is_dashboard_ready(self) -> bool:
309320

310321
def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True):
311322
"""
312-
Waits for requested cluster to be ready, up to an optional timeout (s).
313-
Checks every five seconds.
323+
Waits for the requested cluster to be ready, up to an optional timeout.
324+
325+
This method checks the status of the cluster every five seconds until it is
326+
ready or the timeout is reached. If dashboard_check is enabled, it will also
327+
check for the readiness of the dashboard.
328+
329+
Args:
330+
timeout (Optional[int]):
331+
The maximum time to wait for the cluster to be ready in seconds. If None, waits indefinitely.
332+
dashboard_check (bool):
333+
Flag to determine if the dashboard readiness should
334+
be checked. Defaults to True.
335+
336+
Raises:
337+
TimeoutError:
338+
If the timeout is reached before the cluster or dashboard is ready.
314339
"""
315340
print("Waiting for requested resources to be set up...")
316341
time = 0
@@ -342,6 +367,21 @@ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True
342367
time += 5
343368

344369
def details(self, print_to_console: bool = True) -> RayCluster:
370+
"""
371+
Retrieves details about the Ray Cluster.
372+
373+
This method returns a copy of the Ray Cluster information and optionally prints
374+
the details to the console.
375+
376+
Args:
377+
print_to_console (bool):
378+
Flag to determine if the cluster details should be
379+
printed to the console. Defaults to True.
380+
381+
Returns:
382+
RayCluster:
383+
A copy of the Ray Cluster details.
384+
"""
345385
cluster = _copy_to_ray(self)
346386
if print_to_console:
347387
pretty_print.print_clusters([cluster])
@@ -448,6 +488,24 @@ def from_k8_cluster_object(
448488
write_to_file=False,
449489
verify_tls=True,
450490
):
491+
"""
492+
Creates a Cluster instance from a Kubernetes cluster object.
493+
494+
Args:
495+
rc (dict):
496+
The Ray cluster object from Kubernetes.
497+
appwrapper (bool):
498+
Flag to indicate whether to use app wrapper.
499+
write_to_file (bool):
500+
Flag to write configuration to file.
501+
verify_tls (bool):
502+
Flag to verify TLS.
503+
504+
Returns:
505+
Cluster:
506+
A configured Cluster instance.
507+
"""
508+
451509
config_check()
452510
machine_types = (
453511
rc["metadata"]["labels"]["orderedinstance"].split("_")
@@ -504,6 +562,13 @@ def from_k8_cluster_object(
504562
return Cluster(cluster_config)
505563

506564
def local_client_url(self):
565+
"""
566+
Constructs the URL for the local Ray client.
567+
568+
Returns:
569+
str:
570+
The Ray client URL based on the ingress domain.
571+
"""
507572
ingress_domain = _get_ingress_domain(self)
508573
return f"ray://{ingress_domain}"
509574

@@ -574,6 +639,13 @@ def list_all_queued(
574639

575640

576641
def get_current_namespace(): # pragma: no cover
642+
"""
643+
Retrieves the current Kubernetes namespace.
644+
645+
Returns:
646+
str:
647+
The current namespace or None if not found.
648+
"""
577649
if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
578650
try:
579651
file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r")
@@ -598,6 +670,27 @@ def get_cluster(
598670
write_to_file: bool = False,
599671
verify_tls: bool = True,
600672
):
673+
"""
674+
Retrieves a specific cluster by name from a given namespace.
675+
676+
Args:
677+
cluster_name (str):
678+
The name of the cluster to retrieve.
679+
namespace (str):
680+
The Kubernetes namespace where the cluster is located.
681+
write_to_file (bool):
682+
Flag to write configuration to file.
683+
verify_tls (bool):
684+
Flag to verify TLS.
685+
686+
Returns:
687+
Cluster:
688+
The requested Cluster instance.
689+
690+
Raises:
691+
FileNotFoundError:
692+
If the cluster is not found in the specified namespace.
693+
"""
601694
try:
602695
config_check()
603696
api_instance = client.CustomObjectsApi(get_api_client())

0 commit comments

Comments
 (0)