Skip to content

Commit

Permalink
CLI Layout and Create RayCluster function (#227)
Browse files Browse the repository at this point in the history
* Create: base and file layout for CLI

* Add: Create raycluster command for CLI

* Refactor: refactor CLI using pre-commit

* Test: unit tests for create raycluster function in the CLI

* Update: update egg-info with more paths

* Change: change Framework Cluster to RayCluster

* merge: rebase with main

* Fix: unit tests

* Change: create cluster to define cluster in unit tests

* Add: error handling for invalid command

* test: change tests so cli cluster definition has its own yaml file
  • Loading branch information
carsonmh authored and Maxusmusti committed Jul 31, 2023
1 parent 97bf513 commit 2f835e0
Show file tree
Hide file tree
Showing 10 changed files with 330 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ We use pre-commit to make sure the code is consistently formatted. To make sure
- To run the unit tests, run `pytest -v tests/unit_test.py`
- Any new test functions/scripts can be added into the `tests` folder
- NOTE: Functional tests coming soon, will live in `tests/func_test.py`
- To test the CLI, run `codeflare` followed by any command. To see the list of available commands, run `codeflare` with no arguments.

#### Code Coverage

Expand Down
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ kubernetes = ">= 25.3.0, < 27"
codeflare-torchx = "0.6.0.dev0"
cryptography = "40.0.2"
executing = "1.2.0"
click = "8.0.4"

[tool.poetry.group.docs]
optional = true
Expand All @@ -39,3 +40,10 @@ pdoc3 = "0.10.0"
pytest = "7.4.0"
coverage = "7.2.7"
pytest-mock = "3.11.1"

[tool.poetry.scripts]
codeflare = "codeflare_sdk.cli.codeflare_cli:cli"

[build-system]
requires = ["poetry_core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ codeflare-torchx==0.6.0.dev0
pydantic<2 # 2.0+ broke ray[default] see detail: https://github.com/ray-project/ray/pull/37000
cryptography==40.0.2
executing==1.2.0
click==8.0.4
4 changes: 4 additions & 0 deletions src/codeflare_sdk.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@ src/codeflare_sdk/job/jobs.py
src/codeflare_sdk/utils/__init__.py
src/codeflare_sdk/utils/generate_yaml.py
src/codeflare_sdk/utils/pretty_print.py
src/codeflare_sdk/cli/__init__.py
src/codeflare_sdk/cli/codeflare_cli.py
src/codeflare_sdk/cli/commands/define.py
src/codeflare_sdk/cli/cli_utils.py
Empty file.
12 changes: 12 additions & 0 deletions src/codeflare_sdk/cli/cli_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import ast
import click


class PythonLiteralOption(click.Option):
    """Click option whose raw string value is parsed as a Python literal.

    Lets list- or dict-valued options be written on the command line as
    Python literals, e.g. ``--machine_types='["cpu.small", "gpu.large"]'``.
    """

    def type_cast_value(self, ctx, value):
        """Convert the raw option string into a Python object.

        Args:
            ctx: The active click context (unused here).
            value: Raw string from the command line, or a falsy value when
                the option was not supplied.

        Returns:
            ``None`` when ``value`` is falsy; otherwise the object produced
            by ``ast.literal_eval(value)``.

        Raises:
            click.BadParameter: If ``value`` is not a valid Python literal.
        """
        if not value:
            return None
        try:
            return ast.literal_eval(value)
        except (
            ValueError,
            TypeError,
            SyntaxError,
            MemoryError,
            RecursionError,
        ) as err:
            # Only the failures literal_eval documents are treated as bad
            # user input; anything else (e.g. KeyboardInterrupt) propagates.
            raise click.BadParameter(value) from err
36 changes: 36 additions & 0 deletions src/codeflare_sdk/cli/codeflare_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import click
import sys
import os

cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands"))


class CodeflareCLI(click.MultiCommand):
    """Multi-command that discovers its subcommands from ``cmd_folder``.

    Every ``<name>.py`` file in ``cmd_folder`` other than ``__init__.py`` is
    exposed as the subcommand ``<name>``. The file is expected to define a
    click command object named ``cli``.
    """

    def list_commands(self, ctx):
        """Return the sorted names of the command files in ``cmd_folder``."""
        return sorted(
            filename[:-3]
            for filename in os.listdir(cmd_folder)
            if filename.endswith(".py") and filename != "__init__.py"
        )

    def get_command(self, ctx, name):
        """Load and return the click command registered under ``name``.

        Args:
            ctx: The active click context.
            name: Subcommand name as typed by the user.

        Returns:
            The ``cli`` object defined by ``<cmd_folder>/<name>.py``, or
            ``None`` when ``name`` is not an advertised command, the file
            cannot be found, or the file does not define ``cli``.
        """
        # Only load files that list_commands() advertises. This rejects
        # names containing path separators (e.g. "../x") and "__init__",
        # which would otherwise be executed straight from disk.
        if name not in self.list_commands(ctx):
            return None

        fn = os.path.join(cmd_folder, name + ".py")
        try:
            with open(fn, encoding="utf-8") as f:
                code = compile(f.read(), fn, "exec")
        except FileNotFoundError:
            return None

        # The code object is compiled in "exec" mode, so run it with exec().
        # It is executed outside the try block so that a FileNotFoundError
        # raised by the command module itself is not mistaken for a missing
        # command.
        ns = {}
        exec(code, ns, ns)
        return ns.get("cli")


@click.command(cls=CodeflareCLI)
@click.pass_context
def cli(ctx):
    # Root command: the body is intentionally empty because dispatch to
    # subcommands is handled by the CodeflareCLI command class. A comment is
    # used instead of a docstring so the command's help text stays unchanged.
    return None


if __name__ == "__main__":
    # Allow running this module directly as a script.
    cli()
36 changes: 36 additions & 0 deletions src/codeflare_sdk/cli/commands/define.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import click

from codeflare_sdk.cluster.cluster import Cluster
from codeflare_sdk.cluster.config import ClusterConfiguration
from codeflare_sdk.cli.cli_utils import PythonLiteralOption


@click.group()
def cli():
    """Define a resource with parameter specifications"""
    # Container group only; subcommands attach themselves via @cli.command().


@cli.command()
@click.option("--name", type=str, required=True)
@click.option("--namespace", "-n", type=str)
@click.option("--head_info", cls=PythonLiteralOption, type=list)
@click.option("--machine_types", cls=PythonLiteralOption, type=list)
@click.option("--min_cpus", type=int)
@click.option("--max_cpus", type=int)
@click.option("--min_worker", type=int)
@click.option("--max_worker", type=int)
@click.option("--min_memory", type=int)
@click.option("--max_memory", type=int)
@click.option("--gpu", type=int)
@click.option("--template", type=str)
@click.option("--instascale", type=bool)
@click.option("--envs", cls=PythonLiteralOption, type=dict)
@click.option("--image", type=str)
@click.option("--local_interactive", type=bool)
@click.option("--image_pull_secrets", cls=PythonLiteralOption, type=list)
def raycluster(**kwargs):
    """Define a RayCluster with parameter specifications"""
    # Forward only the options that carry a value; options left as None are
    # dropped (presumably so ClusterConfiguration applies its own defaults;
    # confirm against its definition).
    supplied_options = {}
    for option_name, option_value in kwargs.items():
        if option_value is not None:
            supplied_options[option_name] = option_value

    cluster_config = ClusterConfiguration(**supplied_options)
    Cluster(cluster_config)  # Creates yaml file
195 changes: 195 additions & 0 deletions tests/cli-test-case.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
labels:
orderedinstance: cpu.small_gpu.large
name: cli-test-cluster
namespace: ns
spec:
priority: 9
resources:
GenericItems:
- custompodresources:
- limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
replicas: 1
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
- limits:
cpu: 4
memory: 6G
nvidia.com/gpu: 7
replicas: 2
requests:
cpu: 3
memory: 5G
nvidia.com/gpu: 7
generictemplate:
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
labels:
appwrapper.mcad.ibm.com: cli-test-cluster
controller-tools.k8s.io: '1.0'
name: cli-test-cluster
namespace: ns
spec:
autoscalerOptions:
idleTimeoutSeconds: 60
imagePullPolicy: Always
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 500m
memory: 512Mi
upscalingMode: Default
enableInTreeAutoscaling: false
headGroupSpec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
num-gpus: '0'
serviceType: ClusterIP
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cli-test-cluster
operator: In
values:
- cli-test-cluster
containers:
- env:
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: RAY_USE_TLS
value: '0'
- name: RAY_TLS_SERVER_CERT
value: /home/ray/workspace/tls/server.crt
- name: RAY_TLS_SERVER_KEY
value: /home/ray/workspace/tls/server.key
- name: RAY_TLS_CA_CERT
value: /home/ray/workspace/tls/ca.crt
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
imagePullPolicy: Always
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- ray stop
name: ray-head
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
imagePullSecrets:
- name: cli-test-pull-secret
rayVersion: 2.1.0
workerGroupSpecs:
- groupName: small-group-cli-test-cluster
maxReplicas: 2
minReplicas: 2
rayStartParams:
block: 'true'
num-gpus: '7'
replicas: 2
template:
metadata:
annotations:
key: value
labels:
key: value
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cli-test-cluster
operator: In
values:
- cli-test-cluster
containers:
- env:
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: RAY_USE_TLS
value: '0'
- name: RAY_TLS_SERVER_CERT
value: /home/ray/workspace/tls/server.crt
- name: RAY_TLS_SERVER_KEY
value: /home/ray/workspace/tls/server.key
- name: RAY_TLS_CA_CERT
value: /home/ray/workspace/tls/ca.crt
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- ray stop
name: machine-learning
resources:
limits:
cpu: 4
memory: 6G
nvidia.com/gpu: 7
requests:
cpu: 3
memory: 5G
nvidia.com/gpu: 7
imagePullSecrets:
- name: cli-test-pull-secret
initContainers:
- command:
- sh
- -c
- until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
do echo waiting for myservice; sleep 2; done
image: busybox:1.28
name: init-myservice
replicas: 1
- generictemplate:
apiVersion: route.openshift.io/v1
kind: Route
metadata:
labels:
odh-ray-cluster-service: cli-test-cluster-head-svc
name: ray-dashboard-cli-test-cluster
namespace: ns
spec:
port:
targetPort: dashboard
to:
kind: Service
name: cli-test-cluster-head-svc
replica: 1
Items: []
37 changes: 37 additions & 0 deletions tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import filecmp
import os
import re
from click.testing import CliRunner

parent = Path(__file__).resolve().parents[1]
sys.path.append(str(parent) + "/src")
Expand Down Expand Up @@ -63,6 +64,7 @@
generate_tls_cert,
export_env,
)
from codeflare_sdk.cli.codeflare_cli import cli

import openshift
from openshift.selector import Selector
Expand All @@ -75,6 +77,37 @@
import yaml


# CLI testing
def test_cli_working():
    """Invoking the root command with no arguments exits successfully."""
    outcome = CliRunner().invoke(cli)
    assert outcome.exit_code == 0


def test_cluster_definition_cli():
    """`define raycluster` writes the expected AppWrapper yaml file.

    Runs the command through click's CliRunner, then checks the exit code,
    the message printed, and that the generated ``cli-test-cluster.yaml``
    matches the ``tests/cli-test-case.yaml`` fixture.
    """
    runner = CliRunner()
    define_cluster_command = """
        define raycluster
        --name=cli-test-cluster
        --namespace=ns
        --min_worker=1
        --max_worker=2
        --min_cpus=3
        --max_cpus=4
        --min_memory=5
        --max_memory=6
        --gpu=7
        --instascale=True
        --machine_types='["cpu.small", "gpu.large"]'
        --image_pull_secrets='["cli-test-pull-secret"]'
        """
    result = runner.invoke(cli, define_cluster_command)
    # Check the exit code first so a crash is reported as such (with the
    # captured output) rather than as an opaque output mismatch.
    assert result.exit_code == 0, result.output
    assert result.output == "Written to: cli-test-cluster.yaml\n"
    # shallow=False forces a content comparison; a shallow compare accepts
    # files whose stat signatures match without reading them.
    assert filecmp.cmp(
        "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=False
    )


# For mocking openshift client results.
# NOTE(review): presumably handed back by patched openshift calls in tests
# elsewhere in this module; its usage is outside this view, so confirm.
fake_res = openshift.Result("fake")

Expand Down Expand Up @@ -2221,8 +2254,12 @@ def test_cleanup():
os.remove("unit-test-default-cluster.yaml")
os.remove("test.yaml")
os.remove("raytest2.yaml")
os.remove("quicktest.yaml")
# The directory can only be removed once its files are gone.
os.remove("tls-cluster-namespace/ca.crt")
os.remove("tls-cluster-namespace/tls.crt")
os.remove("tls-cluster-namespace/tls.key")
os.rmdir("tls-cluster-namespace")
# Written by the CLI cluster-definition test.
os.remove("cli-test-cluster.yaml")

0 comments on commit 2f835e0

Please sign in to comment.