From d9badb60b0c104ec9d19e9e102c7c33c82462ec0 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Fri, 22 Sep 2023 08:30:06 +0200 Subject: [PATCH 1/4] Bind Ray ClusterRole to the new operator ServiceAccount --- .../rbac/mcad-controller-ray-clusterrolebinding.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codeflare-stack/rbac/mcad-controller-ray-clusterrolebinding.yaml b/codeflare-stack/rbac/mcad-controller-ray-clusterrolebinding.yaml index 172e7d25..a3931da0 100644 --- a/codeflare-stack/rbac/mcad-controller-ray-clusterrolebinding.yaml +++ b/codeflare-stack/rbac/mcad-controller-ray-clusterrolebinding.yaml @@ -4,8 +4,8 @@ metadata: name: mcad-controller-ray-clusterrolebinding subjects: - kind: ServiceAccount - name: mcad-controller-mcad - namespace: $(namespace) + name: codeflare-operator-controller-manager + namespace: openshift-operators roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole From 57a1b2b50ea7a90bf3cbe9a4f9caccae76157e7f Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Wed, 20 Sep 2023 13:04:59 +0200 Subject: [PATCH 2/4] Backport changes in ODH manifest tests --- tests/basictests/distributed-workloads.sh | 32 +++++++++++---------- tests/basictests/ray.sh | 3 +- tests/resources/codeflare-subscription.yaml | 2 +- tests/resources/custom-nb-small.yaml | 6 ++-- tests/resources/mnist_ray_mini.ipynb | 3 +- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/tests/basictests/distributed-workloads.sh b/tests/basictests/distributed-workloads.sh index 412c9d00..9cd119ae 100755 --- a/tests/basictests/distributed-workloads.sh +++ b/tests/basictests/distributed-workloads.sh @@ -33,10 +33,10 @@ function test_mcad_torchx_functionality() { ########### Clean Cluster should be free of these resources ############ # Get appwrapper name - AW=$(oc get appwrapper -n ${ODHPROJECT} | grep mnistjob | cut -d ' ' -f 1) || true + AW=$(oc get appwrapper.workload.codeflare.dev -n ${ODHPROJECT} | grep mnistjob | cut -d ' ' -f 1) || true # Clean up resources if [[ -n $AW ]]; then - os::cmd::expect_success "oc delete appwrapper $AW -n ${ODHPROJECT} || true" + os::cmd::expect_success "oc delete appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT} || true" fi os::cmd::expect_success "oc delete notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT} || true" os::cmd::expect_success "oc delete cm notebooks-mcad -n ${ODHPROJECT} || true" @@ -50,7 +50,9 @@ function test_mcad_torchx_functionality() { os::cmd::expect_success "oc create configmap notebooks-mcad -n ${ODHPROJECT} --from-file=${RESOURCEDIR}/mnist_mcad_mini.ipynb" # Get Token - local TESTUSER_BEARER_TOKEN="$(curl -skiL -u $TEST_USER:$TEST_PASS -H 'X-CSRF-Token: xxx' "$OPENSHIFT_OAUTH_ENDPOINT/oauth/authorize?response_type=token&client_id=openshift-challenging-client" | grep -oE 'access_token=[^&]*'| sed 's/access_token=//')" + local TESTUSER_BEARER_TOKEN="$(curl -skiL -u $TEST_USER:$TEST_PASS -H 'X-CSRF-Token: xxx' "$OPENSHIFT_OAUTH_ENDPOINT/oauth/authorize?response_type=token&client_id=openshift-challenging-client" | grep -oP 'access_token=\K[^&]*')" + + # Spawn notebook-server using the codeflare custom nb image os::cmd::expect_success "cat ${RESOURCEDIR}/custom-nb-small.yaml \ | sed s/%INGRESS%/$(oc get ingresses.config/cluster -o jsonpath={.spec.domain})/g \ | sed s/%OCPSERVER%/$(oc whoami --show-server=true|cut -f3 -d "/")/g \ @@ -62,16 +64,16 @@ function test_mcad_torchx_functionality() { os::cmd::try_until_text "oc get pod -n ${ODHPROJECT} | grep "jupyter-nb-kube-3aadmin" | awk '{print \$2}'" "2/2" $odhdefaulttimeout $odhdefaultinterval # Wait for appwrapper to exist - os::cmd::try_until_text "oc get appwrapper -n ${ODHPROJECT} | grep mnistjob" "mnistjob-*" $odhdefaulttimeout $odhdefaultinterval + os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev -n ${ODHPROJECT} | grep mnistjob" "mnistjob-*" $odhdefaulttimeout $odhdefaultinterval # Get appwrapper name - AW=$(oc get appwrapper -n ${ODHPROJECT} | grep mnistjob | cut -d ' ' -f 1) + AW=$(oc get appwrapper.workload.codeflare.dev -n ${ODHPROJECT} | grep mnistjob | cut -d ' ' -f 1) # Wait for the mnisttest appwrapper state to become running - os::cmd::try_until_text "oc get appwrapper $AW -n ${ODHPROJECT} -ojsonpath='{.status.state}'" "Running" $odhdefaulttimeout $odhdefaultinterval + os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT} -ojsonpath='{.status.state}'" "Running" $odhdefaulttimeout $odhdefaultinterval # Wait for workload to succeed and clean up - os::cmd::try_until_text "oc get appwrapper $AW -n ${ODHPROJECT}" ".*NotFound.*" $odhdefaulttimeout $odhdefaultinterval + os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT}" "*NotFound*" $odhdefaulttimeout $odhdefaultinterval # Test clean up resources os::cmd::expect_success "oc delete notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT}" @@ -80,8 +82,8 @@ function test_mcad_torchx_functionality() { os::cmd::expect_success "oc delete cm notebooks-mcad -n ${ODHPROJECT} || true" os::cmd::expect_failure "oc get cm notebooks-mcad -n ${ODHPROJECT}" - os::cmd::expect_success "oc delete appwrapper $AW -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get appwrapper $AW -n ${ODHPROJECT}" + os::cmd::expect_success "oc delete appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT} || true" + os::cmd::expect_failure "oc get appwrapper.workload.codeflare.dev $AW -n ${ODHPROJECT}" os::cmd::expect_success "oc delete pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT} || true" os::cmd::expect_failure "oc get pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT}" @@ -94,7 +96,7 @@ function test_mcad_ray_functionality() { # Clean up resources os::cmd::expect_success "oc delete notebook jupyter-nb-kube-3aadmin -n ${ODHPROJECT} || true" os::cmd::expect_success "oc delete cm notebooks-ray -n ${ODHPROJECT} || true" - os::cmd::expect_success "oc delete appwrapper mnisttest -n ${ODHPROJECT} || true" + os::cmd::expect_success "oc delete appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT} || true" os::cmd::expect_success "oc delete raycluster mnisttest -n ${ODHPROJECT} || true" os::cmd::expect_success "oc delete pvc jupyterhub-nb-kube-3aadmin-pvc -n ${ODHPROJECT} || true" ############################################################################## @@ -106,7 +108,7 @@ function test_mcad_ray_functionality() { os::cmd::expect_success "oc create configmap notebooks-ray -n ${ODHPROJECT} --from-file=${RESOURCEDIR}/mnist_ray_mini.ipynb --from-file=${RESOURCEDIR}/mnist.py --from-file=${RESOURCEDIR}/requirements.txt" # Get Token - local TESTUSER_BEARER_TOKEN="$(curl -skiL -u $TEST_USER:$TEST_PASS -H 'X-CSRF-Token: xxx' "$OPENSHIFT_OAUTH_ENDPOINT/oauth/authorize?response_type=token&client_id=openshift-challenging-client" | grep -oE 'access_token=[^&]*'| sed 's/access_token=//')" + local TESTUSER_BEARER_TOKEN="$(curl -skiL -u $TEST_USER:$TEST_PASS -H 'X-CSRF-Token: xxx' "$OPENSHIFT_OAUTH_ENDPOINT/oauth/authorize?response_type=token&client_id=openshift-challenging-client" | grep -oP 'access_token=\K[^&]*')" # Spawn notebook-server using the codeflare custom nb image os::cmd::expect_success "cat ${RESOURCEDIR}/custom-nb-small.yaml \ @@ -120,13 +122,13 @@ function test_mcad_ray_functionality() { os::cmd::try_until_text "oc get pod -n ${ODHPROJECT} | grep "jupyter-nb-kube-3aadmin" | awk '{print \$2}'" "2/2" $odhdefaulttimeout $odhdefaultinterval # Wait for the mnisttest appwrapper state to become running - os::cmd::try_until_text "oc get appwrapper mnisttest -n ${ODHPROJECT} -ojsonpath='{.status.state}'" "Running" $odhdefaulttimeout $odhdefaultinterval + os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT} -ojsonpath='{.status.state}'" "Running" $odhdefaulttimeout $odhdefaultinterval # Wait for Raycluster to be ready os::cmd::try_until_text "oc get raycluster -n ${ODHPROJECT} mnisttest -ojsonpath='{.status.state}'" "ready" $odhdefaulttimeout $odhdefaultinterval # Wait for job to be completed and cleaned up - os::cmd::try_until_text "oc get appwrapper mnisttest -n ${ODHPROJECT}" ".*NotFound.*" $odhdefaulttimeout $odhdefaultinterval + os::cmd::try_until_text "oc get appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT}" "*NotFound*" $odhdefaulttimeout $odhdefaultinterval os::cmd::expect_failure "oc get raycluster mnisttest -n ${ODHPROJECT}" # Test clean up resources @@ -136,8 +138,8 @@ function test_mcad_ray_functionality() { os::cmd::expect_success "oc delete cm notebooks-ray -n ${ODHPROJECT} || true" os::cmd::expect_failure "oc get cm notebooks-ray -n ${ODHPROJECT}" - os::cmd::expect_success "oc delete appwrapper mnisttest -n ${ODHPROJECT} || true" - os::cmd::expect_failure "oc get appwrapper mnisttest -n ${ODHPROJECT}" + os::cmd::expect_success "oc delete appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT} || true" + os::cmd::expect_failure "oc get appwrapper.workload.codeflare.dev mnisttest -n ${ODHPROJECT}" os::cmd::expect_success "oc delete raycluster mnisttest -n ${ODHPROJECT} || true" os::cmd::expect_failure "oc get raycluster mnisttest -n ${ODHPROJECT}" diff --git a/tests/basictests/ray.sh b/tests/basictests/ray.sh index 247ff8dd..f47c3e88 100755 --- a/tests/basictests/ray.sh +++ b/tests/basictests/ray.sh @@ -26,7 +26,8 @@ function start_test_ray_cluster(){ os::cmd::expect_success "oc project ${ODHPROJECT}" os::cmd::expect_success "oc apply -f ${RESOURCEDIR}/ray/ray-test-cluster-test.yaml" os::cmd::try_until_text "oc get RayCluster kuberay-cluster-test" "kuberay-cluster-test" $odhdefaulttimeout $odhdefaultinterval - sleep 15 + os::cmd::try_until_text "oc get pods -l ray.io/identifier=kuberay-cluster-test-head -o jsonpath='{$.items[*].status.phase}'" "Running" $odhdefaulttimeout $odhdefaultinterval + os::cmd::try_until_text "oc get pods -l ray.io/identifier=kuberay-cluster-test-worker -o jsonpath='{$.items[*].status.phase}'" "Running" $odhdefaulttimeout $odhdefaultinterval } function check_functionality(){ diff --git a/tests/resources/codeflare-subscription.yaml b/tests/resources/codeflare-subscription.yaml index 8eb516dd..b97f08d4 100644 --- a/tests/resources/codeflare-subscription.yaml +++ b/tests/resources/codeflare-subscription.yaml @@ -9,4 +9,4 @@ spec: source: community-operators sourceNamespace: openshift-marketplace installPlanApproval: Manual - startingCSV: codeflare-operator.v0.1.0 + startingCSV: codeflare-operator.v0.2.3 diff --git a/tests/resources/custom-nb-small.yaml b/tests/resources/custom-nb-small.yaml index f411a8a8..540f012a 100644 --- a/tests/resources/custom-nb-small.yaml +++ b/tests/resources/custom-nb-small.yaml @@ -10,7 +10,7 @@ kind: Notebook metadata: annotations: notebooks.opendatahub.io/inject-oauth: "true" - notebooks.opendatahub.io/last-image-selection: codeflare-notebook:latest + notebooks.opendatahub.io/last-image-selection: codeflare-notebook:v0.7.1 notebooks.opendatahub.io/last-size-selection: Small notebooks.opendatahub.io/oauth-logout-url: https://odh-dashboard-%NAMESPACE%.%INGRESS%/notebookController/kube-3aadmin/home opendatahub.io/link: https://jupyter-nb-kube-3aadmin-%NAMESPACE%.%INGRESS%/notebook/%NAMESPACE%/jupyter-nb-kube-3aadmin @@ -47,14 +47,14 @@ spec: --ServerApp.quit_button=False --ServerApp.tornado_settings={"user":"kube-3aadmin","hub_host":"https://odh-dashboard-%NAMESPACE%.%INGRESS%","hub_prefix":"/notebookController/kube-3aadmin"} - name: JUPYTER_IMAGE - value: image-registry.openshift-image-registry.svc:5000/%NAMESPACE%/codeflare-notebook:latest + value: image-registry.openshift-image-registry.svc:5000/%NAMESPACE%/codeflare-notebook:v0.7.1 - name: JUPYTER_NOTEBOOK_PORT value: "8888" - name: OCP_SERVER value: https://%OCPSERVER% - name: OCP_TOKEN value: %OCPTOKEN% - image: image-registry.openshift-image-registry.svc:5000/%NAMESPACE%/codeflare-notebook:latest + image: image-registry.openshift-image-registry.svc:5000/%NAMESPACE%/codeflare-notebook:v0.7.1 command: ["/bin/sh", "-c", "pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks-%JOBTYPE%/mnist_%JOBTYPE%_mini.ipynb /opt/app-root/src/mcad-out.ipynb && sleep infinity"] # args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ] imagePullPolicy: Always diff --git a/tests/resources/mnist_ray_mini.ipynb b/tests/resources/mnist_ray_mini.ipynb index ab01a3f7..cdc93aa2 100644 --- a/tests/resources/mnist_ray_mini.ipynb +++ b/tests/resources/mnist_ray_mini.ipynb @@ -38,7 +38,8 @@ "outputs": [], "source": [ "# Bring up the cluster\n", - "cluster.up()" + "cluster.up()\n", + "sleep(10)" ] }, { From 17727d45d5e991ca665e44e66ff26a8026df9aee Mon Sep 17 00:00:00 2001 From: Anish Asthana Date: Fri, 22 Sep 2023 10:08:37 -0400 Subject: [PATCH 3/4] Remove sync action Signed-off-by: Anish Asthana --- .github/workflows/sync.yml | 118 ------------------------------------- 1 file changed, 118 deletions(-) delete mode 100644 .github/workflows/sync.yml diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml deleted file mode 100644 index 6586ae31..00000000 --- a/.github/workflows/sync.yml +++ /dev/null @@ -1,118 +0,0 @@ -# This workflow will update the manifests, tests and compatibilty matrix in odh-manifests repository - -name: update manifests in odh-manifests repo -on: - workflow_dispatch: - push: - branches: - - main - - 'releases/*' - -jobs: - update-manifests-and-create-pull-request: - runs-on: ubuntu-latest - permissions: - pull-requests: write - - steps: - - name: Checkout distributed-workload repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: main - - - name: Retrieve components versions from distributed-workload repo - run: | - CODEFLARE_OPERATOR_VERSION=$(sed -n -e 's/^.*CodeFlare Operator *| //p' README.md | tr -d ' |') - echo "CODEFLARE_OPERATOR_VERSION=${CODEFLARE_OPERATOR_VERSION}" >> "$GITHUB_ENV" - MCAD_VERSION=$(sed -n -e 's/^.*Multi-Cluster App Dispatcher *| //p' README.md | tr -d ' |') - echo "MCAD_VERSION=${MCAD_VERSION}" >> "$GITHUB_ENV" - CODEFLARE_SDK_VERSION=$(sed -n -e 's/^.*CodeFlare-SDK *| //p' README.md | tr -d ' |') - echo "CODEFLARE_SDK_VERSION=${CODEFLARE_SDK_VERSION}" >> "$GITHUB_ENV" - INSTASCALE_VERSION=$(sed -n -e 's/^.*InstaScale *| //p' README.md | tr -d ' |') - echo "INSTASCALE_VERSION=${INSTASCALE_VERSION}" >> "$GITHUB_ENV" - KUBERAY_VERSION=$(sed -n -e 's/^.*KubeRay *| //p' README.md | tr -d ' |') - echo "KUBERAY_VERSION=${KUBERAY_VERSION}" >> "$GITHUB_ENV" - - - name: Checkout odh-manifest repository - uses: actions/checkout@v3 - with: - repository: opendatahub-io/odh-manifests - ref: master - path: odh-manifests - token: ${{ secrets.CODEFLARE_MACHINE_ACCOUNT_TOKEN }} - - - name: Update manifest & create PR in odh-manifests repo - run: | - LATEST_TAG=$(git describe --tags --always --abbrev=0) - BRANCH_NAME="update-manifests-to-$LATEST_TAG" - - # Change working directory - cd odh-manifests - - # Checkout new branch for the changes - git checkout -b $BRANCH_NAME - - # Copy distributed-workload manifests using rsync - echo "Performing rsync ........" - rsync -av --exclude='README.md' ../codeflare-stack/ codeflare-stack/ - rsync -av --exclude='README.md' ../ray/ ray/ - rsync -av ../tests/util tests/resources/codeflare-stack/util - rsync -av --exclude='odh-subscription.yaml'../tests/resources/ tests/resources/codeflare-stack/ - rsync -av --exclude='distributed-workloads.sh' ../tests/basictests/ tests/basictests/ - echo "rsync complete .........." - - # Overwrite path in ray.sh - echo "Performing overwrite ......" - sed -i -e 's/\(RESOURCEDIR="${MY_DIR}\/..\/resources\)/\1\/ray/' tests/basictests/ray.sh - sed -i 's/\(source ${MY_DIR}\/..\)/\1\/resources\/codeflare-stack/' tests/basictests/ray.sh - echo "overwrite complete ........" - - # Update compatibilty matrix in codeflare/readme - echo "updating compatibilty matrix ........." - sed -i -E "s/(.*CodeFlare Operator.*)v[0-9]+\.[0-9]+\.[0-9]+(.*)/\1${{ env.CODEFLARE_OPERATOR_VERSION }}\2/" codeflare-stack/README.md - sed -i -E "s/(.*Multi-Cluster App Dispatcher.*)v[0-9]+\.[0-9]+\.[0-9]+(.*)/\1${{ env.MCAD_VERSION }}\2/" codeflare-stack/README.md - sed -i -E "s/(.*CodeFlare-SDK.*)v[0-9]+\.[0-9]+\.[0-9]+(.*)/\1${{ env.CODEFLARE_SDK_VERSION }}\2/" codeflare-stack/README.md - sed -i -E "s/(.*InstaScale.*)v[0-9]+\.[0-9]+\.[0-9]+(.*)/\1${{ env.INSTASCALE_VERSION }}\2/" codeflare-stack/README.md - sed -i -E "s/(.*KubeRay.*)v[0-9]+\.[0-9]+\.[0-9]+(.*)/\1${{ env.KUBERAY_VERSION }}\2/" codeflare-stack/README.md - echo "update complete ........." - - # Configure identity for committer - git config user.email "github-actions@github.com" - git config user.name "GitHub Actions" - - # Commit and push the branch to origin - echo "Summary of changes:" - git status - if [[ `git status --porcelain` ]]; then - echo "Changes detected ...." - git add . - git commit -m "Manifest_updates_for_Distributed_Workloads_$LATEST_TAG" - echo "completed commit .........." - else - echo "No changes detected ...." - echo "So, exiting the workflow ..." - exit 0 - fi - - # Check if the branch exists and perform rebase if it does - if git ls-remote --exit-code --heads origin $BRANCH_NAME; then - git pull --rebase origin $BRANCH_NAME - echo "rebase completed ...." - fi - git push origin $BRANCH_NAME - echo "push completed ...." - - # Create PR in odh-manifests repo if not opened yet - if [[ $(gh pr view $BRANCH_NAME) && $(gh pr view $BRANCH_NAME --json state --jq .state) == "OPEN" ]]; then - echo "PR already opened" - else - echo "Creating PR......." - gh pr create \ - --title "Update distributed-workload manifests for $LATEST_TAG Release" \ - --body "This is an automated PR to update distributed-workload manifests" \ - --head "$BRANCH_NAME" \ - --base "master" - fi - env: - GITHUB_TOKEN: ${{ secrets.CODEFLARE_MACHINE_ACCOUNT_TOKEN }} \ No newline at end of file From 483257f803e780125a30658cb2e26cb2b641fd0b Mon Sep 17 00:00:00 2001 From: anishasthana Date: Fri, 22 Sep 2023 14:35:55 +0000 Subject: [PATCH 4/4] Update dependency versions for release v1.0.0-rc.1 --- README.md | 8 ++++---- codeflare-stack/base/codeflare-notebook-imagestream.yaml | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index dad1eea9..a222f55d 100644 --- a/README.md +++ b/README.md @@ -24,10 +24,10 @@ Integration of this stack into the Open Data Hub is owned by the Distributed Wor | Component | Version | |------------------------------|---------| -| CodeFlare Operator | v0.2.3 | -| Multi-Cluster App Dispatcher | v1.34.1 | -| CodeFlare-SDK | v0.7.1 | -| InstaScale | v0.0.8 | +| CodeFlare Operator | v1.0.0-rc.1 | +| Multi-Cluster App Dispatcher | v1.35.0 | +| CodeFlare-SDK | v0.8.0 | +| InstaScale | v0.0.9 | | KubeRay | v0.6.0 | diff --git a/codeflare-stack/base/codeflare-notebook-imagestream.yaml b/codeflare-stack/base/codeflare-notebook-imagestream.yaml index bcfbe2e2..2483a9e7 100644 --- a/codeflare-stack/base/codeflare-notebook-imagestream.yaml +++ b/codeflare-stack/base/codeflare-notebook-imagestream.yaml @@ -13,9 +13,9 @@ spec: tags: - annotations: openshift.io/imported-from: quay.io/project-codeflare/notebook - name: v0.7.1 + name: v0.8.0 from: kind: DockerImage - name: quay.io/project-codeflare/notebook:v0.7.1 + name: quay.io/project-codeflare/notebook:v0.8.0 importPolicy: - scheduled: true + scheduled: true