Create ray cluster with ssa #526
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Guided notebooks tests | |
on: | |
pull_request: | |
branches: [ main ] | |
concurrency: | |
group: ${{ github.head_ref }}-${{ github.workflow }} | |
cancel-in-progress: true | |
env: | |
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" | |
jobs: | |
verify-0_basic_ray: | |
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} | |
runs-on: ubuntu-20.04-4core | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
with: | |
submodules: recursive | |
- name: Checkout common repo code | |
uses: actions/checkout@v4 | |
with: | |
repository: 'project-codeflare/codeflare-common' | |
ref: 'main' | |
path: 'common' | |
- name: Checkout CodeFlare operator repository | |
uses: actions/checkout@v4 | |
with: | |
repository: project-codeflare/codeflare-operator | |
path: codeflare-operator | |
- name: Set Go | |
uses: actions/setup-go@v5 | |
with: | |
go-version-file: './codeflare-operator/go.mod' | |
cache-dependency-path: "./codeflare-operator/go.sum" | |
- name: Set up gotestfmt | |
uses: gotesttools/gotestfmt-action@v2 | |
with: | |
token: ${{ secrets.GITHUB_TOKEN }} | |
- name: Set up specific Python version | |
uses: actions/setup-python@v5 | |
with: | |
python-version: '3.9' | |
cache: 'pip' # caching pip dependencies | |
- name: Setup and start KinD cluster | |
uses: ./common/github-actions/kind | |
- name: Deploy CodeFlare stack | |
id: deploy | |
run: | | |
cd codeflare-operator | |
echo Setting up CodeFlare stack | |
make setup-e2e | |
echo Deploying CodeFlare operator | |
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" | |
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager | |
cd .. | |
- name: Setup Guided notebooks execution | |
run: | | |
echo "Installing papermill and dependencies..." | |
pip install poetry papermill ipython ipykernel | |
# Disable virtualenv due to problems using packaged in virtualenv in papermill | |
poetry config virtualenvs.create false | |
echo "Installing SDK..." | |
poetry install --with test,docs | |
- name: Run 0_basic_ray.ipynb | |
run: | | |
set -euo pipefail | |
# Remove login/logout cells, as KinD doesn't support authentication using token | |
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb | |
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb | |
# Set explicit namespace as SDK need it (currently) to resolve local queues | |
sed -i "s/head_memory_limits=2,/head_memory_limits=2, namespace='default',/" 0_basic_ray.ipynb | |
# Run notebook | |
poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600 | |
working-directory: demo-notebooks/guided-demos | |
- name: Print CodeFlare operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing CodeFlare operator logs" | |
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log | |
- name: Print Kueue operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing Kueue operator logs" | |
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}') | |
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log | |
- name: Print KubeRay operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing KubeRay operator logs" | |
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log | |
- name: Export all KinD pod logs | |
uses: ./common/github-actions/kind-export-logs | |
if: always() && steps.deploy.outcome == 'success' | |
with: | |
output-directory: ${TEMP_DIR} | |
- name: Upload logs | |
uses: actions/upload-artifact@v4 | |
if: always() && steps.deploy.outcome == 'success' | |
with: | |
name: logs-0_basic_ray | |
retention-days: 10 | |
path: | | |
${{ env.TEMP_DIR }}/**/*.log | |
verify-1_cluster_job_client: | |
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} | |
runs-on: ubuntu-20.04-4core-gpu | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
with: | |
submodules: recursive | |
- name: Checkout common repo code | |
uses: actions/checkout@v4 | |
with: | |
repository: 'project-codeflare/codeflare-common' | |
ref: 'main' | |
path: 'common' | |
- name: Checkout CodeFlare operator repository | |
uses: actions/checkout@v4 | |
with: | |
repository: project-codeflare/codeflare-operator | |
path: codeflare-operator | |
- name: Set Go | |
uses: actions/setup-go@v5 | |
with: | |
go-version-file: './codeflare-operator/go.mod' | |
cache-dependency-path: "./codeflare-operator/go.sum" | |
- name: Set up gotestfmt | |
uses: gotesttools/gotestfmt-action@v2 | |
with: | |
token: ${{ secrets.GITHUB_TOKEN }} | |
- name: Set up specific Python version | |
uses: actions/setup-python@v5 | |
with: | |
python-version: '3.9' | |
cache: 'pip' # caching pip dependencies | |
- name: Setup NVidia GPU environment for KinD | |
uses: ./common/github-actions/nvidia-gpu-setup | |
- name: Setup and start KinD cluster | |
uses: ./common/github-actions/kind | |
- name: Install NVidia GPU operator for KinD | |
uses: ./common/github-actions/nvidia-gpu-operator | |
with: | |
enable-time-slicing: 'true' | |
- name: Deploy CodeFlare stack | |
id: deploy | |
run: | | |
cd codeflare-operator | |
echo Setting up CodeFlare stack | |
make setup-e2e | |
echo Deploying CodeFlare operator | |
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" | |
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager | |
cd .. | |
- name: Setup Guided notebooks execution | |
run: | | |
echo "Installing papermill and dependencies..." | |
pip install poetry papermill ipython ipykernel | |
# Disable virtualenv due to problems using packaged in virtualenv in papermill | |
poetry config virtualenvs.create false | |
echo "Installing SDK..." | |
poetry install --with test,docs | |
- name: Run 1_cluster_job_client.ipynb | |
run: | | |
set -euo pipefail | |
# Remove login/logout cells, as KinD doesn't support authentication using token | |
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb | |
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb | |
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill | |
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json) | |
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb | |
# Set explicit namespace as SDK need it (currently) to resolve local queues | |
sed -i "s/head_cpu_limits=1,/head_cpu_limits=1, namespace='default',/" 1_cluster_job_client.ipynb | |
# Run notebook | |
poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200 | |
working-directory: demo-notebooks/guided-demos | |
- name: Print CodeFlare operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing CodeFlare operator logs" | |
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log | |
- name: Print Kueue operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing Kueue operator logs" | |
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}') | |
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log | |
- name: Print KubeRay operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing KubeRay operator logs" | |
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log | |
- name: Export all KinD pod logs | |
uses: ./common/github-actions/kind-export-logs | |
if: always() && steps.deploy.outcome == 'success' | |
with: | |
output-directory: ${TEMP_DIR} | |
- name: Upload logs | |
uses: actions/upload-artifact@v4 | |
if: always() && steps.deploy.outcome == 'success' | |
with: | |
name: logs-1_cluster_job_client | |
retention-days: 10 | |
path: | | |
${{ env.TEMP_DIR }}/**/*.log | |
verify-2_basic_interactive: | |
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} | |
runs-on: ubuntu-20.04-4core-gpu | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
with: | |
submodules: recursive | |
- name: Checkout common repo code | |
uses: actions/checkout@v4 | |
with: | |
repository: 'project-codeflare/codeflare-common' | |
ref: 'main' | |
path: 'common' | |
- name: Checkout CodeFlare operator repository | |
uses: actions/checkout@v4 | |
with: | |
repository: project-codeflare/codeflare-operator | |
path: codeflare-operator | |
- name: Set Go | |
uses: actions/setup-go@v5 | |
with: | |
go-version-file: './codeflare-operator/go.mod' | |
cache-dependency-path: "./codeflare-operator/go.sum" | |
- name: Set up gotestfmt | |
uses: gotesttools/gotestfmt-action@v2 | |
with: | |
token: ${{ secrets.GITHUB_TOKEN }} | |
- name: Set up specific Python version | |
uses: actions/setup-python@v5 | |
with: | |
python-version: '3.9' | |
cache: 'pip' # caching pip dependencies | |
- name: Setup NVidia GPU environment for KinD | |
uses: ./common/github-actions/nvidia-gpu-setup | |
- name: Setup and start KinD cluster | |
uses: ./common/github-actions/kind | |
- name: Install NVidia GPU operator for KinD | |
uses: ./common/github-actions/nvidia-gpu-operator | |
with: | |
enable-time-slicing: 'true' | |
- name: Deploy CodeFlare stack | |
id: deploy | |
run: | | |
cd codeflare-operator | |
echo Setting up CodeFlare stack | |
make setup-e2e | |
echo Deploying CodeFlare operator | |
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" | |
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager | |
cd .. | |
- name: Install MINIO | |
run: | | |
kubectl apply -f ./tests/e2e/minio_deployment.yaml | |
kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio | |
- name: Setup Guided notebooks execution | |
run: | | |
echo "Installing papermill and dependencies..." | |
pip install poetry papermill ipython ipykernel | |
# Disable virtualenv due to problems using packaged in virtualenv in papermill | |
poetry config virtualenvs.create false | |
echo "Installing SDK..." | |
poetry install --with test,docs | |
- name: Run 2_basic_interactive.ipynb | |
run: | | |
set -euo pipefail | |
# Remove login/logout cells, as KinD doesn't support authentication using token | |
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb | |
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb | |
# Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster | |
sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb | |
# Set explicit namespace as SDK need it (currently) to resolve local queues | |
sed -i "s/head_cpu_limits=1,/head_cpu_limits=1, namespace='default',/" 2_basic_interactive.ipynb | |
# Add MINIO related modules to runtime environment | |
sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb | |
# Replace markdown cell with remote configuration for MINIO | |
MINIO_CONFIG=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json) | |
jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb | |
# Configure persistent storage for Ray trainer | |
sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb | |
# Run notebook | |
poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200 | |
env: | |
GRPC_DNS_RESOLVER: "native" | |
working-directory: demo-notebooks/guided-demos | |
- name: Print CodeFlare operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing CodeFlare operator logs" | |
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log | |
- name: Print Kueue operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing Kueue operator logs" | |
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}') | |
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log | |
- name: Print KubeRay operator logs | |
if: always() && steps.deploy.outcome == 'success' | |
run: | | |
echo "Printing KubeRay operator logs" | |
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log | |
- name: Export all KinD pod logs | |
uses: ./common/github-actions/kind-export-logs | |
if: always() && steps.deploy.outcome == 'success' | |
with: | |
output-directory: ${TEMP_DIR} | |
- name: Upload logs | |
uses: actions/upload-artifact@v4 | |
if: always() && steps.deploy.outcome == 'success' | |
with: | |
name: logs-2_basic_interactive | |
retention-days: 10 | |
path: | | |
${{ env.TEMP_DIR }}/**/*.log |