Skip to content

Commit 97cd6ea

Browse files
authored
RHOAIENG-10449 - Add PR check for additional-demos notebooks (#684)
1 parent 003a287 commit 97cd6ea

File tree

3 files changed

+420
-0
lines changed

3 files changed

+420
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
# Runs the additional-demos notebooks against a KinD cluster.
# Triggered when a label is added to a pull request, or manually through
# workflow_dispatch; each job decides through its own `if:` whether to run.
name: Additional demo notebooks tests

on:
  pull_request:
    types: [labeled]
  workflow_dispatch:

concurrency:
  # github.head_ref is only populated for pull_request events. Fall back to
  # github.ref so that manual runs started from different branches do not all
  # share one group and cancel each other.
  group: ${{ github.head_ref || github.ref }}-${{ github.workflow }}
  cancel-in-progress: true

env:
  # Operator image passed to `make deploy` in the "Deploy CodeFlare stack" steps.
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
14+
15+
jobs:
  # Executes demo-notebooks/additional-demos/local_interactive.ipynb with
  # papermill against a KinD cluster running the CodeFlare stack.
  verify-local_interactive:
    # Label events carry github.event.label; manual dispatches do not, so they
    # are allowed explicitly (otherwise workflow_dispatch would always skip).
    if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'test-additional-notebooks' }}
    runs-on: ubuntu-20.04-4core

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Provides the local composite actions used below (./common/github-actions/...).
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
          cache: 'pip' # caching pip dependencies

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind

      # The log-collection steps below only run when this step succeeded.
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..

      - name: Setup Additional demo notebooks execution
        run: |
          echo "Installing papermill and dependencies..."
          pip install poetry papermill ipython ipykernel
          # Disable virtualenv due to problems using packaged in virtualenv in papermill
          poetry config virtualenvs.create false

          echo "Installing SDK..."
          poetry install --with test,docs

      - name: Run local_interactive.ipynb
        run: |
          set -euo pipefail

          # Remove login/logout cells, as KinD doesn't support authentication using token
          jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object and log in to desired user account")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
          jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
          # Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster
          sed -i "s/cluster_uri()/local_client_url()/g" local_interactive.ipynb
          # Replace async logs with waiting for job to finish, async logs don't work properly in papermill
          JOB_WAIT=$(jq -r '.' "${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json")
          jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
          # Set explicit namespace as SDK need it (currently) to resolve local queues
          sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" local_interactive.ipynb
          # Run notebook
          poetry run papermill local_interactive.ipynb local_interactive_out.ipynb --log-output --execution-timeout 1200
        env:
          GRPC_DNS_RESOLVER: "native"
        working-directory: demo-notebooks/additional-demos

      # NOTE(review): TEMP_DIR is not defined in this workflow; presumably it
      # is exported by the KinD setup action - confirm.
      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee "${TEMP_DIR}/codeflare-operator.log"

      - name: Print Kueue operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Kueue operator logs"
          KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
          kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee "${TEMP_DIR}/kueue.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee "${TEMP_DIR}/kuberay.log"

      # The value is passed to the action verbatim; Actions does not expand
      # shell-style ${VAR} in `with:` inputs.
      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${TEMP_DIR}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs-local_interactive
          retention-days: 10
          path: |
            ${{ env.TEMP_DIR }}/**/*.log

  # Executes demo-notebooks/additional-demos/ray_job_client.ipynb with
  # papermill against a KinD cluster running the CodeFlare stack.
  verify-ray_job_client:
    # Label events carry github.event.label; manual dispatches do not, so they
    # are allowed explicitly (otherwise workflow_dispatch would always skip).
    if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'test-additional-notebooks' }}
    runs-on: ubuntu-20.04-4core

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Provides the local composite actions used below (./common/github-actions/...).
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
          cache: 'pip' # caching pip dependencies

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind

      # The log-collection steps below only run when this step succeeded.
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..

      - name: Setup Additional demo notebooks execution
        run: |
          echo "Installing papermill and dependencies..."
          pip install poetry papermill ipython ipykernel
          # Disable virtualenv due to problems using packaged in virtualenv in papermill
          poetry config virtualenvs.create false

          echo "Installing SDK..."
          poetry install --with test,docs

      - name: Run ray_job_client.ipynb
        run: |
          set -euo pipefail

          # Remove login/logout cells, as KinD doesn't support authentication using token
          jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
          jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
          # Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster
          sed -i "s/cluster_uri()/local_client_url()/g" ray_job_client.ipynb
          # Replace async logs with waiting for job to finish, async logs don't work properly in papermill
          JOB_WAIT=$(jq -r '.' "${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json")
          jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
          # Set explicit namespace as SDK need it (currently) to resolve local queues
          sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" ray_job_client.ipynb
          # Shrink the requested resources and drop token auth / TLS verification for the KinD run
          sed -i "s/worker_memory_requests=4,/worker_memory_requests=1,/" ray_job_client.ipynb
          sed -i "s/worker_memory_limits=4,/worker_memory_limits=1,/" ray_job_client.ipynb
          sed -i "s/'Authorization': .*/'Authorization': None\",/" ray_job_client.ipynb
          sed -i "s/num_workers=2/num_workers=1/" ray_job_client.ipynb
          sed -i "s/RayJobClient(address=ray_dashboard, headers=header, verify=True)/RayJobClient(address=ray_dashboard, verify=False)/" ray_job_client.ipynb
          # Run notebook (output named after this notebook)
          poetry run papermill ray_job_client.ipynb ray_job_client_out.ipynb --log-output --execution-timeout 1200
        env:
          GRPC_DNS_RESOLVER: "native"
        working-directory: demo-notebooks/additional-demos

      # NOTE(review): TEMP_DIR is not defined in this workflow; presumably it
      # is exported by the KinD setup action - confirm.
      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee "${TEMP_DIR}/codeflare-operator.log"

      - name: Print Kueue operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Kueue operator logs"
          KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
          kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee "${TEMP_DIR}/kueue.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee "${TEMP_DIR}/kuberay.log"

      # The value is passed to the action verbatim; Actions does not expand
      # shell-style ${VAR} in `with:` inputs.
      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${TEMP_DIR}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs-ray_job_client
          retention-days: 10
          path: |
            ${{ env.TEMP_DIR }}/**/*.log

0 commit comments

Comments
 (0)