Skip to content

Commit 84c0328

Browse files
committed
add a test flow
1 parent 84eb43b commit 84c0328

File tree

2 files changed

+36
-16
lines changed

2 files changed

+36
-16
lines changed

.github/workflows/nvidia-gpu-operator.yml

Lines changed: 1 addition & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -40,19 +40,4 @@ jobs:
4040
az aks command invoke \
4141
--resource-group $RESOURCE_GROUP \
4242
--name $CLUSTER_NAME \
43-
--command "kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=180s && kubectl get pods -n gpu-operator"
44-
45-
- name: Check GPU Allocatable Resources
46-
run: |
47-
az aks command invoke \
48-
--resource-group $RESOURCE_GROUP \
49-
--name $CLUSTER_NAME \
50-
--command "kubectl get nodes -l accelerator=nvidia -o jsonpath='{range .items[*]}{.metadata.name}: {.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}'"
51-
52-
- name: Apply pod-check-nvidia-smi.yaml and check logs
53-
run: |
54-
az aks command invoke \
55-
--resource-group $RESOURCE_GROUP \
56-
--name $CLUSTER_NAME \
57-
--command "kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl wait --for=condition=Succeeded pod/nvidia-gpu-test -n default --timeout=120s && kubectl logs nvidia-gpu-test -n default" \
58-
--file pod-check-nvidia-smi.yaml
43+
--command "kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=180s && kubectl get pods -n gpu-operator"

.github/workflows/test.yml

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
# Manually triggered smoke test for the GPU operator installation:
# logs in to Azure, applies pod-check-nvidia-smi.yaml to the AKS cluster via
# `az aks command invoke`, waits for the test pod to finish, and prints its logs.
name: Step 4 - Test GPU Operator Installation

on:
  workflow_dispatch:

env:
  RESOURCE_GROUP: rg-pvt-aks-h100
  CLUSTER_NAME: pvt-aks-h100

jobs:
  # NOTE(review): job id looks carried over from the install workflow; it is
  # kept unchanged here because it may be referenced elsewhere (e.g. required
  # status checks) — confirm before renaming.
  install-gpu-operator:
    runs-on: ubuntu-latest

    # id-token: write lets the job request an OIDC token for the federated
    # Azure login below; contents: read is needed by the checkout step.
    permissions:
      id-token: write
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to Azure with federated identity (User Assigned Managed Identity)
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      - name: Apply pod-check-nvidia-smi.yaml and check logs
        run: |
          # --file attaches the manifest from the checked-out repository so the
          # kubectl command executed for the cluster can read it.
          #
          # "Succeeded" is a pod *phase* (.status.phase), not a pod condition,
          # so `--for=condition=Succeeded` can never be satisfied and would
          # always hit the timeout. Wait on the phase via jsonpath instead.
          az aks command invoke \
            --resource-group "$RESOURCE_GROUP" \
            --name "$CLUSTER_NAME" \
            --command "kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/nvidia-gpu-test -n default --timeout=120s && kubectl logs nvidia-gpu-test -n default" \
            --file pod-check-nvidia-smi.yaml

0 commit comments

Comments
 (0)