2 files changed, +36 -16 lines changed
First file (existing workflow):

           az aks command invoke \
             --resource-group $RESOURCE_GROUP \
             --name $CLUSTER_NAME \
-            --command "kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=180s && kubectl get pods -n gpu-operator"
-
-      - name: Check GPU Allocatable Resources
-        run: |
-          az aks command invoke \
-            --resource-group $RESOURCE_GROUP \
-            --name $CLUSTER_NAME \
-            --command "kubectl get nodes -l accelerator=nvidia -o jsonpath='{range .items[*]}{.metadata.name}: {.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}'"
-
-      - name: Apply pod-check-nvidia-smi.yaml and check logs
-        run: |
-          az aks command invoke \
-            --resource-group $RESOURCE_GROUP \
-            --name $CLUSTER_NAME \
-            --command "kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl wait --for=condition=Succeeded pod/nvidia-gpu-test -n default --timeout=120s && kubectl logs nvidia-gpu-test -n default" \
-            --file pod-check-nvidia-smi.yaml
+            --command "kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=180s && kubectl get pods -n gpu-operator"
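Both the step removed above and the new workflow below apply pod-check-nvidia-smi.yaml, which is not part of this diff. A minimal sketch of what that manifest presumably looks like; only the pod name nvidia-gpu-test and the default namespace are taken from the kubectl commands, while the image and the single-GPU request are assumptions:

# pod-check-nvidia-smi.yaml (sketch; image and resource request are assumed)
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-gpu-test        # name referenced by "kubectl logs nvidia-gpu-test"
  namespace: default
spec:
  restartPolicy: Never         # run once so the pod can reach the Succeeded phase
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.4.1-base-ubuntu22.04   # assumed CUDA base image
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1    # request one GPU so nvidia-smi has a device to report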
Second file (new workflow):

+name: Step 4 - Test GPU Operator Installation
+
+on:
+  workflow_dispatch:
+
+env:
+  RESOURCE_GROUP: rg-pvt-aks-h100
+  CLUSTER_NAME: pvt-aks-h100
+
+jobs:
+  install-gpu-operator:
+    runs-on: ubuntu-latest
+
+    permissions:
+      id-token: write
+      contents: read
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Azure with federated identity (User Assigned Managed Identity)
+        uses: azure/login@v2
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Apply pod-check-nvidia-smi.yaml and check logs
+        run: |
+          az aks command invoke \
+            --resource-group $RESOURCE_GROUP \
+            --name $CLUSTER_NAME \
+            --command "kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl wait --for=condition=Succeeded pod/nvidia-gpu-test -n default --timeout=120s && kubectl logs nvidia-gpu-test -n default" \
+            --file pod-check-nvidia-smi.yaml
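Because the new workflow triggers only on workflow_dispatch, it has to be started by hand, either from the repository's Actions tab or with the GitHub CLI, e.g. gh workflow run "Step 4 - Test GPU Operator Installation". One detail worth verifying: Succeeded is a pod phase rather than a pod condition, so kubectl wait --for=condition=Succeeded may simply run out its 120-second timeout even when the test pod completes; on recent kubectl, --for=jsonpath='{.status.phase}'=Succeeded is the usual way to wait for a completed pod.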