Step 3 - Install NVIDIA GPU Operator #1
name: Step 3 - Install NVIDIA GPU Operator

# This workflow installs the NVIDIA GPU Operator on an AKS cluster and verifies
# its installation by deploying a test pod that runs `nvidia-smi`.
on:
  workflow_dispatch:

env:
  RESOURCE_GROUP: rg-pvt-aks-h100
  CLUSTER_NAME: pvt-aks-h100

jobs:
  install-gpu-operator:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to Azure with federated identity (User Assigned Managed Identity)
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      - name: Install NVIDIA GPU Operator via Helm
        run: |
          az aks command invoke \
            --resource-group $RESOURCE_GROUP \
            --name $CLUSTER_NAME \
            --command "helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update && helm install gpu-operator nvidia/gpu-operator -n gpu-operator --create-namespace --set operator.runtimeClass=nvidia-container-runtime"

      - name: Wait for GPU Operator Pods to be Ready
        run: |
          az aks command invoke \
            --resource-group $RESOURCE_GROUP \
            --name $CLUSTER_NAME \
            --command "kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=180s && kubectl get pods -n gpu-operator"

      - name: Check GPU Allocatable Resources
        run: |
          az aks command invoke \
            --resource-group $RESOURCE_GROUP \
            --name $CLUSTER_NAME \
            --command "kubectl get nodes -l accelerator=nvidia -o jsonpath='{range .items[*]}{.metadata.name}: {.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}'"
      - name: Apply pod-check-nvidia-smi.yaml and check logs
        run: |
          az aks command invoke \
            --resource-group $RESOURCE_GROUP \
            --name $CLUSTER_NAME \
            --command "kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/nvidia-gpu-test -n default --timeout=120s && kubectl logs nvidia-gpu-test -n default" \
            --file pod-check-nvidia-smi.yaml
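
The final step ships a local manifest, `pod-check-nvidia-smi.yaml`, into the cluster via the `--file` flag of `az aks command invoke`, but the manifest itself is not included here. Below is a minimal sketch of what such a manifest might look like, assuming a single-GPU test pod based on a public CUDA base image; the image tag and GPU count are assumptions, not part of the original gist, and should be adjusted to the cluster. The pod name and namespace match what the workflow waits on and reads logs from.

```yaml
# pod-check-nvidia-smi.yaml -- illustrative sketch, not the original manifest.
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-gpu-test            # must match the pod name the workflow waits on
  namespace: default
spec:
  restartPolicy: Never             # lets the pod reach phase Succeeded once nvidia-smi exits
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.4.1-base-ubuntu22.04   # assumed image; any CUDA base image with nvidia-smi works
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1        # request one GPU so the device plugin schedules the pod on a GPU node
```

Because the workflow checks out the repository and passes the manifest with `--file pod-check-nvidia-smi.yaml`, the file is expected to sit at the repository root next to the workflow. Since the workflow is triggered by `workflow_dispatch`, it can then be started from the Actions tab or with `gh workflow run`.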