Step 3 - Install NVIDIA GPU Operator #1

# This workflow installs the NVIDIA GPU Operator on an AKS cluster and
# verifies the installation by deploying a test pod that runs `nvidia-smi`.
# (Workflow files have no top-level `description` key, so the summary is a comment.)
name: Step 3 - Install NVIDIA GPU Operator

on:
  workflow_dispatch:

env:
  RESOURCE_GROUP: rg-pvt-aks-h100
  CLUSTER_NAME: pvt-aks-h100
jobs:
  install-gpu-operator:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to Azure with federated identity (User Assigned Managed Identity)
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
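
      # All helm/kubectl calls below go through `az aks command invoke`,
      # which executes them inside the cluster rather than on the runner.
      # That is what makes this workflow usable against a private cluster
      # (the "pvt" in the cluster name suggests one), where the runner has
      # no direct line of sight to the API server.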
      - name: Install NVIDIA GPU Operator via Helm
        run: |
          az aks command invoke \
            --resource-group $RESOURCE_GROUP \
            --name $CLUSTER_NAME \
            --command "helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update && helm install gpu-operator nvidia/gpu-operator -n gpu-operator --create-namespace --set operator.runtimeClass=nvidia-container-runtime"
      - name: Wait for GPU Operator Pods to be Ready
        run: |
          az aks command invoke \
            --resource-group $RESOURCE_GROUP \
            --name $CLUSTER_NAME \
            --command "kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=180s && kubectl get pods -n gpu-operator"
      - name: Check GPU Allocatable Resources
        run: |
          az aks command invoke \
            --resource-group $RESOURCE_GROUP \
            --name $CLUSTER_NAME \
            --command "kubectl get nodes -l accelerator=nvidia -o jsonpath='{range .items[*]}{.metadata.name}: {.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}'"
      # --file uploads the local manifest alongside the command so the
      # in-cluster kubectl can apply it. Pods have no Succeeded *condition*
      # (Succeeded is a phase), so the wait matches on .status.phase.
      - name: Apply pod-check-nvidia-smi.yaml and check logs
        run: |
          az aks command invoke \
            --resource-group $RESOURCE_GROUP \
            --name $CLUSTER_NAME \
            --command "kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/nvidia-gpu-test -n default --timeout=120s && kubectl logs nvidia-gpu-test -n default" \
            --file pod-check-nvidia-smi.yaml
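
The manifest pod-check-nvidia-smi.yaml referenced above is not shown in this step. A minimal sketch of what it presumably looks like, assuming the pod is named nvidia-gpu-test (the name the wait and logs commands expect), runs nvidia-smi once, and requests a single GPU; the image tag is an assumption, and any CUDA base image that ships nvidia-smi should do:

# pod-check-nvidia-smi.yaml (sketch)
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-gpu-test
spec:
  restartPolicy: Never                             # let the pod finish in the Succeeded phase
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.4.1-base-ubuntu22.04   # assumed tag
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1                        # request one GPU so it lands on a GPU node

The restartPolicy matters here: the wait step checks for the Succeeded phase, which a pod with the default Always policy never reaches.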