2 files changed, +36 -16 lines changed
First file (existing workflow):

           az aks command invoke \
             --resource-group $RESOURCE_GROUP \
             --name $CLUSTER_NAME \
-            --command "kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=180s && kubectl get pods -n gpu-operator"
-
-      - name: Check GPU Allocatable Resources
-        run: |
-          az aks command invoke \
-            --resource-group $RESOURCE_GROUP \
-            --name $CLUSTER_NAME \
-            --command "kubectl get nodes -l accelerator=nvidia -o jsonpath='{range .items[*]}{.metadata.name}: {.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}'"
-
-      - name: Apply pod-check-nvidia-smi.yaml and check logs
-        run: |
-          az aks command invoke \
-            --resource-group $RESOURCE_GROUP \
-            --name $CLUSTER_NAME \
-            --command "kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl wait --for=condition=Succeeded pod/nvidia-gpu-test -n default --timeout=120s && kubectl logs nvidia-gpu-test -n default" \
-            --file pod-check-nvidia-smi.yaml
+            --command "kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=180s && kubectl get pods -n gpu-operator"
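Both the step removed above and the new workflow below apply pod-check-nvidia-smi.yaml, which is not part of this diff. A minimal sketch of what that manifest presumably looks like; only the pod name nvidia-gpu-test and the default namespace are taken from the kubectl commands, while the image and the single-GPU request are assumptions:

# pod-check-nvidia-smi.yaml (sketch; image and resource request are assumed)
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-gpu-test        # name referenced by "kubectl logs nvidia-gpu-test"
  namespace: default
spec:
  restartPolicy: Never         # run once so the pod can reach the Succeeded phase
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.4.1-base-ubuntu22.04   # assumed CUDA base image
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1    # request one GPU so nvidia-smi has a device to report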
Second file (new workflow):

+name: Step 4 - Test GPU Operator Installation
+
+on:
+  workflow_dispatch:
+
+env:
+  RESOURCE_GROUP: rg-pvt-aks-h100
+  CLUSTER_NAME: pvt-aks-h100
+
+jobs:
+  install-gpu-operator:
+    runs-on: ubuntu-latest
+
+    permissions:
+      id-token: write
+      contents: read
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Azure with federated identity (User Assigned Managed Identity)
+        uses: azure/login@v2
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Apply pod-check-nvidia-smi.yaml and check logs
+        run: |
+          az aks command invoke \
+            --resource-group $RESOURCE_GROUP \
+            --name $CLUSTER_NAME \
+            --command "kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl wait --for=condition=Succeeded pod/nvidia-gpu-test -n default --timeout=120s && kubectl logs nvidia-gpu-test -n default" \
+            --file pod-check-nvidia-smi.yaml
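Because the new workflow triggers only on workflow_dispatch, it has to be started by hand, either from the repository's Actions tab or with the GitHub CLI, e.g. gh workflow run "Step 4 - Test GPU Operator Installation". One detail worth verifying: Succeeded is a pod phase rather than a pod condition, so kubectl wait --for=condition=Succeeded may simply run out its 120-second timeout even when the test pod completes; on recent kubectl, --for=jsonpath='{.status.phase}'=Succeeded is the usual way to wait for a completed pod.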