cluster-api: node template in scale-from-0-nodes scenario with DRA

ttsuuubasa · ttsuuubasa · commit 3fbacf0d0f62 · 2025-02-12T11:56:04.000+09:00
Modify TemplateNodeInfo() to also return template ResourceSlices.
This supports the DRA extension of Cluster Autoscaler, allowing users to set the number of GPUs and the DRA driver name
through annotations on the NodeGroup provided by cluster-api.

Signed-off-by: Tsubasa Watanabe <w.tsubasa@fujitsu.com>
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/README.md b/cluster-autoscaler/cloudprovider/clusterapi/README.md
@@ -223,15 +223,21 @@ metadata:
     capacity.cluster-autoscaler.kubernetes.io/memory: "128G"
     capacity.cluster-autoscaler.kubernetes.io/cpu: "16"
     capacity.cluster-autoscaler.kubernetes.io/ephemeral-disk: "100Gi"
+    capacity.cluster-autoscaler.kubernetes.io/maxPods: "200"
+    # Device Plugin
     capacity.cluster-autoscaler.kubernetes.io/gpu-type: "nvidia.com/gpu"
+    # Dynamic Resource Allocation (DRA)
+    capacity.cluster-autoscaler.kubernetes.io/dra-driver: "gpu.nvidia.com"
+    # Common to Device Plugin and DRA
     capacity.cluster-autoscaler.kubernetes.io/gpu-count: "2"
-    capacity.cluster-autoscaler.kubernetes.io/maxPods: "200"
 ```
 
 *Note* the `maxPods` annotation will default to `110` if it is not supplied.
 This value is inspired by the Kubernetes best practices
 [Considerations for large clusters](https://kubernetes.io/docs/setup/best-practices/cluster-large/).
 
+*Note* Set either the `gpu-type` or the `dra-driver` annotation for GPUs, depending on whether you use a Device Plugin or Dynamic Resource Allocation (DRA). The `gpu-count` annotation is common to both.
+
 #### RBAC changes for scaling from zero
 
 If you are using the opt-in support for scaling from zero as defined by the
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go
@@ -283,7 +283,12 @@ func (ng *nodegroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
 		return nil, err
 	}
 
-	nodeInfo := framework.NewNodeInfo(&node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.scalableResource.Name())})
+	resourceSlices, err := ng.scalableResource.InstanceResourceSlices(nodeName)
+	if err != nil {
+		return nil, err
+	}
+
+	nodeInfo := framework.NewNodeInfo(&node, resourceSlices, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.scalableResource.Name())})
 	return nodeInfo, nil
 }
 
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go
@@ -1309,12 +1309,19 @@ func TestNodeGroupTemplateNodeInfo(t *testing.T) {
 		nodeGroupMaxSizeAnnotationKey: "10",
 	}
 
+	type testResourceSlice struct {
+		driverName string
+		gpuCount   int
+		deviceType string
+	}
+
 	type testCaseConfig struct {
-		nodeLabels         map[string]string
-		includeNodes       bool
-		expectedErr        error
-		expectedCapacity   map[corev1.ResourceName]int64
-		expectedNodeLabels map[string]string
+		nodeLabels            map[string]string
+		includeNodes          bool
+		expectedErr           error
+		expectedCapacity      map[corev1.ResourceName]int64
+		expectedNodeLabels    map[string]string
+		expectedResourceSlice testResourceSlice
 	}
 
 	testCases := []struct {
@@ -1407,6 +1414,33 @@ func TestNodeGroupTemplateNodeInfo(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "When the NodeGroup can scale from zero and DRA is enabled, it creates ResourceSlice derived from the annotation of DRA driver name and GPU count",
+			nodeGroupAnnotations: map[string]string{
+				memoryKey:    "2048Mi",
+				cpuKey:       "2",
+				draDriverKey: "gpu.nvidia.com",
+				gpuCountKey:  "2",
+			},
+			config: testCaseConfig{
+				expectedErr: nil,
+				expectedCapacity: map[corev1.ResourceName]int64{
+					corev1.ResourceCPU:    2,
+					corev1.ResourceMemory: 2048 * 1024 * 1024,
+					corev1.ResourcePods:   110,
+				},
+				expectedResourceSlice: testResourceSlice{
+					driverName: "gpu.nvidia.com",
+					gpuCount:   2,
+					deviceType: GpuDeviceType,
+				},
+				expectedNodeLabels: map[string]string{
+					"kubernetes.io/os":       "linux",
+					"kubernetes.io/arch":     "amd64",
+					"kubernetes.io/hostname": "random value",
+				},
+			},
+		},
 	}
 
 	test := func(t *testing.T, testConfig *testConfig, config testCaseConfig) {
@@ -1470,6 +1504,18 @@ func TestNodeGroupTemplateNodeInfo(t *testing.T) {
 				}
 			}
 		}
+		for _, resourceslice := range nodeInfo.LocalResourceSlices {
+			if resourceslice.Spec.Driver != config.expectedResourceSlice.driverName {
+				t.Errorf("Expected DRA driver in ResourceSlice to have: %s, but got: %s", config.expectedResourceSlice.driverName, resourceslice.Spec.Driver)
+			} else if len(resourceslice.Spec.Devices) != config.expectedResourceSlice.gpuCount {
+				t.Errorf("Expected the number of DRA devices in ResourceSlice to have: %d, but got: %d", config.expectedResourceSlice.gpuCount, len(resourceslice.Spec.Devices))
+			}
+			for _, device := range resourceslice.Spec.Devices {
+				if *device.Basic.Attributes["type"].StringValue != config.expectedResourceSlice.deviceType {
+					t.Errorf("Expected device type to have: %s, but got: %s", config.expectedResourceSlice.deviceType, *device.Basic.Attributes["type"].StringValue)
+				}
+			}
+		}
 	}
 
 	for _, tc := range testCases {
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go
@@ -20,18 +20,21 @@ import (
 	"context"
 	"fmt"
 	"path"
+	"strconv"
 	"strings"
 	"time"
 
 	"github.com/pkg/errors"
 	apiv1 "k8s.io/api/core/v1"
 	corev1 "k8s.io/api/core/v1"
+	resourceapi "k8s.io/api/resource/v1beta1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/util/validation"
 	klog "k8s.io/klog/v2"
+	"k8s.io/utils/ptr"
 )
 
 type unstructuredScalableResource struct {
@@ -297,6 +300,46 @@ func (r unstructuredScalableResource) InstanceCapacity() (map[corev1.ResourceNam
 	return capacity, nil
 }
 
+func (r unstructuredScalableResource) InstanceResourceSlices(nodeName string) ([]*resourceapi.ResourceSlice, error) {
+	driver := r.InstanceDRADriver()
+	gpuCount, err := r.InstanceGPUCapacityAnnotation()
+	if err != nil {
+		return nil, err
+	}
+
+	var result []*resourceapi.ResourceSlice
+	if driver != "" && !gpuCount.IsZero() {
+		resourceslice := &resourceapi.ResourceSlice{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: nodeName + "-" + driver,
+			},
+			Spec: resourceapi.ResourceSliceSpec{
+				Driver:   driver,
+				NodeName: nodeName,
+				Pool: resourceapi.ResourcePool{
+					Name: nodeName,
+				},
+			},
+		}
+		for i := 0; i < int(gpuCount.Value()); i++ {
+			device := resourceapi.Device{
+				Name: "gpu-" + strconv.Itoa(i),
+				Basic: &resourceapi.BasicDevice{
+					Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
+						"type": {
+							StringValue: ptr.To(GpuDeviceType),
+						},
+					},
+				},
+			}
+			resourceslice.Spec.Devices = append(resourceslice.Spec.Devices, device)
+		}
+		result = append(result, resourceslice)
+		return result, nil
+	}
+	return nil, nil
+}
+
 func (r unstructuredScalableResource) InstanceEphemeralDiskCapacityAnnotation() (resource.Quantity, error) {
 	return parseEphemeralDiskCapacity(r.unstructured.GetAnnotations())
 }
@@ -321,6 +364,10 @@ func (r unstructuredScalableResource) InstanceMaxPodsCapacityAnnotation() (resou
 	return parseMaxPodsCapacity(r.unstructured.GetAnnotations())
 }
 
+// InstanceDRADriver returns the DRA driver name that parseDRADriver extracts
+// from the annotations of the underlying unstructured resource.
+func (r unstructuredScalableResource) InstanceDRADriver() string {
+	annotations := r.unstructured.GetAnnotations()
+	return parseDRADriver(annotations)
+}
+
 func (r unstructuredScalableResource) readInfrastructureReferenceResource() (*unstructured.Unstructured, error) {
 	infraref, found, err := unstructured.NestedStringMap(r.unstructured.Object, "spec", "template", "spec", "infrastructureRef")
 	if !found || err != nil {
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured_test.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured_test.go
@@ -24,10 +24,12 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	v1 "k8s.io/api/core/v1"
+	resourceapi "k8s.io/api/resource/v1beta1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/client-go/tools/cache"
+	"k8s.io/utils/ptr"
 )
 
 const (
@@ -297,6 +299,32 @@ func TestAnnotations(t *testing.T) {
 	gpuQuantity := resource.MustParse("1")
 	maxPodsQuantity := resource.MustParse("42")
 	expectedTaints := []v1.Taint{{Key: "key1", Effect: v1.TaintEffectNoSchedule, Value: "value1"}, {Key: "key2", Effect: v1.TaintEffectNoExecute, Value: "value2"}}
+	testNodeName := "test-node"
+	draDriver := "test-driver"
+	expectedResourceSlice := &resourceapi.ResourceSlice{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: testNodeName + "-" + draDriver,
+		},
+		Spec: resourceapi.ResourceSliceSpec{
+			Driver:   draDriver,
+			NodeName: testNodeName,
+			Pool: resourceapi.ResourcePool{
+				Name: testNodeName,
+			},
+			Devices: []resourceapi.Device{
+				{
+					Name: "gpu-0",
+					Basic: &resourceapi.BasicDevice{
+						Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
+							"type": {
+								StringValue: ptr.To(GpuDeviceType),
+							},
+						},
+					},
+				},
+			},
+		},
+	}
 	annotations := map[string]string{
 		cpuKey:          cpuQuantity.String(),
 		memoryKey:       memQuantity.String(),
@@ -305,6 +333,7 @@ func TestAnnotations(t *testing.T) {
 		maxPodsKey:      maxPodsQuantity.String(),
 		taintsKey:       "key1=value1:NoSchedule,key2=value2:NoExecute",
 		labelsKey:       "key3=value3,key4=value4,key5=value5",
+		draDriverKey:    draDriver,
 	}
 
 	test := func(t *testing.T, testConfig *testConfig, testResource *unstructured.Unstructured) {
@@ -346,6 +375,14 @@ func TestAnnotations(t *testing.T) {
 			t.Errorf("expected %v, got %v", maxPodsQuantity, maxPods)
 		}
 
+		if resourceSlices, err := sr.InstanceResourceSlices(testNodeName); err != nil {
+			t.Fatal(err)
+		} else {
+			for _, resourceslice := range resourceSlices {
+				assert.Equal(t, expectedResourceSlice, resourceslice)
+			}
+		}
+
 		taints := sr.Taints()
 		assert.Equal(t, expectedTaints, taints)
 
diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go
@@ -40,6 +40,7 @@ const (
 	maxPodsKey      = "capacity.cluster-autoscaler.kubernetes.io/maxPods"
 	taintsKey       = "capacity.cluster-autoscaler.kubernetes.io/taints"
 	labelsKey       = "capacity.cluster-autoscaler.kubernetes.io/labels"
+	draDriverKey    = "capacity.cluster-autoscaler.kubernetes.io/dra-driver"
 	// UnknownArch is used if the Architecture is Unknown
 	UnknownArch SystemArchitecture = ""
 	// Amd64 is used if the Architecture is x86_64
@@ -54,6 +55,8 @@ const (
 	DefaultArch = Amd64
 	// scaleUpFromZeroDefaultEnvVar is the name of the env var for the default architecture
 	scaleUpFromZeroDefaultArchEnvVar = "CAPI_SCALE_ZERO_DEFAULT_ARCH"
+	// GpuDeviceType is used if DRA device is GPU
+	GpuDeviceType = "gpu"
 )
 
 var (
@@ -282,6 +285,13 @@ func parseMaxPodsCapacity(annotations map[string]string) (resource.Quantity, err
 	return parseIntKey(annotations, maxPodsKey)
 }
 
+// parseDRADriver returns the value stored under draDriverKey in annotations,
+// or the empty string when that key is not present.
+func parseDRADriver(annotations map[string]string) string {
+	// Indexing a map (even a nil one) with a missing key yields the zero
+	// value, which for a string is "".
+	return annotations[draDriverKey]
+}
+
 func clusterNameFromResource(r *unstructured.Unstructured) string {
 	// Use Spec.ClusterName if defined (only available on v1alpha3+ types)
 	clusterName, found, err := unstructured.NestedString(r.Object, "spec", "clusterName")

Original file line number	Diff line number	Diff line change
`@@ -283,7 +283,12 @@ func (ng nodegroup) TemplateNodeInfo() (framework.NodeInfo, error) {`
`283`	`283`	`return nil, err`
`284`	`284`	`}`
`285`	`285`
`286`		`- nodeInfo := framework.NewNodeInfo(&node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.scalableResource.Name())})`
	`286`	`+ resourceSlices, err := ng.scalableResource.InstanceResourceSlices(nodeName)`
	`287`	`+ if err != nil {`
	`288`	`+ return nil, err`
	`289`	`+ }`
	`290`	`+`
	`291`	`+ nodeInfo := framework.NewNodeInfo(&node, resourceSlices, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.scalableResource.Name())})`
`287`	`292`	`return nodeInfo, nil`
`288`	`293`	`}`
`289`	`294`