Skip to content

Commit

Permalink
cluster-api: node template in scale-from-0-nodes scenario with DRA
Browse files Browse the repository at this point in the history
Modify TemplateNodeInfo() to also return template ResourceSlices for the node.
This extends Cluster Autoscaler's DRA support to the cluster-api provider, allowing users to set the number of GPUs and the DRA driver name by specifying
annotations on the NodeGroup provided by cluster-api.

Signed-off-by: Tsubasa Watanabe <[email protected]>
  • Loading branch information
ttsuuubasa committed Feb 5, 2025
1 parent dffe7ac commit 40c71bb
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,12 @@ func (ng *nodegroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
return nil, err
}

nodeInfo := framework.NewNodeInfo(&node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.scalableResource.Name())})
resourceSlices, err := ng.scalableResource.InstanceResourceSlices(nodeName)
if err != nil {
return nil, err
}

nodeInfo := framework.NewNodeInfo(&node, resourceSlices, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.scalableResource.Name())})
return nodeInfo, nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@ import (
"context"
"fmt"
"path"
"strconv"
"strings"
"time"

"github.com/pkg/errors"
apiv1 "k8s.io/api/core/v1"
corev1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/validation"
klog "k8s.io/klog/v2"
"k8s.io/utils/ptr"
)

type unstructuredScalableResource struct {
Expand Down Expand Up @@ -297,6 +300,46 @@ func (r unstructuredScalableResource) InstanceCapacity() (map[corev1.ResourceNam
return capacity, nil
}

// InstanceResourceSlices builds the template DRA ResourceSlices for a node
// named nodeName that would be created from this scalable resource.
//
// The driver name comes from InstanceDRADriver and the device count from
// InstanceGPUCapacityAnnotation. When a driver is set and the GPU count is
// positive, a single ResourceSlice named "<nodeName>-<driver>" is returned. It
// is bound to nodeName, uses nodeName as its pool name, and lists one device
// per GPU ("gpu-0", "gpu-1", ...), each carrying a "type" attribute equal to
// GpuDeviceType. In every other case it returns (nil, nil).
//
// An error from InstanceGPUCapacityAnnotation is returned unchanged.
func (r unstructuredScalableResource) InstanceResourceSlices(nodeName string) ([]*resourceapi.ResourceSlice, error) {
	driver := r.InstanceDRADriver()
	gpuCount, err := r.InstanceGPUCapacityAnnotation()
	if err != nil {
		return nil, err
	}

	// Nothing to advertise without a driver name or with a non-positive
	// device count. Checking "<= 0" rather than "not zero" keeps a negative
	// annotation value from producing a ResourceSlice with no devices.
	count := int(gpuCount.Value())
	if driver == "" || count <= 0 {
		return nil, nil
	}

	devices := make([]resourceapi.Device, 0, count)
	for i := 0; i < count; i++ {
		devices = append(devices, resourceapi.Device{
			Name: "gpu-" + strconv.Itoa(i),
			Basic: &resourceapi.BasicDevice{
				Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
					"type": {
						StringValue: ptr.To(GpuDeviceType),
					},
				},
			},
		})
	}

	resourceSlice := &resourceapi.ResourceSlice{
		ObjectMeta: metav1.ObjectMeta{
			Name: nodeName + "-" + driver,
		},
		Spec: resourceapi.ResourceSliceSpec{
			Driver:   driver,
			NodeName: nodeName,
			Pool: resourceapi.ResourcePool{
				Name: nodeName,
			},
			Devices: devices,
		},
	}
	return []*resourceapi.ResourceSlice{resourceSlice}, nil
}

// InstanceEphemeralDiskCapacityAnnotation returns the result of
// parseEphemeralDiskCapacity applied to the annotations of the underlying
// unstructured resource.
func (r unstructuredScalableResource) InstanceEphemeralDiskCapacityAnnotation() (resource.Quantity, error) {
	annotations := r.unstructured.GetAnnotations()
	return parseEphemeralDiskCapacity(annotations)
}
Expand All @@ -321,6 +364,10 @@ func (r unstructuredScalableResource) InstanceMaxPodsCapacityAnnotation() (resou
return parseMaxPodsCapacity(r.unstructured.GetAnnotations())
}

// InstanceDRADriver returns the DRA driver name that parseDRADriver extracts
// from the annotations of the underlying unstructured resource.
func (r unstructuredScalableResource) InstanceDRADriver() string {
	annotations := r.unstructured.GetAnnotations()
	return parseDRADriver(annotations)
}

func (r unstructuredScalableResource) readInfrastructureReferenceResource() (*unstructured.Unstructured, error) {
infraref, found, err := unstructured.NestedStringMap(r.unstructured.Object, "spec", "template", "spec", "infrastructureRef")
if !found || err != nil {
Expand Down
10 changes: 10 additions & 0 deletions cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const (
maxPodsKey = "capacity.cluster-autoscaler.kubernetes.io/maxPods"
taintsKey = "capacity.cluster-autoscaler.kubernetes.io/taints"
labelsKey = "capacity.cluster-autoscaler.kubernetes.io/labels"
draDriverKey = "capacity.cluster-autoscaler.kubernetes.io/dra-driver"
// UnknownArch is used if the Architecture is Unknown
UnknownArch SystemArchitecture = ""
// Amd64 is used if the Architecture is x86_64
Expand All @@ -54,6 +55,8 @@ const (
DefaultArch = Amd64
// scaleUpFromZeroDefaultEnvVar is the name of the env var for the default architecture
scaleUpFromZeroDefaultArchEnvVar = "CAPI_SCALE_ZERO_DEFAULT_ARCH"
// GpuDeviceType is the "type" attribute value set on template DRA devices that are GPUs
GpuDeviceType = "gpu"
)

var (
Expand Down Expand Up @@ -282,6 +285,13 @@ func parseMaxPodsCapacity(annotations map[string]string) (resource.Quantity, err
return parseIntKey(annotations, maxPodsKey)
}

// parseDRADriver returns the value stored under draDriverKey in annotations,
// or the empty string when the key is absent (including for a nil map).
func parseDRADriver(annotations map[string]string) string {
	// Indexing a map with a missing key yields the zero value "", so no
	// explicit presence check is needed.
	return annotations[draDriverKey]
}

func clusterNameFromResource(r *unstructured.Unstructured) string {
// Use Spec.ClusterName if defined (only available on v1alpha3+ types)
clusterName, found, err := unstructured.NestedString(r.Object, "spec", "clusterName")
Expand Down

0 comments on commit 40c71bb

Please sign in to comment.