From 2b62a7d6dfb23b2fc115546f17b1a53673c79986 Mon Sep 17 00:00:00 2001 From: Muhammad Soliman Date: Fri, 13 Dec 2024 13:39:12 +0100 Subject: [PATCH 1/2] Add option for passing extended resources in node labels in GCE on GCE, Cluster atuoscaler reads extended resource information from kubenv->AUTOSCALER_ENV_VARS->extended_resources in the managed scaling group template definition. However, users have no way to add a variable to extended resources, they are controlled from GKE side. This results in cluster autoscaler not supporting scale up from zero for all node pools that has extended resources (like GPU) on GCE. However, node labels are passed from the node pool to the managed scaling group template through the kubenv->AUTOSCALER_ENV_VARS->node_labels. This commit introduces the ability to pass extended resources as node labels with defined prefix on GCE, similar to how cluster autoscaler expects extended resources on AWS. This allows scaling from zero for node pools with extended resrouces. --- .../cloudprovider/gce/templates.go | 36 +++++++++++++++---- .../cloudprovider/gce/templates_test.go | 12 +++++++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/gce/templates.go b/cluster-autoscaler/cloudprovider/gce/templates.go index 678d8002cf02..db0cd234fb13 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates.go +++ b/cluster-autoscaler/cloudprovider/gce/templates.go @@ -438,14 +438,36 @@ func extractExtendedResourcesFromKubeEnv(kubeEnv KubeEnv) (apiv1.ResourceList, e klog.Warningf("error while obtaining extended_resources from AUTOSCALER_ENV_VARS; %v", err) return nil, err } - - if !found { - return apiv1.ResourceList{}, nil + var extendedResourcesMap map[string]string + if found { + extendedResourcesMap, err = parseKeyValueListToMap(extendedResourcesAsString) + if err != nil { + return apiv1.ResourceList{}, err + } + } else { + extendedResourcesMap = make(map[string]string) } - - extendedResourcesMap, err := parseKeyValueListToMap(extendedResourcesAsString) + nodeLabelsAsString, found, err := extractAutoscalerVarFromKubeEnv(kubeEnv, "node_labels") if err != nil { - return apiv1.ResourceList{}, err + klog.Warningf("error while obtaining node_labels from AUTOSCALER_ENV_VARS; %v", err) + return nil, err + } + if found { + nodeLabelsMap, err := parseKeyValueListToMap(nodeLabelsAsString) + if err != nil { + return apiv1.ResourceList{}, err + } + const extendedResourcesKeyPrefix = "clusterautoscaler-nodetemplate-resources-" + for key, value := range nodeLabelsMap { + if strings.HasPrefix(key, extendedResourcesKeyPrefix) { + key = strings.TrimPrefix(key, extendedResourcesKeyPrefix) + extendedResourcesMap[key] = value + } + } + } + + if len(extendedResourcesMap) == 0 { + return apiv1.ResourceList{}, nil } extendedResources := apiv1.ResourceList{} @@ -453,7 +475,7 @@ func extractExtendedResourcesFromKubeEnv(kubeEnv KubeEnv) (apiv1.ResourceList, e if q, err := resource.ParseQuantity(quantity); err == nil && q.Sign() >= 0 { extendedResources[apiv1.ResourceName(name)] = q } else if err != nil { - klog.Warningf("ignoring invalid value in extended_resources defined in AUTOSCALER_ENV_VARS; %v", err) + klog.Warningf("ignoring invalid value in extended_resources or node_labels defined in AUTOSCALER_ENV_VARS; %v", err) } } return extendedResources, nil diff --git a/cluster-autoscaler/cloudprovider/gce/templates_test.go b/cluster-autoscaler/cloudprovider/gce/templates_test.go index 4fa96f407e38..f611fad7c5b7 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates_test.go +++ b/cluster-autoscaler/cloudprovider/gce/templates_test.go @@ -1304,6 +1304,18 @@ func TestExtractExtendedResourcesFromKubeEnv(t *testing.T) { expectedExtendedResources: apiv1.ResourceList{}, expectedErr: true, }, + { + name: "two valid values one of them defined in node labels", + kubeEnvValue: "AUTOSCALER_ENV_VARS: node_labels=a=b,c=d,clusterautoscaler-nodetemplate-resources-test.co/test-resource=3,cloud.google.com/gke-nodepool=pool-3,cloud.google.com/gke-preemptible=true;" + + "node_taints='dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c';" + + "kube_reserved=cpu=1000m,memory=300000Mi;" + + "extended_resources=foo=bar,baz=10G", + expectedExtendedResources: apiv1.ResourceList{ + apiv1.ResourceName("baz"): *resource.NewQuantity(10*units.GB, resource.DecimalSI), + apiv1.ResourceName("test.co/test-resource"): *resource.NewQuantity(3, resource.DecimalSI), + }, + expectedErr: false, + }, } for _, tc := range testCases { From 4f13cabcb4f0b414a44fff5707305566614e7523 Mon Sep 17 00:00:00 2001 From: Muhammad Soliman Date: Wed, 12 Feb 2025 10:35:24 +0100 Subject: [PATCH 2/2] Fixes based on code review change last character in extended resources prefix to be `.` instead of `-`. Add a warning if the extended resource already exists. --- cluster-autoscaler/cloudprovider/gce/templates.go | 5 ++++- cluster-autoscaler/cloudprovider/gce/templates_test.go | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/gce/templates.go b/cluster-autoscaler/cloudprovider/gce/templates.go index db0cd234fb13..d54c876c527d 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates.go +++ b/cluster-autoscaler/cloudprovider/gce/templates.go @@ -457,10 +457,13 @@ func extractExtendedResourcesFromKubeEnv(kubeEnv KubeEnv) (apiv1.ResourceList, e if err != nil { return apiv1.ResourceList{}, err } - const extendedResourcesKeyPrefix = "clusterautoscaler-nodetemplate-resources-" + const extendedResourcesKeyPrefix = "clusterautoscaler-nodetemplate-resources." for key, value := range nodeLabelsMap { if strings.HasPrefix(key, extendedResourcesKeyPrefix) { key = strings.TrimPrefix(key, extendedResourcesKeyPrefix) + if _, existsBefore := extendedResourcesMap[key]; existsBefore { + klog.Warningf("extended resource %s defined twice in template", key) + } extendedResourcesMap[key] = value } } diff --git a/cluster-autoscaler/cloudprovider/gce/templates_test.go b/cluster-autoscaler/cloudprovider/gce/templates_test.go index f611fad7c5b7..ebaf2eaa1ab9 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates_test.go +++ b/cluster-autoscaler/cloudprovider/gce/templates_test.go @@ -1306,7 +1306,7 @@ func TestExtractExtendedResourcesFromKubeEnv(t *testing.T) { }, { name: "two valid values one of them defined in node labels", - kubeEnvValue: "AUTOSCALER_ENV_VARS: node_labels=a=b,c=d,clusterautoscaler-nodetemplate-resources-test.co/test-resource=3,cloud.google.com/gke-nodepool=pool-3,cloud.google.com/gke-preemptible=true;" + + kubeEnvValue: "AUTOSCALER_ENV_VARS: node_labels=a=b,c=d,clusterautoscaler-nodetemplate-resources.test.co/test-resource=3,cloud.google.com/gke-nodepool=pool-3,cloud.google.com/gke-preemptible=true;" + "node_taints='dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c';" + "kube_reserved=cpu=1000m,memory=300000Mi;" + "extended_resources=foo=bar,baz=10G",