Skip to content

Commit

Permalink
Merge pull request #7531 from willie-yao/fast-delete-toggle
Browse files Browse the repository at this point in the history
Add flag to enable fast delete of failed VMSS
  • Loading branch information
k8s-ci-robot authored Nov 26, 2024
2 parents 86a80c6 + 064d48f commit 930f148
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 56 deletions.
6 changes: 6 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ type Config struct {

// StrictCacheUpdates updates cache values only after positive validation from Azure APIs
StrictCacheUpdates bool `json:"strictCacheUpdates,omitempty" yaml:"strictCacheUpdates,omitempty"`

// EnableFastDeleteOnFailedProvisioning defines whether to delete the experimental faster VMSS instance deletion on failed provisioning
EnableFastDeleteOnFailedProvisioning bool `json:"enableFastDeleteOnFailedProvisioning,omitempty" yaml:"enableFastDeleteOnFailedProvisioning,omitempty"`
}

// These are only here for backward compabitility. Their equivalent exists in providerazure.Config with a different name.
Expand Down Expand Up @@ -295,6 +298,9 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
if _, err = assignIntFromEnvIfExists(&cfg.CloudProviderRateLimitBucketWrite, "RATE_LIMIT_WRITE_BUCKETS"); err != nil {
return nil, err
}
if _, err = assignBoolFromEnvIfExists(&cfg.EnableFastDeleteOnFailedProvisioning, "AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING"); err != nil {
return nil, err
}

// Nonstatic defaults
cfg.VMType = strings.ToLower(cfg.VMType)
Expand Down
23 changes: 14 additions & 9 deletions cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ const validAzureCfg = `{
"routeRateLimit": {
"cloudProviderRateLimit": true,
"cloudProviderRateLimitQPS": 3
}
},
"enableFastDeleteOnFailedProvisioning": true
}`

const validAzureCfgLegacy = `{
Expand Down Expand Up @@ -273,8 +274,9 @@ func TestCreateAzureManagerValidConfig(t *testing.T) {
},
},
},
VmssVmsCacheJitter: 120,
MaxDeploymentsCount: 8,
VmssVmsCacheJitter: 120,
MaxDeploymentsCount: 8,
EnableFastDeleteOnFailedProvisioning: true,
}

assert.NoError(t, err)
Expand Down Expand Up @@ -648,12 +650,13 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) {
},
},
},
ClusterName: "mycluster",
ClusterResourceGroup: "myrg",
ARMBaseURLForAPClient: "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local",
Deployment: "deployment",
VmssVmsCacheJitter: 90,
MaxDeploymentsCount: 8,
ClusterName: "mycluster",
ClusterResourceGroup: "myrg",
ARMBaseURLForAPClient: "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local",
Deployment: "deployment",
VmssVmsCacheJitter: 90,
MaxDeploymentsCount: 8,
EnableFastDeleteOnFailedProvisioning: true,
}

t.Setenv("ARM_CLOUD", "AzurePublicCloud")
Expand Down Expand Up @@ -686,6 +689,7 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) {
t.Setenv("CLUSTER_NAME", "mycluster")
t.Setenv("ARM_CLUSTER_RESOURCE_GROUP", "myrg")
t.Setenv("ARM_BASE_URL_FOR_AP_CLIENT", "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local")
t.Setenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING", "true")

t.Run("environment variables correctly set", func(t *testing.T) {
manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient)
Expand Down Expand Up @@ -931,6 +935,7 @@ func TestCreateAzureManagerWithEnvOverridingConfig(t *testing.T) {
t.Setenv("CLUSTER_NAME", "mycluster")
t.Setenv("ARM_CLUSTER_RESOURCE_GROUP", "myrg")
t.Setenv("ARM_BASE_URL_FOR_AP_CLIENT", "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local")
t.Setenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING", "true")

t.Run("environment variables correctly set", func(t *testing.T) {
manager, err := createAzureManagerInternal(strings.NewReader(validAzureCfgForStandardVMType), cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient)
Expand Down
46 changes: 26 additions & 20 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ type ScaleSet struct {

// uses Azure Dedicated Host
dedicatedHost bool

enableFastDeleteOnFailedProvisioning bool
}

// NewScaleSet creates a new NewScaleSet.
Expand Down Expand Up @@ -128,6 +130,8 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d
klog.V(2).Infof("enableDetailedCSEMessage: %t", scaleSet.enableDetailedCSEMessage)
}

scaleSet.enableFastDeleteOnFailedProvisioning = az.config.EnableFastDeleteOnFailedProvisioning

return scaleSet, nil
}

Expand Down Expand Up @@ -705,7 +709,7 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForFlex() error {
return rerr.Error()
}

scaleSet.instanceCache = buildInstanceCacheForFlex(vms)
scaleSet.instanceCache = buildInstanceCacheForFlex(vms, scaleSet.enableFastDeleteOnFailedProvisioning)
scaleSet.lastInstanceRefresh = lastRefresh

return nil
Expand Down Expand Up @@ -763,21 +767,21 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForUniform() error {
// Note that the GetScaleSetVms() results is not used directly because for the List endpoint,
// their resource ID format is not consistent with Get endpoint
// buildInstanceCacheForFlex used by orchestrationMode == compute.Flexible
func buildInstanceCacheForFlex(vms []compute.VirtualMachine) []cloudprovider.Instance {
func buildInstanceCacheForFlex(vms []compute.VirtualMachine, enableFastDeleteOnFailedProvisioning bool) []cloudprovider.Instance {
var instances []cloudprovider.Instance
for _, vm := range vms {
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}
addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState)
addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState, enableFastDeleteOnFailedProvisioning)
}

return instances
}

// addVMToCache used by orchestrationMode == compute.Flexible
func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string) {
func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) {
// The resource ID is empty string, which indicates the instance may be in deleting state.
if len(*id) == 0 {
return
Expand All @@ -792,13 +796,13 @@ func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *st

*instances = append(*instances, cloudprovider.Instance{
Id: azurePrefix + resourceID,
Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState),
Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState, enableFastDeleteOnFailedProvisioning),
})
}

// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state to cloudprovider.InstanceStatus
// instanceStatusFromProvisioningStateAndPowerState used by orchestrationMode == compute.Flexible
func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string) *cloudprovider.InstanceStatus {
func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) *cloudprovider.InstanceStatus {
if provisioningState == nil {
return nil
}
Expand All @@ -812,21 +816,23 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisi
case provisioningStateCreating:
status.State = cloudprovider.InstanceCreating
case provisioningStateFailed:
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceID, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
if enableFastDeleteOnFailedProvisioning {
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceID, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceID, powerState)
status.State = cloudprovider.InstanceRunning
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceID, powerState)
status.State = cloudprovider.InstanceRunning
}
default:
status.State = cloudprovider.InstanceRunning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package azure

import (
"fmt"
"strconv"
"sync"
"time"

Expand Down Expand Up @@ -211,6 +212,10 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe
}
return nil
}
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}

status := &cloudprovider.InstanceStatus{}
switch *vm.ProvisioningState {
Expand All @@ -219,26 +224,24 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe
case string(compute.GalleryProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.GalleryProvisioningStateFailed):
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}

// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", *vm.ID, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
klog.V(3).Infof("VM %s reports failed provisioning state with power state: %s, eligible for fast delete: %s", to.String(vm.ID), powerState, strconv.FormatBool(scaleSet.enableFastDeleteOnFailedProvisioning))
if scaleSet.enableFastDeleteOnFailedProvisioning {
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", *vm.ID, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", *vm.ID, powerState)
status.State = cloudprovider.InstanceRunning
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", *vm.ID, powerState)
status.State = cloudprovider.InstanceRunning
}
default:
status.State = cloudprovider.InstanceRunning
Expand All @@ -247,6 +250,7 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe
// Add vmssCSE Provisioning Failed Message in error info body for vmssCSE Extensions if enableDetailedCSEMessage is true
if scaleSet.enableDetailedCSEMessage && vm.InstanceView != nil {
if err, failed := scaleSet.cseErrors(vm.InstanceView.Extensions); failed {
klog.V(3).Infof("VM %s reports CSE failure: %v, with provisioning state %s, power state %s", to.String(vm.ID), err, to.String(vm.ProvisioningState), powerState)
status.State = cloudprovider.InstanceCreating
errorInfo := &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
Expand Down
18 changes: 10 additions & 8 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet {
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 1,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
manager: manager,
minSize: 1,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
enableFastDeleteOnFailedProvisioning: true,
}
}

Expand All @@ -55,10 +56,11 @@ func newTestScaleSetMinSizeZero(manager *AzureManager, name string) *ScaleSet {
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 0,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
manager: manager,
minSize: 0,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
enableFastDeleteOnFailedProvisioning: true,
}
}

Expand Down

0 comments on commit 930f148

Please sign in to comment.