From f243e383112d352f34a6643b001dfed60b9bef9e Mon Sep 17 00:00:00 2001
From: Justyna Betkier
Date: Mon, 30 Dec 2024 12:57:44 +0100
Subject: [PATCH] - Remove double logging of scale down errors -
 AbortNodeDeletion already logs the information.
 - Change the abort node deletion path to log a warning instead of an error
 when the scheduler manages to place a pod on a node that is about to be
 scaled down. This is a known race condition and should not be reported as
 an error, since such errors are misleading to on-callers (example:
 http://b/385203969#comment12)

---
 .../core/scaledown/actuation/actuator.go      | 15 ++++------
 .../scaledown/actuation/delete_in_batch.go    | 29 +++++++++++++------
 .../actuation/group_deletion_scheduler.go     | 17 +++++++----
 .../group_deletion_scheduler_test.go          |  2 +-
 4 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator.go b/cluster-autoscaler/core/scaledown/actuation/actuator.go
index df240538f293..6833cba7cda9 100644
--- a/cluster-autoscaler/core/scaledown/actuation/actuator.go
+++ b/cluster-autoscaler/core/scaledown/actuation/actuator.go
@@ -289,10 +289,9 @@ func (a *Actuator) deleteNodesAsync(nodes []*apiv1.Node, nodeGroup cloudprovider
 
     clusterSnapshot, err := a.createSnapshot(nodes)
     if err != nil {
-        klog.Errorf("Scale-down: couldn't create delete snapshot, err: %v", err)
         nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "createSnapshot returned error %v", err)}
         for _, node := range nodes {
-            a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to create delete snapshot", nodeDeleteResult)
+            a.nodeDeletionScheduler.AbortNodeDeletionDueToError(node, nodeGroup.Id(), drain, "failed to create delete snapshot", nodeDeleteResult)
         }
         return
     }
@@ -300,10 +299,9 @@ func (a *Actuator) deleteNodesAsync(nodes []*apiv1.Node, nodeGroup cloudprovider
     if drain {
         pdbs, err := a.ctx.PodDisruptionBudgetLister().List()
         if err != nil {
-            klog.Errorf("Scale-down: couldn't fetch pod disruption budgets, err: %v", err)
             nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "podDisruptionBudgetLister.List returned error %v", err)}
             for _, node := range nodes {
-                a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to fetch pod disruption budgets", nodeDeleteResult)
+                a.nodeDeletionScheduler.AbortNodeDeletionDueToError(node, nodeGroup.Id(), drain, "failed to fetch pod disruption budgets", nodeDeleteResult)
             }
             return
         }
@@ -319,24 +317,21 @@ func (a *Actuator) deleteNodesAsync(nodes []*apiv1.Node, nodeGroup cloudprovider
     for _, node := range nodes {
         nodeInfo, err := clusterSnapshot.GetNodeInfo(node.Name)
         if err != nil {
-            klog.Errorf("Scale-down: can't retrieve node %q from snapshot, err: %v", node.Name, err)
             nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "nodeInfos.Get for %q returned error: %v", node.Name, err)}
-            a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to get node info", nodeDeleteResult)
+            a.nodeDeletionScheduler.AbortNodeDeletionDueToError(node, nodeGroup.Id(), drain, "failed to get node info", nodeDeleteResult)
             continue
         }
 
         podsToRemove, _, _, err := simulator.GetPodsToMove(nodeInfo, a.deleteOptions, a.drainabilityRules, registry, remainingPdbTracker, time.Now())
         if err != nil {
klog.Errorf("Scale-down: couldn't delete node %q, err: %v", node.Name, err) nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "GetPodsToMove for %q returned error: %v", node.Name, err)} - a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to get pods to move on node", nodeDeleteResult) + a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "failed to get pods to move on node", nodeDeleteResult, true) continue } if !drain && len(podsToRemove) != 0 { - klog.Errorf("Scale-down: couldn't delete empty node %q, new pods got scheduled", node.Name) nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "failed to delete empty node %q, new pods scheduled", node.Name)} - a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "node is not empty", nodeDeleteResult) + a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroup.Id(), drain, "node is not empty", nodeDeleteResult, true) continue } diff --git a/cluster-autoscaler/core/scaledown/actuation/delete_in_batch.go b/cluster-autoscaler/core/scaledown/actuation/delete_in_batch.go index 496f046e3ae3..a30822231c4c 100644 --- a/cluster-autoscaler/core/scaledown/actuation/delete_in_batch.go +++ b/cluster-autoscaler/core/scaledown/actuation/delete_in_batch.go @@ -90,7 +90,7 @@ func (d *NodeDeletionBatcher) deleteNodesAndRegisterStatus(nodes []*apiv1.Node, for _, node := range nodes { if err != nil { result := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: err} - CleanUpAndRecordFailedScaleDownEvent(d.ctx, node, nodeGroupId, drain, d.nodeDeletionTracker, "", result) + CleanUpAndRecordErrorForFailedScaleDownEvent(d.ctx, node, nodeGroupId, drain, d.nodeDeletionTracker, "", result) } else { RegisterAndRecordSuccessfulScaleDownEvent(d.ctx, d.scaleStateNotifier, node, nodeGroup, drain, d.nodeDeletionTracker) } @@ -135,7 +135,7 @@ func (d *NodeDeletionBatcher) remove(nodeGroupId string) error { drain := drainedNodeDeletions[node.Name] if err != nil { result = status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: err} - CleanUpAndRecordFailedScaleDownEvent(d.ctx, node, nodeGroupId, drain, d.nodeDeletionTracker, "", result) + CleanUpAndRecordErrorForFailedScaleDownEvent(d.ctx, node, nodeGroupId, drain, d.nodeDeletionTracker, "", result) } else { RegisterAndRecordSuccessfulScaleDownEvent(d.ctx, d.scaleStateNotifier, node, nodeGroup, drain, d.nodeDeletionTracker) } @@ -185,16 +185,27 @@ func IsNodeBeingDeleted(node *apiv1.Node, timestamp time.Time) bool { return deleteTime != nil && (timestamp.Sub(*deleteTime) < MaxCloudProviderNodeDeletionTime || timestamp.Sub(*deleteTime) < MaxKubernetesEmptyNodeDeletionTime) } -// CleanUpAndRecordFailedScaleDownEvent record failed scale down event and log an error. -func CleanUpAndRecordFailedScaleDownEvent(ctx *context.AutoscalingContext, node *apiv1.Node, nodeGroupId string, drain bool, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, errMsg string, status status.NodeDeleteResult) { - if drain { - klog.Errorf("Scale-down: couldn't delete node %q with drain, %v, status error: %v", node.Name, errMsg, status.Err) - ctx.Recorder.Eventf(node, apiv1.EventTypeWarning, "ScaleDownFailed", "failed to drain and delete node: %v", status.Err) +// CleanUpAndRecordErrorForFailedScaleDownEvent record failed scale down event and log an error. 
+func CleanUpAndRecordErrorForFailedScaleDownEvent(ctx *context.AutoscalingContext, node *apiv1.Node, nodeGroupId string, drain bool, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, errMsg string, status status.NodeDeleteResult) {
+    CleanUpAndRecordFailedScaleDownEvent(ctx, node, nodeGroupId, drain, nodeDeletionTracker, errMsg, status, false)
+}
+// CleanUpAndRecordFailedScaleDownEvent record failed scale down event and log a warning or an error.
+func CleanUpAndRecordFailedScaleDownEvent(ctx *context.AutoscalingContext, node *apiv1.Node, nodeGroupId string, drain bool, nodeDeletionTracker *deletiontracker.NodeDeletionTracker, errMsg string, status status.NodeDeleteResult, logAsWarning bool) {
+    var logMsgFormat, eventMsgFormat string
+    if drain {
+        logMsgFormat = "couldn't delete node %q with drain"
+        eventMsgFormat = "failed to drain and delete node"
+    } else {
+        logMsgFormat = "couldn't delete empty node %q"
+        eventMsgFormat = "failed to delete empty node"
+    }
+    if logAsWarning {
+        klog.Warningf("Scale-down: "+logMsgFormat+", %v, status error: %v", node.Name, errMsg, status.Err)
     } else {
-        klog.Errorf("Scale-down: couldn't delete empty node, %v, status error: %v", errMsg, status.Err)
-        ctx.Recorder.Eventf(node, apiv1.EventTypeWarning, "ScaleDownFailed", "failed to delete empty node: %v", status.Err)
+        klog.Errorf("Scale-down: "+logMsgFormat+", %v, status error: %v", node.Name, errMsg, status.Err)
     }
+    ctx.Recorder.Eventf(node, apiv1.EventTypeWarning, "ScaleDownFailed", eventMsgFormat+": %v", status.Err)
     taints.CleanToBeDeleted(node, ctx.ClientSet, ctx.CordonNodeBeforeTerminate)
     nodeDeletionTracker.EndDeletion(nodeGroupId, node.Name, status)
 }
 
diff --git a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go
index cc67f65e1262..619aa57877fe 100644
--- a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go
+++ b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go
@@ -80,7 +80,7 @@ func (ds *GroupDeletionScheduler) ScheduleDeletion(nodeInfo *framework.NodeInfo,
     opts, err := nodeGroup.GetOptions(ds.ctx.NodeGroupDefaults)
     if err != nil && err != cloudprovider.ErrNotImplemented {
         nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerErrorf(errors.InternalError, "GetOptions returned error %v", err)}
-        ds.AbortNodeDeletion(nodeInfo.Node(), nodeGroup.Id(), drain, "failed to get autoscaling options for a node group", nodeDeleteResult)
+        ds.AbortNodeDeletionDueToError(nodeInfo.Node(), nodeGroup.Id(), drain, "failed to get autoscaling options for a node group", nodeDeleteResult)
         return
     }
     if opts == nil {
@@ -89,7 +89,7 @@ func (ds *GroupDeletionScheduler) ScheduleDeletion(nodeInfo *framework.NodeInfo,
 
     nodeDeleteResult := ds.prepareNodeForDeletion(nodeInfo, drain)
     if nodeDeleteResult.Err != nil {
-        ds.AbortNodeDeletion(nodeInfo.Node(), nodeGroup.Id(), drain, "prepareNodeForDeletion failed", nodeDeleteResult)
+        ds.AbortNodeDeletion(nodeInfo.Node(), nodeGroup.Id(), drain, "prepareNodeForDeletion failed", nodeDeleteResult, true)
         return
     }
 
@@ -122,7 +122,7 @@ func (ds *GroupDeletionScheduler) addToBatcher(nodeInfo *framework.NodeInfo, nod
     if atomic {
         if ds.failuresForGroup[nodeGroup.Id()] {
             nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: errors.NewAutoscalerError(errors.TransientError, "couldn't scale down other nodes in this node group")}
-            CleanUpAndRecordFailedScaleDownEvent(ds.ctx, nodeInfo.Node(), nodeGroup.Id(), drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
+            CleanUpAndRecordErrorForFailedScaleDownEvent(ds.ctx, nodeInfo.Node(), nodeGroup.Id(), drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
             delete(ds.nodeQueue, nodeGroup.Id())
         }
         if len(ds.nodeQueue[nodeGroup.Id()]) < batchSize {
@@ -134,18 +134,23 @@ func (ds *GroupDeletionScheduler) addToBatcher(nodeInfo *framework.NodeInfo, nod
     ds.nodeQueue[nodeGroup.Id()] = []*apiv1.Node{}
 }
 
+// AbortNodeDeletionDueToError frees up a node that couldn't be deleted successfully. If it was a part of a group, the same is applied for other nodes queued for deletion.
+func (ds *GroupDeletionScheduler) AbortNodeDeletionDueToError(node *apiv1.Node, nodeGroupId string, drain bool, errMsg string, result status.NodeDeleteResult) {
+    ds.AbortNodeDeletion(node, nodeGroupId, drain, errMsg, result, false)
+}
+
 // AbortNodeDeletion frees up a node that couldn't be deleted successfully. If it was a part of a group, the same is applied for other nodes queued for deletion.
-func (ds *GroupDeletionScheduler) AbortNodeDeletion(node *apiv1.Node, nodeGroupId string, drain bool, errMsg string, result status.NodeDeleteResult) {
+func (ds *GroupDeletionScheduler) AbortNodeDeletion(node *apiv1.Node, nodeGroupId string, drain bool, errMsg string, result status.NodeDeleteResult, logAsWarning bool) {
     ds.Lock()
     defer ds.Unlock()
     ds.failuresForGroup[nodeGroupId] = true
-    CleanUpAndRecordFailedScaleDownEvent(ds.ctx, node, nodeGroupId, drain, ds.nodeDeletionTracker, errMsg, result)
+    CleanUpAndRecordFailedScaleDownEvent(ds.ctx, node, nodeGroupId, drain, ds.nodeDeletionTracker, errMsg, result, logAsWarning)
     for _, otherNode := range ds.nodeQueue[nodeGroupId] {
         if otherNode == node {
             continue
         }
         nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: errors.NewAutoscalerError(errors.TransientError, "couldn't scale down other nodes in this node group")}
-        CleanUpAndRecordFailedScaleDownEvent(ds.ctx, otherNode, nodeGroupId, drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult)
+        CleanUpAndRecordFailedScaleDownEvent(ds.ctx, otherNode, nodeGroupId, drain, ds.nodeDeletionTracker, "scale down failed for node group as a whole", nodeDeleteResult, logAsWarning)
     }
     delete(ds.nodeQueue, nodeGroupId)
 }
diff --git a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go
index d2c49625355c..324cc490c804 100644
--- a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go
+++ b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler_test.go
@@ -173,7 +173,7 @@ func TestScheduleDeletion(t *testing.T) {
         for _, bucket := range ti.toAbort {
             for _, node := range bucket.Nodes {
                 nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorFailedToDelete, Err: cmpopts.AnyError}
-                scheduler.AbortNodeDeletion(node, bucket.Group.Id(), false, "simulated abort", nodeDeleteResult)
+                scheduler.AbortNodeDeletionDueToError(node, bucket.Group.Id(), false, "simulated abort", nodeDeleteResult)
             }
         }
         if err := scheduleAll(ti.toScheduleAfterAbort, scheduler); err != nil {
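
Reviewer note: below is a minimal, self-contained Go sketch, not part of the patch, illustrating the pattern it introduces. It deliberately uses the standard library log package instead of klog and simplified stand-in types. The point it demonstrates: the failure message is built once, and the logAsWarning flag only selects the severity, so the known race where the scheduler places a pod on a node already chosen for scale down surfaces as a warning, while genuine failures remain errors.

package main

import (
	"fmt"
	"log"
)

// nodeDeleteResult is a simplified stand-in for status.NodeDeleteResult.
type nodeDeleteResult struct {
	err error
}

// cleanUpAndRecordFailedScaleDownEvent mirrors the shape of the patched helper:
// the message format is chosen once based on drain, and logAsWarning only
// selects the log severity.
func cleanUpAndRecordFailedScaleDownEvent(nodeName string, drain bool, errMsg string, result nodeDeleteResult, logAsWarning bool) {
	var logMsgFormat string
	if drain {
		logMsgFormat = "couldn't delete node %q with drain"
	} else {
		logMsgFormat = "couldn't delete empty node %q"
	}
	msg := fmt.Sprintf("Scale-down: "+logMsgFormat+", %v, status error: %v", nodeName, errMsg, result.err)
	if logAsWarning {
		log.Printf("W %s", msg) // klog.Warningf in the real helper
	} else {
		log.Printf("E %s", msg) // klog.Errorf in the real helper
	}
	// The real helper also emits a ScaleDownFailed event, removes the
	// ToBeDeleted taint and ends the deletion in the deletion tracker.
}

func main() {
	// Known race: the scheduler placed a pod on a node selected for scale down.
	// Expected to happen occasionally, so it is logged as a warning.
	cleanUpAndRecordFailedScaleDownEvent("node-1", false, "node is not empty",
		nodeDeleteResult{err: fmt.Errorf("new pods scheduled")}, true)

	// Genuine failures keep being logged as errors.
	cleanUpAndRecordFailedScaleDownEvent("node-2", true, "failed to fetch pod disruption budgets",
		nodeDeleteResult{err: fmt.Errorf("API error")}, false)
}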