From 86ee2b723a810561ba4e85ecd24e0c20c6c215c2 Mon Sep 17 00:00:00 2001
From: Justyna Betkier
Date: Mon, 30 Dec 2024 14:13:36 +0100
Subject: [PATCH] Improve logging when the cluster reaches max nodes total.

- add an autoscaling status to reflect this
- change the log severity to warning, as this means that the autoscaler
  will not be fully functional (in particular, scaling up will not work)
- fix the scale up enforcer logic so that it no longer skips the logging
  point when max nodes total is reached

---
 cluster-autoscaler/core/static_autoscaler.go  | 17 +++++++++++------
 .../status/scale_up_status_processor.go       |  2 ++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 3e7e5781e42c..c88bbfed2dce 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -519,15 +519,17 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		return false, nil
 	}
 
-	forceScaleUp := a.processors.ScaleUpEnforcer.ShouldForceScaleUp(unschedulablePodsToHelp)
+	shouldScaleUp := true
 
 	if len(unschedulablePodsToHelp) == 0 {
 		scaleUpStatus.Result = status.ScaleUpNotNeeded
 		klog.V(1).Info("No unschedulable pods")
-	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal && !forceScaleUp {
-		scaleUpStatus.Result = status.ScaleUpNoOptionsAvailable
-		klog.V(1).Infof("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", a.MaxNodesTotal, len(readyNodes))
-	} else if len(a.BypassedSchedulers) == 0 && !forceScaleUp && allPodsAreNew(unschedulablePodsToHelp, currentTime) {
+		shouldScaleUp = false
+	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
+		scaleUpStatus.Result = status.ScaleUpLimitedByMaxNodesTotal
+		klog.Warningf("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", a.MaxNodesTotal, len(readyNodes))
+		shouldScaleUp = false
+	} else if len(a.BypassedSchedulers) == 0 && allPodsAreNew(unschedulablePodsToHelp, currentTime) {
 		// The assumption here is that these pods have been created very recently and probably there
 		// is more pods to come. In theory we could check the newest pod time but then if pod were created
 		// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
@@ -537,7 +539,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		a.processorCallbacks.DisableScaleDownForLoop()
 		scaleUpStatus.Result = status.ScaleUpInCooldown
 		klog.V(1).Info("Unschedulable pods are very new, waiting one iteration for more")
-	} else {
+		shouldScaleUp = false
+	}
+
+	if shouldScaleUp || a.processors.ScaleUpEnforcer.ShouldForceScaleUp(unschedulablePodsToHelp) {
 		scaleUpStart := preScaleUp()
 		scaleUpStatus, typedErr = a.scaleUpOrchestrator.ScaleUp(unschedulablePodsToHelp, readyNodes, daemonsets, nodeInfosForGroups, false)
 		if exit, err := postScaleUp(scaleUpStart); exit {
diff --git a/cluster-autoscaler/processors/status/scale_up_status_processor.go b/cluster-autoscaler/processors/status/scale_up_status_processor.go
index 2bd48ba1ce45..708bb0e232ba 100644
--- a/cluster-autoscaler/processors/status/scale_up_status_processor.go
+++ b/cluster-autoscaler/processors/status/scale_up_status_processor.go
@@ -66,6 +66,8 @@ const (
 	ScaleUpNotTried
 	// ScaleUpInCooldown - the scale up wasn't even attempted, because it's in a cooldown state (it's suspended for a scheduled period of time).
 	ScaleUpInCooldown
+	// ScaleUpLimitedByMaxNodesTotal - the scale up wasn't attempted, because the cluster has reached the max nodes total.
+	ScaleUpLimitedByMaxNodesTotal
 )
 
 // WasSuccessful returns true if the scale-up was successful.
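
Illustration (not part of the patch itself): a minimal, self-contained Go sketch of the gating flow that the static_autoscaler.go hunks introduce. The function and constant names below are simplified stand-ins invented for this sketch; the bypassed-schedulers check, the scale-down callbacks, and the real status types are omitted. It shows that the max-nodes branch now always records its status (and, in the real code, logs a warning) before the enforcer may still force a scale-up.

package main

import "fmt"

const (
	resultNotNeeded              = "ScaleUpNotNeeded"
	resultLimitedByMaxNodesTotal = "ScaleUpLimitedByMaxNodesTotal"
	resultInCooldown             = "ScaleUpInCooldown"
	resultAttempted              = "scale-up attempted"
)

// decideScaleUp mirrors the patched control flow: each skip branch sets a
// status before the force check runs, so reaching MaxNodesTotal is always
// reported even when a forced scale-up follows.
func decideScaleUp(unschedulablePods, readyNodes, maxNodesTotal int, allPodsAreNew, forceScaleUp bool) string {
	result := resultAttempted
	shouldScaleUp := true

	switch {
	case unschedulablePods == 0:
		result = resultNotNeeded
		shouldScaleUp = false
	case maxNodesTotal > 0 && readyNodes >= maxNodesTotal:
		// Before the patch this branch was skipped when a forced scale-up
		// was pending; now the status and warning are emitted first.
		result = resultLimitedByMaxNodesTotal
		shouldScaleUp = false
	case allPodsAreNew:
		result = resultInCooldown
		shouldScaleUp = false
	}

	if shouldScaleUp || (unschedulablePods > 0 && forceScaleUp) {
		// In the real code this is where the orchestrator's ScaleUp runs
		// and overwrites scaleUpStatus with its own result.
		result = resultAttempted
	}
	return result
}

func main() {
	// At the node limit without a forced scale-up: the new status surfaces.
	fmt.Println(decideScaleUp(5, 100, 100, false, false))
	// At the node limit with a forced scale-up: the attempt still happens,
	// but the limit was recorded (and logged) first.
	fmt.Println(decideScaleUp(5, 100, 100, false, true))
}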