From 86ee2b723a810561ba4e85ecd24e0c20c6c215c2 Mon Sep 17 00:00:00 2001
From: Justyna Betkier
Date: Mon, 30 Dec 2024 14:13:36 +0100
Subject: [PATCH] Improve logging when the cluster reaches max nodes total.

- add an autoscaling status to reflect this
- change the log severity to warning, as this means that the autoscaler
  will not be fully functional (in particular, scaling up will not work)
- fix the scale up enforcer logic so that it no longer skips the logging
  point when max nodes total is reached

---
 cluster-autoscaler/core/static_autoscaler.go  | 17 +++++++++++------
 .../status/scale_up_status_processor.go       |  2 ++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 3e7e5781e42c..c88bbfed2dce 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -519,15 +519,17 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		return false, nil
 	}
 
-	forceScaleUp := a.processors.ScaleUpEnforcer.ShouldForceScaleUp(unschedulablePodsToHelp)
+	shouldScaleUp := true
 
 	if len(unschedulablePodsToHelp) == 0 {
 		scaleUpStatus.Result = status.ScaleUpNotNeeded
 		klog.V(1).Info("No unschedulable pods")
-	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal && !forceScaleUp {
-		scaleUpStatus.Result = status.ScaleUpNoOptionsAvailable
-		klog.V(1).Infof("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", a.MaxNodesTotal, len(readyNodes))
-	} else if len(a.BypassedSchedulers) == 0 && !forceScaleUp && allPodsAreNew(unschedulablePodsToHelp, currentTime) {
+		shouldScaleUp = false
+	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
+		scaleUpStatus.Result = status.ScaleUpLimitedByMaxNodesTotal
+		klog.Warningf("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", a.MaxNodesTotal, len(readyNodes))
+		shouldScaleUp = false
+	} else if len(a.BypassedSchedulers) == 0 && allPodsAreNew(unschedulablePodsToHelp, currentTime) {
 		// The assumption here is that these pods have been created very recently and probably there
 		// is more pods to come. In theory we could check the newest pod time but then if pod were created
 		// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
@@ -537,7 +539,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		a.processorCallbacks.DisableScaleDownForLoop()
 		scaleUpStatus.Result = status.ScaleUpInCooldown
 		klog.V(1).Info("Unschedulable pods are very new, waiting one iteration for more")
-	} else {
+		shouldScaleUp = false
+	}
+
+	if shouldScaleUp || a.processors.ScaleUpEnforcer.ShouldForceScaleUp(unschedulablePodsToHelp) {
 		scaleUpStart := preScaleUp()
 		scaleUpStatus, typedErr = a.scaleUpOrchestrator.ScaleUp(unschedulablePodsToHelp, readyNodes, daemonsets, nodeInfosForGroups, false)
 		if exit, err := postScaleUp(scaleUpStart); exit {
diff --git a/cluster-autoscaler/processors/status/scale_up_status_processor.go b/cluster-autoscaler/processors/status/scale_up_status_processor.go
index 2bd48ba1ce45..708bb0e232ba 100644
--- a/cluster-autoscaler/processors/status/scale_up_status_processor.go
+++ b/cluster-autoscaler/processors/status/scale_up_status_processor.go
@@ -66,6 +66,8 @@ const (
 	ScaleUpNotTried
 	// ScaleUpInCooldown - the scale up wasn't even attempted, because it's in a cooldown state (it's suspended for a scheduled period of time).
 	ScaleUpInCooldown
+	// ScaleUpLimitedByMaxNodesTotal - the scale up wasn't attempted, because the cluster has reached the max nodes total.
+	ScaleUpLimitedByMaxNodesTotal
 )
 
 // WasSuccessful returns true if the scale-up was successful.
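
Illustration (not part of the patch itself): a minimal, self-contained Go sketch of the gating flow that the static_autoscaler.go hunks introduce. The function and constant names below are simplified stand-ins invented for this sketch; the bypassed-schedulers check, the scale-down callbacks, and the real status types are omitted. It shows that the max-nodes branch now always records its status (and, in the real code, logs a warning) before the enforcer may still force a scale-up.

package main

import "fmt"

const (
	resultNotNeeded              = "ScaleUpNotNeeded"
	resultLimitedByMaxNodesTotal = "ScaleUpLimitedByMaxNodesTotal"
	resultInCooldown             = "ScaleUpInCooldown"
	resultAttempted              = "scale-up attempted"
)

// decideScaleUp mirrors the patched control flow: each skip branch sets a
// status before the force check runs, so reaching MaxNodesTotal is always
// reported even when a forced scale-up follows.
func decideScaleUp(unschedulablePods, readyNodes, maxNodesTotal int, allPodsAreNew, forceScaleUp bool) string {
	result := resultAttempted
	shouldScaleUp := true

	switch {
	case unschedulablePods == 0:
		result = resultNotNeeded
		shouldScaleUp = false
	case maxNodesTotal > 0 && readyNodes >= maxNodesTotal:
		// Before the patch this branch was skipped when a forced scale-up
		// was pending; now the status and warning are emitted first.
		result = resultLimitedByMaxNodesTotal
		shouldScaleUp = false
	case allPodsAreNew:
		result = resultInCooldown
		shouldScaleUp = false
	}

	if shouldScaleUp || (unschedulablePods > 0 && forceScaleUp) {
		// In the real code this is where the orchestrator's ScaleUp runs
		// and overwrites scaleUpStatus with its own result.
		result = resultAttempted
	}
	return result
}

func main() {
	// At the node limit without a forced scale-up: the new status surfaces.
	fmt.Println(decideScaleUp(5, 100, 100, false, false))
	// At the node limit with a forced scale-up: the attempt still happens,
	// but the limit was recorded (and logged) first.
	fmt.Println(decideScaleUp(5, 100, 100, false, true))
}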