Improve events when max total nodes of the cluster is reached.

- Log a cluster-wide event. The previous event would never get fired because the estimators would already cap the options they generate; additionally, it would fire only once, and events are kept only for some time.
- Log a per-pod event explaining why the scale-up is not triggered (previously the pod would either get a "no scale-up" event because of no matching node group, or it would not get an event at all).

This required adding the list of pods that were unschedulable to the scale-up status in the case where the max total nodes limit was reached.
kubernetes · Feb 12, 2025 · b8db30c · b8db30c
1 parent cf115af
commit b8db30c
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 3 deletions.
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
@@ -528,7 +528,18 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
 		scaleUpStatus.Result = status.ScaleUpLimitedByMaxNodesTotal
 		klog.Warningf("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", a.MaxNodesTotal, len(readyNodes))
+		autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "MaxNodesTotalReached",
+			"Max total nodes in cluster reached: %v", autoscalingContext.MaxNodesTotal)
 		shouldScaleUp = false
+
+		noScaleUpInfoForPods := []status.NoScaleUpInfo{}
+		for _, pod := range unschedulablePodsToHelp {
+			noScaleUpInfo := status.NoScaleUpInfo{
+				Pod: pod,
+			}
+			noScaleUpInfoForPods = append(noScaleUpInfoForPods, noScaleUpInfo)
+		}
+		scaleUpStatus.PodsRemainUnschedulable = noScaleUpInfoForPods
 	} else if len(a.BypassedSchedulers) == 0 && allPodsAreNew(unschedulablePodsToHelp, currentTime) {
 		// The assumption here is that these pods have been created very recently and probably there
 		// is more pods to come. In theory we could check the newest pod time but then if pod were created

diff --git a/cluster-autoscaler/processors/status/eventing_scale_up_processor.go b/cluster-autoscaler/processors/status/eventing_scale_up_processor.go
@@ -41,7 +41,7 @@ func (p *EventingScaleUpStatusProcessor) Process(context *context.AutoscalingCon
 		for _, noScaleUpInfo := range status.PodsRemainUnschedulable {
 			context.Recorder.Event(noScaleUpInfo.Pod, apiv1.EventTypeNormal, "NotTriggerScaleUp",
 				fmt.Sprintf("pod didn't trigger scale-up: %s",
-					ReasonsMessage(noScaleUpInfo, consideredNodeGroupsMap)))
+					ReasonsMessage(status.Result, noScaleUpInfo, consideredNodeGroupsMap)))
 		}
 	} else {
 		klog.V(4).Infof("Skipping event processing for unschedulable pods since there is a" +
@@ -60,7 +60,11 @@ func (p *EventingScaleUpStatusProcessor) CleanUp() {
 }
 
 // ReasonsMessage aggregates reasons from NoScaleUpInfos.
-func ReasonsMessage(noScaleUpInfo NoScaleUpInfo, consideredNodeGroups map[string]cloudprovider.NodeGroup) string {
+func ReasonsMessage(scaleUpStatus ScaleUpResult, noScaleUpInfo NoScaleUpInfo, consideredNodeGroups map[string]cloudprovider.NodeGroup) string {
+	if scaleUpStatus == ScaleUpLimitedByMaxNodesTotal {
+		return "max total nodes in cluster reached"
+	}
+
 	messages := []string{}
 	aggregated := map[string]int{}
 	for nodeGroupId, reasons := range noScaleUpInfo.RejectedNodeGroups {

diff --git a/cluster-autoscaler/processors/status/eventing_scale_up_processor_test.go b/cluster-autoscaler/processors/status/eventing_scale_up_processor_test.go
@@ -101,6 +101,21 @@ func TestEventingScaleUpStatusProcessor(t *testing.T) {
 			expectedTriggered:   0,
 			expectedNoTriggered: 0,
 		},
+		{
+			caseName: "No scale up; max total nodes in cluster reached",
+			state: &ScaleUpStatus{
+				Result:               ScaleUpLimitedByMaxNodesTotal,
+				ScaleUpInfos:         []nodegroupset.ScaleUpInfo{{}},
+				PodsTriggeredScaleUp: []*apiv1.Pod{},
+				PodsRemainUnschedulable: []NoScaleUpInfo{
+					{Pod: p1},
+					{Pod: p2},
+					{Pod: p3},
+				},
+			},
+			expectedTriggered:   0,
+			expectedNoTriggered: 3,
+		},
 	}
 
 	for _, tc := range testCases {
@@ -166,9 +181,18 @@ func TestReasonsMessage(t *testing.T) {
 		"2 max limit reached",
 		"1 not ready",
 	}
-	result := ReasonsMessage(NoScaleUpInfo{nil, rejected, skipped}, considered)
+	result := ReasonsMessage(ScaleUpNoOptionsAvailable, NoScaleUpInfo{nil, rejected, skipped}, considered)
 
 	for _, part := range expected {
 		assert.Contains(t, result, part)
 	}
 }
+
+func TestReasonsMessageWhenScaleUpLimitedByMaxNodesTotal(t *testing.T) {
+	considered := map[string]cloudprovider.NodeGroup{}
+	noScaleUpInfo := NoScaleUpInfo{
+		Pod: nil,
+	}
+	result := ReasonsMessage(ScaleUpLimitedByMaxNodesTotal, noScaleUpInfo, considered)
+	assert.Contains(t, result, "max total nodes in cluster reached")
+}