Improve logging when the cluster reaches max nodes total.
- add an autoscaling status to reflect that
- change the log severity to warning, since reaching the limit means the
  autoscaler will not be fully functional (in particular, scaling up will not
  work); see the klog sketch after the change summary below
- fix the scale up enforcer logic so it no longer skips the max-nodes-reached
  logging point
jbtk committed Jan 29, 2025
1 parent 29b611d commit 86ee2b7
Showing 2 changed files with 13 additions and 6 deletions.
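A minimal, runnable sketch of the severity change described in the commit message, assuming the standard k8s.io/klog/v2 API; the log message mirrors the one in the diff below, while the flag setup, file name, and values are illustrative only.

// klog_severity_sketch.go (hypothetical file name)
package main

import (
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	flag.Parse()
	defer klog.Flush()

	maxNodesTotal, readyNodes := 100, 100

	// Before this commit: the message was V(1) info, so it is only visible
	// when the autoscaler runs with -v=1 or higher.
	klog.V(1).Infof("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", maxNodesTotal, readyNodes)

	// After this commit: the message is a warning, emitted regardless of the -v
	// level, which matches the fact that scale up is no longer fully functional.
	klog.Warningf("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", maxNodesTotal, readyNodes)
}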
cluster-autoscaler/core/static_autoscaler.go (17 changes: 11 additions & 6 deletions)
@@ -519,15 +519,17 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
         return false, nil
     }

-    forceScaleUp := a.processors.ScaleUpEnforcer.ShouldForceScaleUp(unschedulablePodsToHelp)
+    shouldScaleUp := true

     if len(unschedulablePodsToHelp) == 0 {
         scaleUpStatus.Result = status.ScaleUpNotNeeded
         klog.V(1).Info("No unschedulable pods")
-    } else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal && !forceScaleUp {
-        scaleUpStatus.Result = status.ScaleUpNoOptionsAvailable
-        klog.V(1).Infof("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", a.MaxNodesTotal, len(readyNodes))
-    } else if len(a.BypassedSchedulers) == 0 && !forceScaleUp && allPodsAreNew(unschedulablePodsToHelp, currentTime) {
+        shouldScaleUp = false
+    } else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
+        scaleUpStatus.Result = status.ScaleUpLimitedByMaxNodesTotal
+        klog.Warningf("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", a.MaxNodesTotal, len(readyNodes))
+        shouldScaleUp = false
+    } else if len(a.BypassedSchedulers) == 0 && allPodsAreNew(unschedulablePodsToHelp, currentTime) {
         // The assumption here is that these pods have been created very recently and probably there
         // is more pods to come. In theory we could check the newest pod time but then if pod were created
         // slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
@@ -537,7 +539,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
         a.processorCallbacks.DisableScaleDownForLoop()
         scaleUpStatus.Result = status.ScaleUpInCooldown
         klog.V(1).Info("Unschedulable pods are very new, waiting one iteration for more")
-    } else {
+        shouldScaleUp = false
+    }
+
+    if shouldScaleUp || a.processors.ScaleUpEnforcer.ShouldForceScaleUp(unschedulablePodsToHelp) {
         scaleUpStart := preScaleUp()
         scaleUpStatus, typedErr = a.scaleUpOrchestrator.ScaleUp(unschedulablePodsToHelp, readyNodes, daemonsets, nodeInfosForGroups, false)
         if exit, err := postScaleUp(scaleUpStart); exit {
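The refactor above replaces an if/else chain that folded the force-scale-up override into individual conditions with a single shouldScaleUp flag checked at the end. A simplified, self-contained sketch of that pattern, using hypothetical types and plain log output rather than the real autoscaler interfaces:

// gating_sketch.go (hypothetical, not the autoscaler's real code)
package main

import "log"

type scaleUpResult string

const (
	scaleUpNotNeeded              scaleUpResult = "NotNeeded"
	scaleUpLimitedByMaxNodesTotal scaleUpResult = "LimitedByMaxNodesTotal"
	scaleUpInCooldown             scaleUpResult = "InCooldown"
	scaleUpAttempted              scaleUpResult = "Attempted"
)

func decideScaleUp(unschedulablePods, readyNodes, maxNodesTotal int, allPodsNew, forceScaleUp bool) scaleUpResult {
	result := scaleUpAttempted
	shouldScaleUp := true

	if unschedulablePods == 0 {
		result = scaleUpNotNeeded
		shouldScaleUp = false
	} else if maxNodesTotal > 0 && readyNodes >= maxNodesTotal {
		// The logging/status point is no longer skipped when a forced
		// scale-up is requested; the override only applies below.
		result = scaleUpLimitedByMaxNodesTotal
		log.Printf("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", maxNodesTotal, readyNodes)
		shouldScaleUp = false
	} else if allPodsNew {
		result = scaleUpInCooldown
		shouldScaleUp = false
	}

	if shouldScaleUp || forceScaleUp {
		result = scaleUpAttempted
	}
	return result
}

func main() {
	// Cluster at its node cap, but a forced scale-up is requested: the
	// warning above is still emitted before the attempt goes ahead.
	log.Println(decideScaleUp(5, 10, 10, false, true))
}

The key behavioural difference from the old code is that ShouldForceScaleUp no longer suppresses the max-nodes branch, so the warning and the ScaleUpLimitedByMaxNodesTotal status are recorded even when a forced scale-up proceeds.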
Second changed file (2 changes: 2 additions & 0 deletions)
@@ -66,6 +66,8 @@ const (
     ScaleUpNotTried
     // ScaleUpInCooldown - the scale up wasn't even attempted, because it's in a cooldown state (it's suspended for a scheduled period of time).
     ScaleUpInCooldown
+    // ScaleUpLimitedByMaxNodesTotal - the scale up wasn't attempted, because the cluster reached max nodes total
+    ScaleUpLimitedByMaxNodesTotal
 )

 // WasSuccessful returns true if the scale-up was successful.
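For context on the new status value, a minimal sketch of the enum-plus-predicate pattern used here; the names mirror the ones in the diff, but the body of WasSuccessful is an assumption based on its doc comment, not a copy of the repository code.

// scale_up_result_sketch.go (hypothetical, simplified)
package main

import "fmt"

type ScaleUpResult int

const (
	ScaleUpSuccessful ScaleUpResult = iota
	ScaleUpNotNeeded
	ScaleUpInCooldown
	ScaleUpLimitedByMaxNodesTotal
)

type ScaleUpStatus struct {
	Result ScaleUpResult
}

// WasSuccessful matches only the explicit success value, so a newly added
// "not attempted" result such as ScaleUpLimitedByMaxNodesTotal is reported
// as unsuccessful without any further changes.
func (s *ScaleUpStatus) WasSuccessful() bool {
	return s.Result == ScaleUpSuccessful
}

func main() {
	s := &ScaleUpStatus{Result: ScaleUpLimitedByMaxNodesTotal}
	fmt.Println(s.WasSuccessful()) // false
}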
