Skip to content

Commit 38f4e72

Browse files
committed
fix: Set MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio to 0.11 and add cap of 5 non-ready nodes at once
1 parent 18c35ba commit 38f4e72

File tree

2 files changed

+32
-13
lines changed

2 files changed

+32
-13
lines changed

main.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ import (
2020
)
2121

2222
const (
23-
MaximumFailedExecutionBeforePanic = 10 // Maximum number of allowed failed executions before panicking
24-
MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio = 0.15 // To help with larger clusters
23+
MaximumFailedExecutionBeforePanic = 10 // Maximum number of allowed failed executions before panicking
24+
25+
MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio = 0.11 // To help with larger clusters
26+
MaximumNumberOfUpdatedNonReadyNodes = 5 // To prevent too many non-ready nodes from being taken into account when calculating resources available in one node
2527
)
2628

2729
var (
@@ -143,7 +145,7 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
143145
log.Printf("[%s] Skipping because ASG has a desired capacity of %d, but only has %d instances", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.Int64Value(autoScalingGroup.DesiredCapacity), len(autoScalingGroup.Instances))
144146
continue
145147
}
146-
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(numberOfNonReadyUpdatedNodesOrInstances, len(updatedReadyNodes)) {
148+
if !HasAcceptableNumberOfUpdatedNonReadyNodes(numberOfNonReadyUpdatedNodesOrInstances, len(updatedReadyNodes)) {
147149
log.Printf("[%s] ASG has too many non-ready updated nodes/instances (%d), waiting until they become ready", aws.StringValue(autoScalingGroup.AutoScalingGroupName), numberOfNonReadyUpdatedNodesOrInstances)
148150
continue
149151
}
@@ -539,12 +541,20 @@ func compareLaunchTemplateVersions(targetTemplate *ec2.LaunchTemplate, lt1, lt2
539541
return lt1version == lt2version
540542
}
541543

542-
func HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(numberOfUpdatedNonReadyNodes, numberOfUpdatedReadyNodes int) bool {
544+
// HasAcceptableNumberOfUpdatedNonReadyNodes checks if there's a sufficient amount of updated
545+
// and ready nodes to move on to the next step (drain & terminate an outdated node) for a number of non-ready nodes.
546+
//
547+
// The logic behind this is that the more nodes are ready and updated, the higher the confidence we have that the
548+
// upgrade is going well, so we can ramp things up faster the deeper we are in the upgrade process.
549+
func HasAcceptableNumberOfUpdatedNonReadyNodes(numberOfUpdatedNonReadyNodes, numberOfUpdatedReadyNodes int) bool {
543550
if numberOfUpdatedNonReadyNodes == 0 {
544551
return true // all updated nodes are ready, so we can proceed
545552
}
546553
if numberOfUpdatedReadyNodes == 0 {
547554
return false // there are no ready nodes AND there are non-ready nodes (we know this because of the previous check), so we cannot proceed
548555
}
556+
if numberOfUpdatedNonReadyNodes > MaximumNumberOfUpdatedNonReadyNodes {
557+
return false // there are too many non-ready nodes, so we cannot proceed
558+
}
549559
return float64(numberOfUpdatedNonReadyNodes)/float64(numberOfUpdatedReadyNodes) <= MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio
550560
}

main_test.go

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -646,28 +646,37 @@ func TestHandleRollingUpgrade_withMixedInstancePolicyWhenOneOfTheInstanceTypesOv
646646
}
647647
}
648648

649-
func TestHasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(t *testing.T) {
649+
func TestHasAcceptableNumberOfUpdatedNonReadyNodes(t *testing.T) {
650650
// false: there are too many non-ready nodes
651651
// true: there is an acceptable number of non-ready nodes given how many ready nodes there are
652-
if HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(100, 0) {
652+
if HasAcceptableNumberOfUpdatedNonReadyNodes(100, 0) {
653653
t.Error("100NR/0R ready should not be acceptable")
654654
}
655-
if HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(50, 50) {
655+
if HasAcceptableNumberOfUpdatedNonReadyNodes(50, 50) {
656656
t.Error("50NR/50R should not be acceptable")
657657
}
658-
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(5, 95) {
659-
t.Error("5NR/95R should be acceptable")
658+
if HasAcceptableNumberOfUpdatedNonReadyNodes(6, 10000) {
659+
t.Error("6NR/10000R should not be acceptable, because MaximumNumberOfUpdatedNonReadyNodes is set to", MaximumNumberOfUpdatedNonReadyNodes)
660660
}
661-
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(1, 99) {
661+
if !HasAcceptableNumberOfUpdatedNonReadyNodes(5, 10000) {
662+
t.Error("5NR/10000R should be acceptable")
663+
}
664+
if !HasAcceptableNumberOfUpdatedNonReadyNodes(4, 100) {
665+
t.Error("4NR/100R should be acceptable")
666+
}
667+
if !HasAcceptableNumberOfUpdatedNonReadyNodes(1, 99) {
662668
t.Error("1NR/99R should be acceptable")
663669
}
664-
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(0, 100) {
670+
if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 100) {
665671
t.Error("0NR/100R should be acceptable")
666672
}
667-
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(0, 1) {
673+
if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 1) {
668674
t.Error("0NR/1R should be acceptable")
669675
}
670-
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(0, 0) {
676+
if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 0) {
671677
t.Error("0NR/0R should be acceptable")
672678
}
679+
if !HasAcceptableNumberOfUpdatedNonReadyNodes(1, 11) {
680+
t.Error("1NR/11R should be acceptable")
681+
}
673682
}

0 commit comments

Comments
 (0)