
Commit 20d6f11

fix: Always use the updated ASG definition when incrementing its desired capacity (#167)
Fixes #129
1 parent 3b8647b commit 20d6f11

File tree

4 files changed: +39 −25 lines
cloud/aws.go

Lines changed: 19 additions & 5 deletions

@@ -112,18 +112,32 @@ func DescribeLaunchTemplate(svc ec2iface.EC2API, input *ec2.DescribeLaunchTempla
 	return templatesOutput.LaunchTemplates[0], nil
 }
 
-func SetAutoScalingGroupDesiredCount(svc autoscalingiface.AutoScalingAPI, asg *autoscaling.Group, count int64) error {
-	if count > aws.Int64Value(asg.MaxSize) {
+// IncrementAutoScalingGroupDesiredCount retrieves the latest definition of the ASG and increments its current
+// desired capacity by 1. The reason why we retrieve the ASG again even though we already have it is to avoid a
+// scenario in which the ASG had already been scaled up or down since the last time it was retrieved.
+// See https://github.com/TwiN/aws-eks-asg-rolling-update-handler/issues/129 for more information.
+func IncrementAutoScalingGroupDesiredCount(svc autoscalingiface.AutoScalingAPI, autoScalingGroupName string) error {
+	latestASGs, err := DescribeAutoScalingGroupsByNames(svc, []string{autoScalingGroupName})
+	if err != nil {
+		return fmt.Errorf("failed to retrieve latest asg with name '%s': %w", autoScalingGroupName, err)
+	}
+	if len(latestASGs) != 1 {
+		// ASG names are unique per region and account, so if there isn't exactly one ASG, there's a problem.
+		return errors.New("failed to retrieve latest asg with name: " + autoScalingGroupName)
+	}
+	asg := latestASGs[0]
+	newDesiredCapacity := aws.Int64Value(asg.DesiredCapacity) + 1
+	if newDesiredCapacity > aws.Int64Value(asg.MaxSize) {
 		return ErrCannotIncreaseDesiredCountAboveMax
 	}
 	desiredInput := &autoscaling.SetDesiredCapacityInput{
 		AutoScalingGroupName: asg.AutoScalingGroupName,
-		DesiredCapacity:      aws.Int64(count),
+		DesiredCapacity:      aws.Int64(newDesiredCapacity),
 		HonorCooldown:        aws.Bool(true),
 	}
-	_, err := svc.SetDesiredCapacity(desiredInput)
+	_, err = svc.SetDesiredCapacity(desiredInput)
 	if err != nil {
-		return fmt.Errorf("unable to increase ASG %s desired count to %d: %v", aws.StringValue(asg.AutoScalingGroupName), count, err)
+		return fmt.Errorf("unable to increase ASG %s desired count to %d: %w", autoScalingGroupName, newDesiredCapacity, err)
 	}
 	return nil
 }
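
Note on the new helper: because IncrementAutoScalingGroupDesiredCount now takes only the ASG name and re-describes the group itself, callers can no longer feed it a desired capacity computed from a stale *autoscaling.Group. The sketch below shows one way the new behaviour could be exercised with a stub client; the stub type, the test name, and the assumption that DescribeAutoScalingGroupsByNames resolves the group through either DescribeAutoScalingGroups or DescribeAutoScalingGroupsPages are illustrative and are not part of this commit.

package cloud

import (
	"testing"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/autoscaling"
	"github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface"
)

// stubAutoScaling is a hypothetical test double. Embedding the interface keeps the
// stub compiling against the full AutoScalingAPI while only the calls exercised
// here are overridden.
type stubAutoScaling struct {
	autoscalingiface.AutoScalingAPI
	group   *autoscaling.Group
	lastSet *autoscaling.SetDesiredCapacityInput
}

func (s *stubAutoScaling) DescribeAutoScalingGroups(_ *autoscaling.DescribeAutoScalingGroupsInput) (*autoscaling.DescribeAutoScalingGroupsOutput, error) {
	return &autoscaling.DescribeAutoScalingGroupsOutput{AutoScalingGroups: []*autoscaling.Group{s.group}}, nil
}

func (s *stubAutoScaling) DescribeAutoScalingGroupsPages(_ *autoscaling.DescribeAutoScalingGroupsInput, fn func(*autoscaling.DescribeAutoScalingGroupsOutput, bool) bool) error {
	fn(&autoscaling.DescribeAutoScalingGroupsOutput{AutoScalingGroups: []*autoscaling.Group{s.group}}, true)
	return nil
}

func (s *stubAutoScaling) SetDesiredCapacity(in *autoscaling.SetDesiredCapacityInput) (*autoscaling.SetDesiredCapacityOutput, error) {
	s.lastSet = in
	return &autoscaling.SetDesiredCapacityOutput{}, nil
}

func TestIncrementAutoScalingGroupDesiredCountSketch(t *testing.T) {
	stub := &stubAutoScaling{group: &autoscaling.Group{
		AutoScalingGroupName: aws.String("my-asg"),
		DesiredCapacity:      aws.Int64(3), // pretend the ASG was scaled since the handler last looked at it
		MaxSize:              aws.Int64(10),
	}}
	if err := IncrementAutoScalingGroupDesiredCount(stub, "my-asg"); err != nil {
		t.Fatal(err)
	}
	if got := aws.Int64Value(stub.lastSet.DesiredCapacity); got != 4 {
		t.Errorf("expected desired capacity 4, got %d", got)
	}
}

Passing the name instead of a previously fetched group is what closes the race described in issue #129: the increment is always computed from the desired capacity AWS reports at call time, not from whatever the handler cached at the start of its reconciliation pass.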

k8s/util.go

Lines changed: 4 additions & 4 deletions

@@ -7,16 +7,16 @@ import (
 	v1 "k8s.io/api/core/v1"
 )
 
-// CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes calculates the resources available in the target nodes
+// CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode calculates the resources available in the target nodes
 // and compares them with the resources that would be required if the old node were to be drained
 //
-// This is not fool proof: 2 targetNodes with 1G available in each would cause the assumption that you can fit
+// This is not foolproof: 2 targetNodes with 1G available in each would cause the assumption that you can fit
 // a 2G pod in the targetNodes when you obviously can't (you'd need 1 node with 2G available, not 2 with 1G)
 // That's alright, because the purpose is to provide a smooth rolling upgrade, not a flawless experience, and
 // while the latter is definitely possible, it would slow down the process by quite a bit. In a way, this is
 // the beauty of co-existing with the cluster autoscaler; an extra node will be spun up to handle the leftovers,
 // if any.
-func CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(client ClientAPI, oldNode *v1.Node, targetNodes []*v1.Node) bool {
+func CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(client ClientAPI, oldNode *v1.Node, targetNodes []*v1.Node) bool {
 	totalAvailableTargetCPU := int64(0)
 	totalAvailableTargetMemory := int64(0)
 	// Get resources available in target nodes
@@ -104,7 +104,7 @@ func AnnotateNodeByAutoScalingInstance(client ClientAPI, instance *autoscaling.I
 	return nil
 }
 
-// Label Node adds an Label to the Kubernetes node represented by a given AWS instance
+// LabelNodeByAutoScalingInstance adds a Label to the Kubernetes node represented by a given AWS instance
 func LabelNodeByAutoScalingInstance(client ClientAPI, instance *autoscaling.Instance, key, value string) error {
 	node, err := client.GetNodeByAutoScalingInstance(instance)
 	if err != nil {
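
For context on the renamed check: the doc comment above describes an aggregate heuristic that sums spare capacity across all target nodes and compares the total with the requests that would move off the old node. The fragment below is a rough illustration of the per-node part of that sum; it is not code from this repository, and the package name and helper spareMilliCPU are made up for the example.

package sketch

import (
	v1 "k8s.io/api/core/v1"
)

// spareMilliCPU returns allocatable CPU minus the CPU requests of the pods already
// scheduled on the node, in millicores.
func spareMilliCPU(node v1.Node, podsOnNode []v1.Pod) int64 {
	spare := node.Status.Allocatable.Cpu().MilliValue()
	for _, pod := range podsOnNode {
		for _, container := range pod.Spec.Containers {
			spare -= container.Resources.Requests.Cpu().MilliValue()
		}
	}
	return spare
}

Summing that value (and its memory counterpart) over the target nodes is what makes the check not foolproof: two nodes with 1000m spare each add up to 2000m even though neither could host a single 2000m pod, which is exactly the limitation the comment acknowledges and leaves to the cluster autoscaler to absorb.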

k8s/util_test.go

Lines changed: 14 additions & 14 deletions

@@ -7,7 +7,7 @@ import (
 	"k8s.io/api/core/v1"
 )
 
-func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(t *testing.T) {
+func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(t *testing.T) {
 	// allocatable cpu & memory aren't used for the old node.
 	// They're only used by the target nodes (newNode, in this case) to calculate if the leftover resources from moving
 	// the pods from the old node to the new node are positive (if the leftover is negative, it means there's not enough
@@ -17,7 +17,7 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(t *testing.T) {
 	oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "100m", "100Mi", false, v1.PodRunning)
 	mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodePod})
 
-	hasEnoughResources := CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(mockClient, &oldNode, []*v1.Node{&newNode})
+	hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode})
 	if !hasEnoughResources {
 		t.Error("should've had enough space in node")
 	}
@@ -26,14 +26,14 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(t *testing.T) {
 	}
 }
 
-func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_whenNotEnoughSpaceInNewNodes(t *testing.T) {
+func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_whenNotEnoughSpaceInNewNodes(t *testing.T) {
 	oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m")
 	newNode := k8stest.CreateTestNode("new-node-1", "us-west-2c", "i-0b22d79604221412c", "1000m", "1000Mi")
 	oldNodePod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "200m", "200Mi", false, v1.PodRunning)
 	newNodePod := k8stest.CreateTestPod("new-pod-1", newNode.Name, "900m", "200Mi", false, v1.PodRunning)
 	mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodePod, newNodePod})
 
-	hasEnoughResources := CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(mockClient, &oldNode, []*v1.Node{&newNode})
+	hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode})
 	if hasEnoughResources {
 		t.Error("shouldn't have had enough space in node")
 	}
@@ -42,7 +42,7 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_whenNotEnoughSpac
 	}
 }
 
-func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withMultiplePods(t *testing.T) {
+func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withMultiplePods(t *testing.T) {
 	oldNode := k8stest.CreateTestNode("old-node", "us-west-2c", "i-0b22d79604221412c", "0m", "0m")
 	newNode := k8stest.CreateTestNode("new-node-1", "us-west-2b", "i-07550830aef9e4179", "1000m", "1000Mi")
 	oldNodeFirstPod := k8stest.CreateTestPod("old-pod-1", oldNode.Name, "300m", "0", false, v1.PodRunning)
@@ -51,7 +51,7 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withMultiplePods(
 	newNodePod := k8stest.CreateTestPod("new-pod-1", newNode.Name, "200m", "200Mi", false, v1.PodRunning)
 	mockClient := k8stest.NewMockClient([]v1.Node{oldNode, newNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod, newNodePod})
 
-	hasEnoughResources := CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(mockClient, &oldNode, []*v1.Node{&newNode})
+	hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&newNode})
 	if hasEnoughResources {
 		t.Error("shouldn't have had enough space in node")
 	}
@@ -60,7 +60,7 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withMultiplePods(
 	}
 }
 
-func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withMultipleTargetNodes(t *testing.T) {
+func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withMultipleTargetNodes(t *testing.T) {
 	oldNode := k8stest.CreateTestNode("old-node", "us-west-2b", "i-07550830aef9e4179", "0m", "0m")
 	firstNewNode := k8stest.CreateTestNode("new-node-1", "us-west-2a", "i-034fa1dfbfd35f8bb", "1000m", "1000Mi")
 	secondNewNode := k8stest.CreateTestNode("new-node-2", "us-west-2b", "i-0918aff89347cef0c", "1000m", "1000Mi")
@@ -69,7 +69,7 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withMultipleTarge
 	oldNodeThirdPod := k8stest.CreateTestPod("old-node-pod-3", oldNode.Name, "500m", "0", false, v1.PodRunning)
 	mockClient := k8stest.NewMockClient([]v1.Node{oldNode, firstNewNode, secondNewNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod})
 
-	hasEnoughResources := CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(mockClient, &oldNode, []*v1.Node{&firstNewNode, &secondNewNode})
+	hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&firstNewNode, &secondNewNode})
 	if !hasEnoughResources {
 		t.Error("should've had enough space in node")
 	}
@@ -78,7 +78,7 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withMultipleTarge
 	}
 }
 
-func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withPodsSpreadAcrossMultipleTargetNodes(t *testing.T) {
+func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withPodsSpreadAcrossMultipleTargetNodes(t *testing.T) {
 	oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m")
 	firstNewNode := k8stest.CreateTestNode("new-node-1", "us-west-2a", "i-07550830aef9e4179", "1000m", "1000Mi")
 	secondNewNode := k8stest.CreateTestNode("new-node-2", "us-west-2a", "i-0147ad0816c210dae", "1000m", "1000Mi")
@@ -89,7 +89,7 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withPodsSpreadAcr
 	oldNodeThirdPod := k8stest.CreateTestPod("old-node-pod-3", oldNode.Name, "0", "500Mi", false, v1.PodRunning)
 	mockClient := k8stest.NewMockClient([]v1.Node{oldNode, firstNewNode, secondNewNode}, []v1.Pod{oldNodeFirstPod, oldNodeSecondPod, oldNodeThirdPod, firstNewNodePod, secondNewNodePod})
 
-	hasEnoughResources := CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(mockClient, &oldNode, []*v1.Node{&firstNewNode, &secondNewNode})
+	hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{&firstNewNode, &secondNewNode})
 	if hasEnoughResources {
 		t.Error("shouldn't have had enough space in node")
 	}
@@ -98,23 +98,23 @@ func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withPodsSpreadAcr
 	}
 }
 
-func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withNoTargetNodes(t *testing.T) {
+func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withNoTargetNodes(t *testing.T) {
 	oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m")
 	oldNodePod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "500Mi", "500Mi", false, v1.PodRunning)
 	mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod})
 
-	hasEnoughResources := CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(mockClient, &oldNode, []*v1.Node{})
+	hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{})
 	if hasEnoughResources {
 		t.Error("there's no target nodes; there definitely shouldn't have been enough space")
 	}
 }
 
-func TestCheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes_withNoTargetNodesButOldNodeOnlyHasPodsFromDaemonSets(t *testing.T) {
+func TestCheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode_withNoTargetNodesButOldNodeOnlyHasPodsFromDaemonSets(t *testing.T) {
 	oldNode := k8stest.CreateTestNode("old-node", "us-west-2a", "i-034fa1dfbfd35f8bb", "0m", "0m")
 	oldNodePod := k8stest.CreateTestPod("old-node-pod-1", oldNode.Name, "500Mi", "500Mi", true, v1.PodRunning)
 	mockClient := k8stest.NewMockClient([]v1.Node{oldNode}, []v1.Pod{oldNodePod})
 
-	hasEnoughResources := CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(mockClient, &oldNode, []*v1.Node{})
+	hasEnoughResources := CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(mockClient, &oldNode, []*v1.Node{})
 	if !hasEnoughResources {
 		t.Error("there's no target nodes, but the only pods in the old node are from daemon sets")
 	}

main.go

Lines changed: 2 additions & 2 deletions

@@ -183,7 +183,7 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
 		} else {
 			log.Printf("[%s][%s] Node already started rollout process", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
 			// check if existing updatedInstances have the capacity to support what's inside this node
-			hasEnoughResources := k8s.CheckIfNodeHasEnoughResourcesToTransferAllPodsInNodes(client, node, updatedReadyNodes)
+			hasEnoughResources := k8s.CheckIfUpdatedNodesHaveEnoughResourcesToScheduleAllPodsFromOldNode(client, node, updatedReadyNodes)
 			if hasEnoughResources {
 				log.Printf("[%s][%s] Updated nodes have enough resources available", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
 				if minutesSinceDrained == -1 {
@@ -242,7 +242,7 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
 				continue
 			}
 			log.Printf("[%s][%s] Updated nodes do not have enough resources available, increasing desired count by 1", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
-			err := cloud.SetAutoScalingGroupDesiredCount(autoScalingService, autoScalingGroup, aws.Int64Value(autoScalingGroup.DesiredCapacity)+1)
+			err := cloud.IncrementAutoScalingGroupDesiredCount(autoScalingService, aws.StringValue(autoScalingGroup.AutoScalingGroupName))
 			if err != nil {
				log.Printf("[%s][%s] Unable to increase ASG desired size: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error())
				log.Printf("[%s][%s] Skipping", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))

0 commit comments
