Skip to content

Commit 7e314b3

Browse files
authored
feat: Add enhanced node repair configuration support (#8512)
* feat: Add enhanced node repair configuration support This commit implements comprehensive enhanced node repair configuration for EKS managed nodegroups with the following features: - Support for percentage and count-based unhealthy node thresholds - Configurable parallel repair limits (percentage and count) - Advanced node repair config overrides for specific conditions - Full CLI flag support for all new parameters - Complete YAML configuration file support - Backward compatibility with existing configurations Key changes: - Extended API types with new NodeRepairConfigOverride struct - Added CLI flags for all new parameters - Updated CloudFormation builder for AWS EKS integration - Comprehensive unit and integration tests - Updated documentation and examples - Enhanced JSON schema validation CLI Examples: eksctl create cluster --enable-node-repair --node-repair-max-unhealthy-percentage=25 eksctl create nodegroup --enable-node-repair --node-repair-max-parallel-count=2 Config Examples: nodeRepairConfig: enabled: true maxUnhealthyNodeThresholdPercentage: 20 maxParallelNodesRepairedCount: 2 nodeRepairConfigOverrides: - nodeMonitoringCondition: NetworkNotReady nodeUnhealthyReason: InterfaceNotUp repairAction: Restart minRepairWaitTimeMins: 15 * bump eks version * update user doc for configurable node repair * update user docs
1 parent a08d13d commit 7e314b3

File tree

14 files changed

+1449
-19
lines changed

14 files changed

+1449
-19
lines changed

examples/44-node-repair.yaml

Lines changed: 76 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# An example ClusterConfig that uses a managed node group with auto repair.
1+
# An example ClusterConfig that demonstrates node repair configuration
2+
# for EKS managed nodegroups with various configuration options.
23

34
apiVersion: eksctl.io/v1alpha5
45
kind: ClusterConfig
@@ -8,6 +9,77 @@ metadata:
89
region: us-west-2
910

1011
managedNodeGroups:
11-
- name: ng-1
12-
nodeRepairConfig:
13-
enabled: true
12+
# Example 1: Basic node repair
13+
- name: basic-repair-ng
14+
instanceType: m5.large
15+
desiredCapacity: 3
16+
nodeRepairConfig:
17+
enabled: true
18+
19+
# Example 2: Node repair with percentage-based thresholds
20+
- name: percentage-repair-ng
21+
instanceType: m5.large
22+
desiredCapacity: 3
23+
minSize: 1
24+
maxSize: 5
25+
nodeRepairConfig:
26+
enabled: true
27+
# Stop repair actions when more than 20% of nodes are unhealthy
28+
maxUnhealthyNodeThresholdPercentage: 20
29+
# Repair at most 15% of unhealthy nodes in parallel
30+
maxParallelNodesRepairedPercentage: 15
31+
32+
# Example 3: Node repair with count-based thresholds
33+
- name: count-repair-ng
34+
instanceType: m5.xlarge
35+
desiredCapacity: 10
36+
minSize: 5
37+
maxSize: 20
38+
nodeRepairConfig:
39+
enabled: true
40+
# Stop repair actions when more than 3 nodes are unhealthy
41+
maxUnhealthyNodeThresholdCount: 3
42+
# Repair at most 2 unhealthy nodes in parallel
43+
maxParallelNodesRepairedCount: 2
44+
45+
# Example 4: GPU workload with custom repair overrides
46+
- name: gpu-repair-ng
47+
instanceType: g4dn.xlarge
48+
desiredCapacity: 4
49+
minSize: 2
50+
maxSize: 8
51+
nodeRepairConfig:
52+
enabled: true
53+
maxUnhealthyNodeThresholdPercentage: 25
54+
maxParallelNodesRepairedCount: 1
55+
# Custom repair behavior for specific failure scenarios
56+
nodeRepairConfigOverrides:
57+
# Terminate nodes with GPU-related failures after a 5-minute wait
58+
- nodeMonitoringCondition: "AcceleratedInstanceNotReady"
59+
nodeUnhealthyReason: "NvidiaXID13Error"
60+
minRepairWaitTimeMins: 5
61+
repairAction: "Terminate"
62+
# Handle network issues with restart first
63+
- nodeMonitoringCondition: "NetworkNotReady"
64+
nodeUnhealthyReason: "InterfaceNotUp"
65+
minRepairWaitTimeMins: 15
66+
repairAction: "Restart"
67+
68+
# Example 5: Conservative repair for critical workloads
69+
- name: critical-repair-ng
70+
instanceType: c5.2xlarge
71+
desiredCapacity: 6
72+
minSize: 3
73+
maxSize: 12
74+
nodeRepairConfig:
75+
enabled: true
76+
# Conservative settings - stop repair when more than 10% of nodes are unhealthy
77+
maxUnhealthyNodeThresholdPercentage: 10
78+
# Repair only 1 node at a time
79+
maxParallelNodesRepairedCount: 1
80+
nodeRepairConfigOverrides:
81+
# Wait longer before taking action on critical workloads
82+
- nodeMonitoringCondition: "NetworkNotReady"
83+
nodeUnhealthyReason: "InterfaceNotUp"
84+
minRepairWaitTimeMins: 45
85+
repairAction: "Restart"
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
//go:build integration
2+
// +build integration
3+
4+
package enhancednoderepair
5+
6+
import (
7+
"fmt"
8+
"os"
9+
"testing"
10+
"time"
11+
12+
. "github.com/onsi/ginkgo/v2"
13+
. "github.com/onsi/gomega"
14+
15+
. "github.com/weaveworks/eksctl/integration/runner"
16+
"github.com/weaveworks/eksctl/integration/tests"
17+
"github.com/weaveworks/eksctl/pkg/testutils"
18+
)
19+
20+
// params holds the integration-test parameters shared by the specs in this
// file; it is assigned once in init.
var params *tests.Params
21+
22+
// init registers the standard testing flags and then builds the shared test
// parameters, storing them in the package-level params variable.
func init() {
23+
// Call testing.Init() prior to tests.NewParams(), as otherwise -test.* will not be recognised. See also: https://golang.org/doc/go1.13#testing
24+
testing.Init()
25+
params = tests.NewParamsWithGivenClusterName("enhanced-node-repair", "test-enhanced-node-repair")
26+
}
27+
28+
// TestEnhancedNodeRepair is the `go test` entry point for this suite. It hands
// t to testutils.RegisterAndRun, which presumably wires up Gomega and runs the
// Ginkgo specs declared in this package (helper not shown here — confirm).
func TestEnhancedNodeRepair(t *testing.T) {
29+
testutils.RegisterAndRun(t)
30+
}
31+
32+
// Specs for the enhanced node repair options. Each spec builds an
// `eksctl create cluster ... --dry-run` command, either from CLI flags or from
// a config file written to disk, and asserts only on whether the command
// succeeds or fails.
var _ = Describe("(Integration) Enhanced Node Repair Configuration", func() {
33+
34+
Context("CloudFormation template generation", func() {
35+
// NOTE(review): despite the spec name, only the command's success is
// asserted; the generated output is not inspected.
It("should generate correct CloudFormation template with CLI flags", func() {
36+
By("testing CLI flags generate correct CloudFormation")
37+
cmd := params.EksctlCreateCmd.WithArgs(
38+
"cluster",
39+
"--name", "test-cli-template",
40+
"--region", params.Region,
41+
"--managed",
42+
"--enable-node-repair",
43+
"--node-repair-max-unhealthy-percentage=25",
44+
"--node-repair-max-parallel-count=2",
45+
"--dry-run",
46+
)
47+
Expect(cmd).To(RunSuccessfully())
48+
})
49+
50+
// Same check as above, but the settings (including one override entry)
// come from a config file. As above, only success is asserted.
It("should generate correct CloudFormation template with YAML config", func() {
51+
By("creating temporary config file")
52+
// NOTE(review): the path is hard-coded under /tmp and is unique only to
// the second (time.Now().Unix()); os.CreateTemp would avoid both the
// fixed directory and possible name collisions.
configFile := fmt.Sprintf("/tmp/test-enhanced-node-repair-%d.yaml", time.Now().Unix())
53+
yamlConfig := fmt.Sprintf(`
54+
apiVersion: eksctl.io/v1alpha5
55+
kind: ClusterConfig
56+
57+
metadata:
58+
name: test-yaml-template
59+
region: %s
60+
61+
managedNodeGroups:
62+
- name: enhanced-ng
63+
instanceType: t3.medium
64+
desiredCapacity: 2
65+
nodeRepairConfig:
66+
enabled: true
67+
maxUnhealthyNodeThresholdPercentage: 20
68+
maxParallelNodesRepairedPercentage: 15
69+
nodeRepairConfigOverrides:
70+
- nodeMonitoringCondition: "NetworkNotReady"
71+
nodeUnhealthyReason: "InterfaceNotUp"
72+
minRepairWaitTimeMins: 15
73+
repairAction: "Restart"
74+
`, params.Region)
75+
76+
err := os.WriteFile(configFile, []byte(yamlConfig), 0644)
77+
Expect(err).NotTo(HaveOccurred())
78+
// Best-effort cleanup when this spec function returns; the error from
// os.Remove is discarded.
defer os.Remove(configFile)
79+
80+
By("testing YAML config generates correct CloudFormation")
81+
// The region is set inside the config file, so --region is dropped from
// the command line (assuming WithoutArg removes it — helper not shown).
cmd := params.EksctlCreateCmd.WithArgs(
82+
"cluster",
83+
"--config-file", configFile,
84+
"--dry-run",
85+
).WithoutArg("--region", params.Region)
86+
Expect(cmd).To(RunSuccessfully())
87+
})
88+
89+
// Passes --enable-node-repair alone, without any of the new threshold or
// parallelism flags.
It("should validate backward compatibility with existing config", func() {
90+
By("testing existing node repair config still works")
91+
cmd := params.EksctlCreateCmd.WithArgs(
92+
"cluster",
93+
"--name", "test-backward-compat",
94+
"--region", params.Region,
95+
"--managed",
96+
"--enable-node-repair",
97+
"--dry-run",
98+
)
99+
Expect(cmd).To(RunSuccessfully())
100+
})
101+
})
102+
103+
Context("error handling", func() {
104+
// NOTE(review): any failure satisfies NotTo(RunSuccessfully()); the error
// output is not checked, so an unrelated failure would also pass.
It("should handle invalid CLI flag combinations gracefully", func() {
105+
By("testing with unmanaged nodegroup (should fail)")
106+
cmd := params.EksctlCreateCmd.WithArgs(
107+
"cluster",
108+
"--name", "test-error-handling",
109+
"--region", params.Region,
110+
"--managed=false",
111+
"--enable-node-repair",
112+
"--dry-run",
113+
)
114+
Expect(cmd).NotTo(RunSuccessfully())
115+
})
116+
117+
// Config-file variant of the spec above: nodeRepairConfig is placed under
// nodeGroups instead of managedNodeGroups.
It("should handle invalid YAML configuration gracefully", func() {
118+
By("creating config file with invalid node repair config")
119+
// NOTE(review): same hard-coded /tmp path and second-granularity name as
// the YAML spec in the previous Context.
configFile := fmt.Sprintf("/tmp/test-invalid-config-%d.yaml", time.Now().Unix())
120+
invalidConfig := fmt.Sprintf(`
121+
apiVersion: eksctl.io/v1alpha5
122+
kind: ClusterConfig
123+
124+
metadata:
125+
name: test-invalid
126+
region: %s
127+
128+
nodeGroups:
129+
- name: unmanaged-ng
130+
instanceType: t3.medium
131+
nodeRepairConfig:
132+
enabled: true
133+
`, params.Region)
134+
135+
err := os.WriteFile(configFile, []byte(invalidConfig), 0644)
136+
Expect(err).NotTo(HaveOccurred())
137+
// Best-effort cleanup; the error from os.Remove is discarded.
defer os.Remove(configFile)
138+
139+
By("testing invalid config is rejected")
140+
cmd := params.EksctlCreateCmd.WithArgs(
141+
"cluster",
142+
"--config-file", configFile,
143+
"--dry-run",
144+
).WithoutArg("--region", params.Region)
145+
// This should fail because nodeRepairConfig is not supported for unmanaged nodegroups
146+
Expect(cmd).NotTo(RunSuccessfully())
147+
})
148+
})
149+
})

pkg/apis/eksctl.io/v1alpha5/assets/schema.json

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2307,10 +2307,43 @@
23072307
"type": "boolean",
23082308
"description": "Enables the auto repair feature for the nodegroup",
23092309
"x-intellij-html-description": "Enables the auto repair feature for the nodegroup"
2310+
},
2311+
"maxParallelNodesRepairedCount": {
2312+
"type": "integer",
2313+
"description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time.",
2314+
"x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time."
2315+
},
2316+
"maxParallelNodesRepairedPercentage": {
2317+
"type": "integer",
2318+
"description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time.",
2319+
"x-intellij-html-description": "specifies the maximum number of nodes that can be repaired concurrently or in parallel, expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time."
2320+
},
2321+
"maxUnhealthyNodeThresholdCount": {
2322+
"type": "integer",
2323+
"description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time.",
2324+
"x-intellij-html-description": "specifies a count threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time."
2325+
},
2326+
"maxUnhealthyNodeThresholdPercentage": {
2327+
"type": "integer",
2328+
"description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time.",
2329+
"x-intellij-html-description": "specifies a percentage threshold of unhealthy nodes, above which node auto repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time."
2330+
},
2331+
"nodeRepairConfigOverrides": {
2332+
"items": {
2333+
"$ref": "#/definitions/NodeRepairConfigOverride"
2334+
},
2335+
"type": "array",
2336+
"description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.",
2337+
"x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values."
23102338
}
23112339
},
23122340
"preferredOrder": [
2313-
"enabled"
2341+
"enabled",
2342+
"maxUnhealthyNodeThresholdPercentage",
2343+
"maxUnhealthyNodeThresholdCount",
2344+
"maxParallelNodesRepairedPercentage",
2345+
"maxParallelNodesRepairedCount",
2346+
"nodeRepairConfigOverrides"
23142347
],
23152348
"additionalProperties": false,
23162349
"description": "contains the auto repair configuration for the nodegroup",
@@ -2436,6 +2469,39 @@
24362469
"description": "contains the configuration for updating NodeGroups.",
24372470
"x-intellij-html-description": "contains the configuration for updating NodeGroups."
24382471
},
2472+
"NodeRepairConfigOverride": {
2473+
"properties": {
2474+
"minRepairWaitTimeMins": {
2475+
"type": "integer",
2476+
"description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason",
2477+
"x-intellij-html-description": "specifies the minimum time in minutes to wait before attempting to repair a node with this specific NodeMonitoringCondition and NodeUnhealthyReason"
2478+
},
2479+
"nodeMonitoringCondition": {
2480+
"type": "string",
2481+
"description": "specifies an unhealthy condition reported by the node monitoring agent that this override would apply to",
2482+
"x-intellij-html-description": "specifies an unhealthy condition reported by the node monitoring agent that this override would apply to"
2483+
},
2484+
"nodeUnhealthyReason": {
2485+
"type": "string",
2486+
"description": "specifies a reason reported by the node monitoring agent that this override would apply to",
2487+
"x-intellij-html-description": "specifies a reason reported by the node monitoring agent that this override would apply to"
2488+
},
2489+
"repairAction": {
2490+
"type": "string",
2491+
"description": "specifies the repair action to take for nodes when all of the specified conditions are met",
2492+
"x-intellij-html-description": "specifies the repair action to take for nodes when all of the specified conditions are met"
2493+
}
2494+
},
2495+
"preferredOrder": [
2496+
"nodeMonitoringCondition",
2497+
"nodeUnhealthyReason",
2498+
"minRepairWaitTimeMins",
2499+
"repairAction"
2500+
],
2501+
"additionalProperties": false,
2502+
"description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.",
2503+
"x-intellij-html-description": "specifies granular overrides for specific repair actions. These overrides control the repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values."
2504+
},
24392505
"OIDCIdentityProvider": {
24402506
"required": [
24412507
"name",

pkg/apis/eksctl.io/v1alpha5/types.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1609,6 +1609,48 @@ type (
16091609
// Enables the auto repair feature for the nodegroup
16101610
// +optional
16111611
Enabled *bool `json:"enabled,omitempty"`
1612+
1613+
// MaxUnhealthyNodeThresholdPercentage specifies a percentage threshold of unhealthy nodes, above which node auto
1614+
// repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdCount at the same time.
1615+
// +optional
1616+
MaxUnhealthyNodeThresholdPercentage *int `json:"maxUnhealthyNodeThresholdPercentage,omitempty"`
1617+
1618+
// MaxUnhealthyNodeThresholdCount specifies a count threshold of unhealthy nodes, above which node auto
1619+
// repair actions will stop. When using this, you cannot also set MaxUnhealthyNodeThresholdPercentage at the same time.
1620+
// +optional
1621+
MaxUnhealthyNodeThresholdCount *int `json:"maxUnhealthyNodeThresholdCount,omitempty"`
1622+
1623+
// MaxParallelNodesRepairedPercentage specifies the maximum number of nodes that can be repaired concurrently or in parallel,
1624+
// expressed as a percentage of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedCount at the same time.
1625+
// +optional
1626+
MaxParallelNodesRepairedPercentage *int `json:"maxParallelNodesRepairedPercentage,omitempty"`
1627+
1628+
// MaxParallelNodesRepairedCount specifies the maximum number of nodes that can be repaired concurrently or in parallel,
1629+
// expressed as a count of unhealthy nodes. When using this, you cannot also set MaxParallelNodesRepairedPercentage at the same time.
1630+
// +optional
1631+
MaxParallelNodesRepairedCount *int `json:"maxParallelNodesRepairedCount,omitempty"`
1632+
1633+
// NodeRepairConfigOverrides specifies granular overrides for specific repair actions. These overrides control the
1634+
// repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.
1635+
// +optional
1636+
NodeRepairConfigOverrides []NodeRepairConfigOverride `json:"nodeRepairConfigOverrides,omitempty"`
1637+
}
1638+
1639+
// NodeRepairConfigOverride specifies granular overrides for specific repair actions. These overrides control the
1640+
// repair action and the repair delay time before a node is considered eligible for repair. If you use this, you must specify all the values.
1641+
NodeRepairConfigOverride struct {
// None of the JSON tags below use omitempty, so all four keys are always
// emitted when an override is serialized.
1642+
// NodeMonitoringCondition specifies an unhealthy condition reported by the node monitoring agent that this override would apply to
1643+
NodeMonitoringCondition string `json:"nodeMonitoringCondition"`
1644+
1645+
// NodeUnhealthyReason specifies a reason reported by the node monitoring agent that this override would apply to
1646+
NodeUnhealthyReason string `json:"nodeUnhealthyReason"`
1647+
1648+
// MinRepairWaitTimeMins specifies the minimum time in minutes to wait before attempting to repair a node
1649+
// with this specific NodeMonitoringCondition and NodeUnhealthyReason
// NOTE(review): this is a plain int rather than *int, so an omitted value
// cannot be told apart from an explicit 0 after decoding.
1650+
MinRepairWaitTimeMins int `json:"minRepairWaitTimeMins"`
1651+
1652+
// RepairAction specifies the repair action to take for nodes when all of the specified conditions are met
1653+
RepairAction string `json:"repairAction"`
16121654
}
16131655
)
16141656

0 commit comments

Comments
 (0)