diff --git a/cmd/main.go b/cmd/main.go index 0a059cd..7832570 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -57,6 +57,7 @@ func main() { var rejectEmptyNodeDisruption bool var retryInterval time.Duration var rejectOverlappingDisruption bool + var healthHookTimeout time.Duration var nodeDisruptionTypesRaw string flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -66,6 +67,7 @@ func main() { flag.BoolVar(&rejectEmptyNodeDisruption, "reject-empty-node-disruption", false, "Reject NodeDisruption matching no actual node.") flag.DurationVar(&retryInterval, "retry-interval", controller.DefaultRetryInterval, "How long to wait between each retry (Default 60s)") flag.BoolVar(&rejectOverlappingDisruption, "reject-overlapping-disruption", false, "Automatically reject any overlapping NodeDisruption (based on node selector), preserving the oldest one") + flag.DurationVar(&healthHookTimeout, "healthhook-timeout", controller.DefaultHealthHookTimeout, "HTTP client timeout for calling HealthHook resolved from ADB") flag.StringVar(&nodeDisruptionTypesRaw, "node-disruption-types", "", "The list of types allowed for a node disruption separated by a comma.") opts := zap.Options{ @@ -109,6 +111,7 @@ func main() { RejectEmptyNodeDisruption: rejectEmptyNodeDisruption, RetryInterval: retryInterval, RejectOverlappingDisruption: rejectOverlappingDisruption, + HealthHookTimeout: healthHookTimeout, NodeDisruptionTypes: nodeDisruptionTypes, }, }).SetupWithManager(mgr); err != nil { diff --git a/internal/controller/applicationdisruptionbudget_controller.go b/internal/controller/applicationdisruptionbudget_controller.go index e11b802..4367324 100644 --- a/internal/controller/applicationdisruptionbudget_controller.go +++ b/internal/controller/applicationdisruptionbudget_controller.go @@ -25,6 +25,7 @@ import ( "net/http" "reflect" "strconv" + "time" "k8s.io/apimachinery/pkg/api/errors" @@ -216,12 +217,12 @@ func (r *ApplicationDisruptionBudgetResolver) GetNamespacedName() nodedisruption } // Call a lifecycle hook in order to synchronously validate a Node Disruption -func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context, nd nodedisruptionv1alpha1.NodeDisruption) error { +func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context, nd nodedisruptionv1alpha1.NodeDisruption, timeout time.Duration) error { if r.ApplicationDisruptionBudget.Spec.HealthHook.URL == "" { return nil } - client := &http.Client{} + client := &http.Client{Timeout: timeout} // Last resort, in case an external service is unresponsive. headers := make(map[string][]string, 1) data, err := json.Marshal(nd) diff --git a/internal/controller/budget.go b/internal/controller/budget.go index df7b2dc..69f24d9 100644 --- a/internal/controller/budget.go +++ b/internal/controller/budget.go @@ -2,6 +2,7 @@ package controller import ( "context" + "time" nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1" "github.com/criteo/node-disruption-controller/pkg/resolver" @@ -17,7 +18,7 @@ type Budget interface { // Return the number of disruption allowed considering a list of current node disruptions TolerateDisruption(resolver.NodeSet) bool // Call a lifecycle hook in order to synchronously validate a Node Disruption - CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption) error + CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption, time.Duration) error // Apply the budget's status to Kubernetes UpdateStatus(context.Context) error // Get the name, namespace and kind of bduget diff --git a/internal/controller/budget_test.go b/internal/controller/budget_test.go index ef0319a..f4ca49a 100644 --- a/internal/controller/budget_test.go +++ b/internal/controller/budget_test.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "testing" + "time" nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1" "github.com/criteo/node-disruption-controller/internal/controller" @@ -36,7 +37,7 @@ func (m *MockBudget) TolerateDisruption(resolver.NodeSet) bool { } // Check health make a synchronous health check on the underlying resource of a budget -func (m *MockBudget) CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption) error { +func (m *MockBudget) CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption, time.Duration) error { m.healthChecked = true return m.health } diff --git a/internal/controller/nodedisruption_controller.go b/internal/controller/nodedisruption_controller.go index 82c6b96..f447132 100644 --- a/internal/controller/nodedisruption_controller.go +++ b/internal/controller/nodedisruption_controller.go @@ -36,7 +36,8 @@ import ( ) const ( - DefaultRetryInterval = time.Minute + DefaultRetryInterval = time.Minute + DefaultHealthHookTimeout = time.Minute ) type NodeDisruptionReconcilerConfig struct { @@ -46,6 +47,8 @@ type NodeDisruptionReconcilerConfig struct { RetryInterval time.Duration // Reject NodeDisruption if its node selector overlaps an older NodeDisruption's selector RejectOverlappingDisruption bool + // HealthHook http call resolved from ADB need a timeout to avoid unresponsive call + HealthHookTimeout time.Duration // Specify which node disruption types are allowed to be granted NodeDisruptionTypes []string } @@ -384,7 +387,7 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con } for _, budget := range impactedBudgets { - err := budget.CallHealthHook(ctx, ndr.NodeDisruption) + err := budget.CallHealthHook(ctx, ndr.NodeDisruption, ndr.Config.HealthHookTimeout) ref := budget.GetNamespacedName() if err != nil { anyFailed = true diff --git a/internal/controller/nodedisruptionbudget_controller.go b/internal/controller/nodedisruptionbudget_controller.go index 192c5a5..f043696 100644 --- a/internal/controller/nodedisruptionbudget_controller.go +++ b/internal/controller/nodedisruptionbudget_controller.go @@ -20,6 +20,7 @@ import ( "context" "math" "reflect" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -197,7 +198,7 @@ func (r *NodeDisruptionBudgetResolver) TolerateDisruption(disruptedNodes resolve } // Call a lifecycle hook in order to synchronously validate a Node Disruption -func (r *NodeDisruptionBudgetResolver) CallHealthHook(_ context.Context, _ nodedisruptionv1alpha1.NodeDisruption) error { +func (r *NodeDisruptionBudgetResolver) CallHealthHook(_ context.Context, _ nodedisruptionv1alpha1.NodeDisruption, _ time.Duration) error { return nil }