Skip to content

Commit b35a833

Browse files
damsallemd.amsallem
andauthored
Add 30s client timeout on HealthHook http calls (#74)
* Add 30s client timeout on HealthHook http calls * It's a failsafe one in case an external dep is unresponsive --------- Co-authored-by: d.amsallem <[email protected]>
1 parent 37b7627 commit b35a833

File tree

6 files changed

+17
-7
lines changed

6 files changed

+17
-7
lines changed

cmd/main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ func main() {
5757
var rejectEmptyNodeDisruption bool
5858
var retryInterval time.Duration
5959
var rejectOverlappingDisruption bool
60+
var healthHookTimeout time.Duration
6061
var nodeDisruptionTypesRaw string
6162
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
6263
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
@@ -66,6 +67,7 @@ func main() {
6667
flag.BoolVar(&rejectEmptyNodeDisruption, "reject-empty-node-disruption", false, "Reject NodeDisruption matching no actual node.")
6768
flag.DurationVar(&retryInterval, "retry-interval", controller.DefaultRetryInterval, "How long to wait between each retry (Default 60s)")
6869
flag.BoolVar(&rejectOverlappingDisruption, "reject-overlapping-disruption", false, "Automatically reject any overlapping NodeDisruption (based on node selector), preserving the oldest one")
70+
flag.DurationVar(&healthHookTimeout, "healthhook-timeout", controller.DefaultHealthHookTimeout, "HTTP client timeout for calling HealthHook resolved from ADB")
6971
flag.StringVar(&nodeDisruptionTypesRaw, "node-disruption-types", "", "The list of types allowed for a node disruption separated by a comma.")
7072

7173
opts := zap.Options{
@@ -109,6 +111,7 @@ func main() {
109111
RejectEmptyNodeDisruption: rejectEmptyNodeDisruption,
110112
RetryInterval: retryInterval,
111113
RejectOverlappingDisruption: rejectOverlappingDisruption,
114+
HealthHookTimeout: healthHookTimeout,
112115
NodeDisruptionTypes: nodeDisruptionTypes,
113116
},
114117
}).SetupWithManager(mgr); err != nil {

internal/controller/applicationdisruptionbudget_controller.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"net/http"
2626
"reflect"
2727
"strconv"
28+
"time"
2829

2930
"k8s.io/apimachinery/pkg/api/errors"
3031

@@ -216,12 +217,12 @@ func (r *ApplicationDisruptionBudgetResolver) GetNamespacedName() nodedisruption
216217
}
217218

218219
// Call a lifecycle hook in order to synchronously validate a Node Disruption
219-
func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context, nd nodedisruptionv1alpha1.NodeDisruption) error {
220+
func (r *ApplicationDisruptionBudgetResolver) CallHealthHook(ctx context.Context, nd nodedisruptionv1alpha1.NodeDisruption, timeout time.Duration) error {
220221
if r.ApplicationDisruptionBudget.Spec.HealthHook.URL == "" {
221222
return nil
222223
}
223224

224-
client := &http.Client{}
225+
client := &http.Client{Timeout: timeout} // Last resort, in case an external service is unresponsive.
225226
headers := make(map[string][]string, 1)
226227

227228
data, err := json.Marshal(nd)

internal/controller/budget.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package controller
22

33
import (
44
"context"
5+
"time"
56

67
nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
78
"github.com/criteo/node-disruption-controller/pkg/resolver"
@@ -17,7 +18,7 @@ type Budget interface {
1718
// Return the number of disruption allowed considering a list of current node disruptions
1819
TolerateDisruption(resolver.NodeSet) bool
1920
// Call a lifecycle hook in order to synchronously validate a Node Disruption
20-
CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption) error
21+
CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption, time.Duration) error
2122
// Apply the budget's status to Kubernetes
2223
UpdateStatus(context.Context) error
2324
// Get the name, namespace and kind of bduget

internal/controller/budget_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"testing"
7+
"time"
78

89
nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
910
"github.com/criteo/node-disruption-controller/internal/controller"
@@ -36,7 +37,7 @@ func (m *MockBudget) TolerateDisruption(resolver.NodeSet) bool {
3637
}
3738

3839
// Check health make a synchronous health check on the underlying resource of a budget
39-
func (m *MockBudget) CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption) error {
40+
func (m *MockBudget) CallHealthHook(context.Context, nodedisruptionv1alpha1.NodeDisruption, time.Duration) error {
4041
m.healthChecked = true
4142
return m.health
4243
}

internal/controller/nodedisruption_controller.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ import (
3636
)
3737

3838
const (
39-
DefaultRetryInterval = time.Minute
39+
DefaultRetryInterval = time.Minute
40+
DefaultHealthHookTimeout = time.Minute
4041
)
4142

4243
type NodeDisruptionReconcilerConfig struct {
@@ -46,6 +47,8 @@ type NodeDisruptionReconcilerConfig struct {
4647
RetryInterval time.Duration
4748
// Reject NodeDisruption if its node selector overlaps an older NodeDisruption's selector
4849
RejectOverlappingDisruption bool
50+
// HealthHook http call resolved from ADB need a timeout to avoid unresponsive call
51+
HealthHookTimeout time.Duration
4952
// Specify which node disruption types are allowed to be granted
5053
NodeDisruptionTypes []string
5154
}
@@ -384,7 +387,7 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithBudgetConstraints(ctx con
384387
}
385388

386389
for _, budget := range impactedBudgets {
387-
err := budget.CallHealthHook(ctx, ndr.NodeDisruption)
390+
err := budget.CallHealthHook(ctx, ndr.NodeDisruption, ndr.Config.HealthHookTimeout)
388391
ref := budget.GetNamespacedName()
389392
if err != nil {
390393
anyFailed = true

internal/controller/nodedisruptionbudget_controller.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"math"
2222
"reflect"
23+
"time"
2324

2425
corev1 "k8s.io/api/core/v1"
2526
"k8s.io/apimachinery/pkg/api/errors"
@@ -197,7 +198,7 @@ func (r *NodeDisruptionBudgetResolver) TolerateDisruption(disruptedNodes resolve
197198
}
198199

199200
// Call a lifecycle hook in order to synchronously validate a Node Disruption
200-
func (r *NodeDisruptionBudgetResolver) CallHealthHook(_ context.Context, _ nodedisruptionv1alpha1.NodeDisruption) error {
201+
func (r *NodeDisruptionBudgetResolver) CallHealthHook(_ context.Context, _ nodedisruptionv1alpha1.NodeDisruption, _ time.Duration) error {
201202
return nil
202203
}
203204

0 commit comments

Comments
 (0)