Skip to content

Commit 89c5ba0

Browse files
Merge pull request #19 from utilitywarehouse/replace-old-hc
Replace old healthcheck
2 parents 4b8fb4e + de5ed81 commit 89c5ba0

File tree

5 files changed

+24
-26
lines changed

5 files changed

+24
-26
lines changed

deploy/example/namespace/daemonset.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,15 @@ spec:
4141
capabilities:
4242
add:
4343
- 'NET_ADMIN'
44-
readinessProbe:
44+
livenessProbe:
4545
httpGet:
4646
path: /healthz
4747
port: readiness-port
48-
periodSeconds: 10
49-
failureThreshold: 1
48+
periodSeconds: 30
49+
failureThreshold: 6
50+
initialDelaySeconds: 10
51+
successThreshold: 1
52+
timeoutSeconds: 1
5053
volumes:
5154
- name: var-lib-semaphore-wireguard
5255
hostPath:

kube/node_watcher.go

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ type NodeWatcher struct {
2929
store cache.Store
3030
controller cache.Controller
3131
eventHandler NodeEventHandler
32-
ListHealthy bool
33-
WatchHealthy bool
3432
}
3533

3634
// NewNodeWatcher returns a new node watcher.
@@ -53,9 +51,6 @@ func (nw *NodeWatcher) Init() {
5351
if err != nil {
5452
log.Logger.Error("nw: list error", "err", err)
5553
metrics.IncNodeWatcherFailures(nw.clusterName, "list")
56-
nw.ListHealthy = false
57-
} else {
58-
nw.ListHealthy = true
5954
}
6055
return l, err
6156
},
@@ -64,9 +59,6 @@ func (nw *NodeWatcher) Init() {
6459
if err != nil {
6560
log.Logger.Error("nw: watch error", "err", err)
6661
metrics.IncNodeWatcherFailures(nw.clusterName, "watch")
67-
nw.WatchHealthy = false
68-
} else {
69-
nw.WatchHealthy = true
7062
}
7163
return w, err
7264
},
@@ -117,8 +109,3 @@ func (nw *NodeWatcher) List() ([]*v1.Node, error) {
117109
}
118110
return nodes, nil
119111
}
120-
121-
// Healthy is true when both list and watch handlers are running without errors.
122-
func (nw *NodeWatcher) Healthy() bool {
123-
return nw.ListHealthy && nw.WatchHealthy
124-
}

main.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,12 @@ func listenAndServe(runners []*Runner) {
162162
mux := http.NewServeMux()
163163
mux.Handle("/metrics", promhttp.Handler())
164164
mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
165+
// if we reach listenAndServe func the wg devices should have
166+
// been initialised and running. One could use the runners'
167+
// initialised flag for a liveness probe to kick the deployment
168+
// after some time
165169
for _, r := range runners {
166-
if !r.Healthy() {
170+
if !r.initialised {
167171
w.WriteHeader(http.StatusServiceUnavailable)
168172
return
169173
}

metrics/metrics.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) {
225225
}
226226
}
227227

228+
// SyncPeerAttempt increases the counter for attempts to sync wg peers
228229
func SyncPeerAttempt(device string, err error) {
229230
s := "1"
230231
if err != nil {
@@ -236,18 +237,21 @@ func SyncPeerAttempt(device string, err error) {
236237
}).Inc()
237238
}
238239

240+
// IncSyncQueueFullFailures increases sync queue failures counter
239241
func IncSyncQueueFullFailures(device string) {
240242
syncQueueFullFailures.With(prometheus.Labels{
241243
"device": device,
242244
}).Inc()
243245
}
244246

247+
// IncSyncRequeue increases requeue counter
245248
func IncSyncRequeue(device string) {
246249
syncRequeue.With(prometheus.Labels{
247250
"device": device,
248251
}).Inc()
249252
}
250253

254+
// IncNodeWatcherFailures increases node watcher failures counter
251255
func IncNodeWatcherFailures(c, v string) {
252256
nodeWatcherFailures.With(prometheus.Labels{
253257
"cluster": c,

runner.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ type Runner struct {
5353
nodeWatcher *kube.NodeWatcher
5454
peers map[string]Peer
5555
canSync bool // Flag to allow updating wireguard peers only after initial node watcher sync
56+
initialised bool // Flag to turn on after the successful initialisation of the runner to report healthy
5657
annotations RunnerAnnotations
5758
sync chan struct{}
5859
stop chan struct{}
@@ -65,6 +66,7 @@ func newRunner(client, watchClient kubernetes.Interface, nodeName, wgDeviceName,
6566
podSubnet: podSubnet,
6667
peers: make(map[string]Peer),
6768
canSync: false,
69+
initialised: false,
6870
annotations: constructRunnerAnnotations(localClusterName, remoteClusterName),
6971
sync: make(chan struct{}),
7072
stop: make(chan struct{}),
@@ -100,17 +102,20 @@ func (r *Runner) Run() error {
100102
if err := r.device.EnsureLinkUp(); err != nil {
101103
return err
102104
}
105+
// Static route to the whole subnet cidr
106+
if err := r.device.AddRouteToNet(r.podSubnet); err != nil {
107+
return err
108+
}
109+
// At this point the runner should be considered successfully initialised
110+
r.initialised = true
111+
103112
go r.nodeWatcher.Run()
104113
// wait for node watcher to sync. TODO: atm dummy and could run forever
105114
// if node cache fails to sync
106115
stopCh := make(chan struct{})
107116
if ok := cache.WaitForNamedCacheSync("nodeWatcher", stopCh, r.nodeWatcher.HasSynced); !ok {
108117
return fmt.Errorf("failed to wait for nodes cache to sync")
109118
}
110-
// Static route to the whole subnet cidr
111-
if err := r.device.AddRouteToNet(r.podSubnet); err != nil {
112-
return err
113-
}
114119
r.canSync = true
115120
r.enqueuePeersSync()
116121
return nil
@@ -308,8 +313,3 @@ func (r *Runner) onPeerNodeDelete(node *v1.Node) {
308313
}
309314
r.enqueuePeersSync()
310315
}
311-
312-
// Healthy is true if the node watcher is reporting healthy.
313-
func (r *Runner) Healthy() bool {
314-
return r.nodeWatcher.Healthy()
315-
}

0 commit comments

Comments
 (0)