Commit 501a307

Retry initializing informers to allow for network instability on node restart (#3688)
* Retry initializing informers to allow for network instability on node restart
* use dskit backoff
* return error on failure
1 parent afcf69c commit 501a307
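For context, the change wraps informer initialization in a dskit backoff loop. Below is a minimal, self-contained sketch of the same retry pattern with the backoff settings used in this commit; `fetchInformer` and `initWithRetry` are hypothetical stand-ins for the new `getInformer` helper and its caller, and logging is simplified to `fmt.Printf` instead of the component's logger.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/grafana/dskit/backoff"
)

// fetchInformer is a hypothetical stand-in for the commit's getInformer helper:
// any call that can fail transiently while the apiserver becomes reachable.
func fetchInformer(ctx context.Context) error {
	return errors.New("apiserver not reachable yet")
}

func initWithRetry(ctx context.Context) error {
	// Same backoff settings as the commit: 1s minimum, 10s maximum, up to 3 retries.
	b := backoff.New(ctx, backoff.Config{
		MinBackoff: 1 * time.Second,
		MaxBackoff: 10 * time.Second,
		MaxRetries: 3,
	})

	var err error
	for b.Ongoing() {
		if err = fetchInformer(ctx); err == nil {
			return nil // success: stop retrying
		}
		fmt.Printf("failed to get informer, retrying: %v\n", err)
		b.Wait() // sleep before the next attempt, or return early if ctx is cancelled
	}
	// Retries exhausted (or context cancelled): surface the last error to the caller,
	// matching the commit's "return error on failure".
	return err
}

func main() {
	if err := initWithRetry(context.Background()); err != nil {
		fmt.Println("giving up:", err)
	}
}

In the actual component the loop lives in configureInformers (see the crdmanager.go diff below) and also logs the next backoff delay through the component logger.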

File tree: 2 files changed (+43, -9 lines)

  CHANGELOG.md
  internal/component/prometheus/operator/common/crdmanager.go


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,8 @@ Main (unreleased)
 - Fix the `validate` command not understanding the `livedebugging` block. (@dehaansa)
 - Fix invalid class names in python profiles obtained with `pyroscope.ebpf`. (@korniltsev)
 
+- For CRD-based components (`prometheus.operator.*`), retry initializing informers if the apiserver request fails. This rectifies issues where the apiserver is not reachable immediately after node restart. (@dehaansa)
+
 v1.9.1
 -----------------
 

internal/component/prometheus/operator/common/crdmanager.go

Lines changed: 41 additions & 9 deletions
@@ -13,6 +13,7 @@ import (
 
 	"github.com/go-kit/log"
 	"github.com/grafana/ckit/shard"
+	"github.com/grafana/dskit/backoff"
 	promopv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
 	promopv1alpha1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1alpha1"
 	"github.com/prometheus/common/model"
@@ -162,7 +163,7 @@ func (c *crdManager) Run(ctx context.Context) error {
 	if err := c.runInformers(restConfig, ctx); err != nil {
 		return err
 	}
-	level.Info(c.logger).Log("msg", "informers started")
+	level.Info(c.logger).Log("msg", "informers started")
 
 	var cachedTargets map[string][]*targetgroup.Group
 	// Start the target discovery loop to update the scrape manager with new targets.
@@ -323,6 +324,23 @@ func (c *crdManager) runInformers(restConfig *rest.Config, ctx context.Context)
 	return nil
 }
 
+func getInformer(ctx context.Context, informers cache.Informers, prototype client.Object, timeout time.Duration) (cache.Informer, error) {
+	informerCtx, cancel := context.WithTimeout(ctx, timeout)
+	defer cancel()
+
+	informer, err := informers.GetInformer(informerCtx, prototype)
+	if err != nil {
+		if errors.Is(informerCtx.Err(), context.DeadlineExceeded) { // Check the context to prevent GetInformer returning a fake timeout
+			return nil, fmt.Errorf("timeout exceeded while configuring informers. Check the connection"+
+				" to the Kubernetes API is stable and that Alloy has appropriate RBAC permissions for %T", prototype)
+		}
+
+		return nil, err
+	}
+
+	return informer, err
+}
+
 // configureInformers configures the informers for the CRDManager to watch for crd changes.
 func (c *crdManager) configureInformers(ctx context.Context, informers cache.Informers) error {
 	var prototype client.Object
@@ -339,18 +357,32 @@ func (c *crdManager) configureInformers(ctx context.Context, informers cache.Informers) error {
 		return fmt.Errorf("unknown kind to configure Informers: %s", c.kind)
 	}
 
-	informerCtx, cancel := context.WithTimeout(ctx, c.args.InformerSyncTimeout)
-	defer cancel()
+	// On node restart, the API server is not always immediately available.
+	// Retry with backoff to give time for the network to initialize.
+	var informer cache.Informer
+	var err error
 
-	informer, err := informers.GetInformer(informerCtx, prototype)
-	if err != nil {
-		if errors.Is(informerCtx.Err(), context.DeadlineExceeded) { // Check the context to prevent GetInformer returning a fake timeout
-			return fmt.Errorf("timeout exceeded while configuring informers. Check the connection"+
-				" to the Kubernetes API is stable and that Alloy has appropriate RBAC permissions for %v", prototype)
+	backoff := backoff.New(
+		ctx,
+		backoff.Config{
+			MinBackoff: 1 * time.Second,
+			MaxBackoff: 10 * time.Second,
+			MaxRetries: 3, // retry up to 3 times
+		},
+	)
+	for backoff.Ongoing() {
+		// Retry to get the informer in case of a timeout.
+		informer, err = getInformer(ctx, informers, prototype, c.args.InformerSyncTimeout)
+		if err == nil {
+			break
 		}
-
+		level.Warn(c.logger).Log("msg", "failed to get informer, retrying", "next backoff", backoff.NextDelay(), "err", err)
+		backoff.Wait()
+	}
+	if err != nil {
 		return err
 	}
+
 	const resync = 5 * time.Minute
 	switch c.kind {
 	case KindPodMonitor:
