Skip to content

Commit bbb57a5

Browse files
Merge pull request #364 from SgtCoDFish/backoff-startup
Add retries for issuing initial serving cert
2 parents d7dd7ab + 346d669 commit bbb57a5

File tree

3 files changed

+60
-21
lines changed

3 files changed

+60
-21
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ require (
1010
github.com/fsnotify/fsnotify v1.7.0
1111
github.com/go-logr/logr v1.4.2
1212
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
13+
github.com/lestrrat-go/backoff/v2 v2.0.8
1314
github.com/onsi/ginkgo/v2 v2.19.0
1415
github.com/onsi/gomega v1.33.1
1516
github.com/prometheus/client_golang v1.19.1
@@ -80,7 +81,6 @@ require (
8081
github.com/inconshreveable/mousetrap v1.1.0 // indirect
8182
github.com/josharian/intern v1.0.0 // indirect
8283
github.com/json-iterator/go v1.1.12 // indirect
83-
github.com/lestrrat-go/backoff/v2 v2.0.8 // indirect
8484
github.com/lestrrat-go/blackmagic v1.0.2 // indirect
8585
github.com/lestrrat-go/httpcc v1.0.1 // indirect
8686
github.com/lestrrat-go/iter v1.0.2 // indirect

pkg/certmanager/certmanager.go

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,11 @@ type IssuerChangeNotifier interface {
8282
// as updates happen.
8383
SubscribeIssuerChange() <-chan *cmmeta.ObjectReference
8484

85-
// MustWaitForIssuer returns true if there's no "default" issuerRef available
86-
// (i.e. no static issuerRef was configured at startup)
85+
// HasIssuerConfig returns true if there's a configured active issuer ref.
86+
// (i.e. a static issuerRef was configured at startup / runtime issuance config has been successfully acquired)
8787
// If this function returns true, InitialIssuer will always return nil and
88-
// subscribers must wait for runtime configuration before trying to issue
89-
// certificates
90-
MustWaitForIssuer() bool
88+
// subscribers must wait for runtime configuration before trying to issue certificates
89+
HasIssuerConfig() bool
9190

9291
// InitialIssuer returns the "static" issuer which was configured at startup. Will
9392
// always return nil if no such issuer exists.
@@ -475,9 +474,8 @@ func (m *manager) SubscribeIssuerChange() <-chan *cmmeta.ObjectReference {
475474
return ch
476475
}
477476

478-
func (m *manager) MustWaitForIssuer() bool {
479-
// if no originalIssuerRef was configured, must wait for runtime configuration
480-
return m.originalIssuerRef == nil
477+
func (m *manager) HasIssuerConfig() bool {
478+
return m.activeIssuerRef != nil
481479
}
482480

483481
func (m *manager) InitialIssuer() *cmmeta.ObjectReference {

pkg/tls/tls.go

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
cmmeta "github.com/cert-manager/cert-manager/pkg/apis/meta/v1"
3333
"github.com/cert-manager/cert-manager/pkg/util/pki"
3434
"github.com/go-logr/logr"
35+
"github.com/lestrrat-go/backoff/v2"
3536
"github.com/prometheus/client_golang/prometheus"
3637
"istio.io/istio/pkg/spiffe"
3738
pkiutil "istio.io/istio/security/pkg/pki/util"
@@ -164,17 +165,57 @@ func (p *Provider) Start(ctx context.Context) error {
164165
}()
165166
}
166167

167-
// If using pure runtime configuration (i.e. no issuerRef provided on startup)
168-
// we need to wait for an issuer to be available before we try to fetch the initial
169-
// serving certificate
168+
var notAfter time.Time
170169

171-
if p.issuerChangeNotifier.MustWaitForIssuer() {
172-
p.waitForInitialIssuer(ctx)
173-
}
170+
backoffPolicy := backoff.Exponential(
171+
backoff.WithMinInterval(time.Second),
172+
backoff.WithMaxInterval(time.Second*30),
173+
backoff.WithJitterFactor(0.05),
174+
backoff.WithMaxRetries(250),
175+
)
174176

175-
notAfter, err := p.fetchCertificate(ctx)
176-
if err != nil {
177-
return fmt.Errorf("failed to fetch initial serving certificate: %w", err)
177+
backoffController := backoffPolicy.Start(ctx)
178+
179+
for backoff.Continue(backoffController) {
180+
// If using pure runtime configuration (i.e. no issuerRef provided on startup) we might need
181+
// to wait for an issuer to be available before we try to fetch the initial serving certificate.
182+
183+
// We also need to wait to be able to actually successfully issue a certificate; if the
184+
// user is installing istio-csr and cert-manager at the same time and has already provisioned
185+
// a ConfigMap with runtime configuration before installing either, it's very possible that
186+
// the issuer the ConfigMap refers to doesn't yet exist, even though the issuer config exists
187+
188+
if !p.issuerChangeNotifier.HasIssuerConfig() {
189+
p.waitForInitialIssuer(ctx)
190+
}
191+
192+
// We have an issuerRef now (after waitForInitialIssuer) but we don't know that it's valid
193+
// since the issuer might not have been created yet; so we try, with a timeout, to fetch a
194+
// cert and if it fails we'll hit the backoff and try again
195+
196+
issuanceTimeout := 10 * time.Second
197+
198+
fetchCtx, cancelFunc := context.WithTimeout(ctx, issuanceTimeout)
199+
200+
var err error
201+
notAfter, err = p.fetchCertificate(fetchCtx)
202+
if err != nil {
203+
cancelFunc()
204+
205+
// Some of the functions which block in fetchCertificate don't wrap errors and we won't be able
206+
// to reliably confirm DeadlineExceeded was the actual error. But we can try and later the underlying
207+
// libraries might improve!
208+
if errors.Is(err, context.DeadlineExceeded) {
209+
p.log.Info(fmt.Sprintf("initial serving certificate didn't complete in %s; will retry", issuanceTimeout))
210+
} else {
211+
p.log.Info(fmt.Sprintf("failed to fetch initial serving certificate in %s: %s; will retry", issuanceTimeout, err))
212+
}
213+
214+
continue
215+
}
216+
217+
cancelFunc()
218+
break
178219
}
179220

180221
p.log.Info("fetched initial serving certificate")
@@ -212,8 +253,7 @@ func (p *Provider) Start(ctx context.Context) error {
212253
}
213254
}
214255

215-
// waitForInitialIssuer blocks until there's an issuer provided for creating the initial
216-
// serving cert.
256+
// waitForInitialIssuer blocks until there's an issuer provided for creating the initial serving cert.
217257
func (p *Provider) waitForInitialIssuer(ctx context.Context) {
218258
for {
219259
timer := time.NewTimer(5 * time.Second)
@@ -427,7 +467,8 @@ func (p *Provider) TrustDomain() string {
427467
return p.opts.TrustDomain
428468
}
429469

430-
// All istio-csr's need renewed service serving certificates.
470+
// All istio-csr pods need up-to-date serving certs to minimise the delay when a non-leader pod
471+
// takes leadership.
431472
func (p *Provider) NeedLeaderElection() bool {
432473
return false
433474
}

0 commit comments

Comments
 (0)