@@ -32,6 +32,7 @@ import (
3232 cmmeta "github.com/cert-manager/cert-manager/pkg/apis/meta/v1"
3333 "github.com/cert-manager/cert-manager/pkg/util/pki"
3434 "github.com/go-logr/logr"
35+ "github.com/lestrrat-go/backoff/v2"
3536 "github.com/prometheus/client_golang/prometheus"
3637 "istio.io/istio/pkg/spiffe"
3738 pkiutil "istio.io/istio/security/pkg/pki/util"
@@ -164,17 +165,57 @@ func (p *Provider) Start(ctx context.Context) error {
164165 }()
165166 }
166167
167- // If using pure runtime configuration (i.e. no issuerRef provided on startup)
168- // we need to wait for an issuer to be available before we try to fetch the initial
169- // serving certificate
168+ var notAfter time.Time
170169
171- if p .issuerChangeNotifier .MustWaitForIssuer () {
172- p .waitForInitialIssuer (ctx )
173- }
170+ backoffPolicy := backoff .Exponential (
171+ backoff .WithMinInterval (time .Second ),
172+ backoff .WithMaxInterval (time .Second * 30 ),
173+ backoff .WithJitterFactor (0.05 ),
174+ backoff .WithMaxRetries (250 ),
175+ )
174176
175- notAfter , err := p .fetchCertificate (ctx )
176- if err != nil {
177- return fmt .Errorf ("failed to fetch initial serving certificate: %w" , err )
177+ backoffController := backoffPolicy .Start (ctx )
178+
179+ for backoff .Continue (backoffController ) {
180+ // If using pure runtime configuration (i.e. no issuerRef provided on startup) we might need
181+ // to wait for an issuer to be available before we try to fetch the initial serving certificate.
182+
183+ // We also need to wait to be able to actually successfully issue a certificate; if the
184+ // user is installing istio-csr and cert-manager at the same time and has already provisioned
185+ // a ConfigMap with runtime configuration before installing either, it's very possible that
186+ // the issuer the ConfigMap refers to doesn't yet exist, even though the issuer config exists
187+
188+ if ! p .issuerChangeNotifier .HasIssuerConfig () {
189+ p .waitForInitialIssuer (ctx )
190+ }
191+
192+ // We have an issuerRef now (after waitForInitialIssuer) but we don't know that it's valid
193+ // since the issuer might not have been created yet; so we try, with a timeout, to fetch a
194+ // cert and if it fails we'll hit the backoff and try again
195+
196+ issuanceTimeout := 10 * time .Second
197+
198+ fetchCtx , cancelFunc := context .WithTimeout (ctx , issuanceTimeout )
199+
200+ var err error
201+ notAfter , err = p .fetchCertificate (fetchCtx )
202+ if err != nil {
203+ cancelFunc ()
204+
205+ // Some of the functions which block in fetchCertificate don't wrap errors and we won't be able
206+ // to reliably confirm DeadlineExceeded was the actual error. But we can try and later the underlying
207+ // libraries might improve!
208+ if errors .Is (err , context .DeadlineExceeded ) {
209+ p .log .Info (fmt .Sprintf ("initial serving certificate didn't complete in %s; will retry" , issuanceTimeout ))
210+ } else {
211+ p .log .Info (fmt .Sprintf ("failed to fetch initial serving certificate in %s: %s; will retry" , issuanceTimeout , err ))
212+ }
213+
214+ continue
215+ }
216+
217+ cancelFunc ()
218+ break
178219 }
179220
180221 p .log .Info ("fetched initial serving certificate" )
@@ -212,8 +253,7 @@ func (p *Provider) Start(ctx context.Context) error {
212253 }
213254}
214255
215- // waitForInitialIssuer blocks until there's an issuer provided for creating the initial
216- // serving cert.
256+ // waitForInitialIssuer blocks until there's an issuer provided for creating the initial serving cert.
217257func (p * Provider ) waitForInitialIssuer (ctx context.Context ) {
218258 for {
219259 timer := time .NewTimer (5 * time .Second )
@@ -427,7 +467,8 @@ func (p *Provider) TrustDomain() string {
427467 return p .opts .TrustDomain
428468}
429469
430- // All istio-csr's need renewed service serving certificates.
470+ // All istio-csr pods need up-to-date serving certs to minimise the delay when a non-leader pod
471+ // takes leadership.
431472func (p * Provider ) NeedLeaderElection () bool {
432473 return false
433474}
0 commit comments