Skip to content

Commit 03f1db2

Browse files
committed
Implement comprehensive improvements to certificate renewal reliability
This commit addresses network unreliability and improves the robustness of the ACME certificate lifecycle with the following enhancements:

**1. Challenge Cleanup and Memory Management**
- Add explicit cleanup of HTTP challenge responses after validation
- Implement automatic cleanup of orphaned challenges (1-hour TTL)
- Add IDisposable to InMemoryHttpChallengeResponseStore with background timer
- Prevent memory leaks during long-running processes

**2. Challenge Endpoint Self-Test**
- Add self-test of HTTP challenge endpoint before requesting Let's Encrypt validation
- Verify challenge endpoint is reachable and returns correct response
- Catch configuration issues (firewall, reverse proxy, port binding) early
- Configurable via LettuceEncryptOptions.EnableChallengeSelfTest (default: true)
- Reduces rate limiting risk from failed validation attempts

**3. Configurable Validation Timeouts and Polling**
- Replace hardcoded 60 retries × 2s with configurable timeout/interval
- Add LettuceEncryptOptions.ValidationTimeout (default: 5 minutes)
- Add LettuceEncryptOptions.ValidationPollInterval (default: 2 seconds)
- Improve behavior in environments with slow DNS propagation or network latency
- Enhanced timeout error messages with actionable guidance

**4. Exponential Backoff for Renewal Failures**
- Implement RenewalFailureTracker to track failures per domain
- Exponential backoff: 1h, 2h, 4h, 8h, capped at 24h
- Override backoff when certificate expires in < 7 days (aggressive retry)
- Reset failure count on successful renewal
- Prevent cascading failures and rate limit exhaustion
- BeginCertificateCreationState no longer throws on failure (returns to CheckForRenewalState)

**5. Enhanced Logging and Diagnostics**
- Add structured logging throughout certificate lifecycle
- Log validation attempt counts, timing, and detailed error context
- Track which challenge methods are attempted and why they fail
- Log renewal decisions with certificate expiry information
- Add startup certificate validation logging with counts

**6. Graceful Degradation on Validation Failures**
- Improve error handling in ValidateDomainOwnershipAsync
- Log each validation method attempt with detailed failure reasons
- Collect all failures before throwing AggregateException
- Better error messages showing which methods were tried and why they failed
- Show remaining validators count during attempts

**7. Startup Certificate Validation**
- Validate certificates on startup (private key, expiry, validity period)
- Skip expired, corrupted, or invalid certificates
- Warn about certificates expiring within 30 days
- Prevent loading broken certificates into the selector
- Comprehensive logging of validation results

**Breaking Changes:** None - all changes are backward compatible with sensible defaults.

**Configuration Examples:**
```csharp
services.AddLettuceEncrypt(options =>
{
    options.ValidationTimeout = TimeSpan.FromMinutes(10); // For slow DNS
    options.ValidationPollInterval = TimeSpan.FromSeconds(5); // Less aggressive polling
    options.EnableChallengeSelfTest = true; // Default, catches config issues early
});
```

Fixes issues with:
- Network unreliability causing certificate renewal failures
- Memory leaks from orphaned challenge responses
- Configuration errors not being caught until Let's Encrypt validation
- Lack of backoff after failures leading to rate limiting
- Poor diagnostics when renewals fail
- Loading of invalid/corrupted certificates

Related to the timing fix in commit efe145b that ensured HTTP server readiness before ACME challenge validation.
1 parent 47be11c commit 03f1db2

13 files changed

+615
-35
lines changed

src/LettuceEncrypt/Internal/AcmeCertificateFactory.cs

Lines changed: 47 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -236,23 +236,26 @@ private async Task ValidateDomainOwnershipAsync(IAuthorizationContext authorizat
236236
cancellationToken.ThrowIfCancellationRequested();
237237

238238
var validators = new List<DomainOwnershipValidator>();
239+
var validationTimeout = _options.Value.ValidationTimeout;
240+
var validationPollInterval = _options.Value.ValidationPollInterval;
241+
var enableSelfTest = _options.Value.EnableChallengeSelfTest;
239242

240243
if (_tlsAlpnChallengeResponder.IsEnabled)
241244
{
242245
validators.Add(new TlsAlpn01DomainValidator(
243-
_tlsAlpnChallengeResponder, _appLifetime, _client, _logger, domainName));
246+
_tlsAlpnChallengeResponder, _appLifetime, _client, _logger, domainName, validationTimeout, validationPollInterval));
244247
}
245248

246249
if (_options.Value.AllowedChallengeTypes.HasFlag(ChallengeType.Http01))
247250
{
248251
validators.Add(new Http01DomainValidator(
249-
_challengeStore, _appLifetime, _client, _logger, domainName));
252+
_challengeStore, _appLifetime, _client, _logger, domainName, validationTimeout, validationPollInterval, enableSelfTest));
250253
}
251254

252255
if (_options.Value.AllowedChallengeTypes.HasFlag(ChallengeType.Dns01))
253256
{
254257
validators.Add(new Dns01DomainValidator(
255-
_dnsChallengeProvider, _appLifetime, _client, _logger, domainName));
258+
_dnsChallengeProvider, _appLifetime, _client, _logger, domainName, validationTimeout, validationPollInterval));
256259
}
257260

258261
if (validators.Count == 0)
@@ -263,23 +266,61 @@ private async Task ValidateDomainOwnershipAsync(IAuthorizationContext authorizat
263266
"Ensure at least one kind of these challenge types is configured: " + challengeTypes);
264267
}
265268

269+
_logger.LogInformation(
270+
"Attempting domain validation for '{DomainName}' using {ValidatorCount} challenge method(s): {Validators}",
271+
domainName,
272+
validators.Count,
273+
string.Join(", ", validators.Select(v => v.GetType().Name.Replace("DomainValidator", ""))));
274+
275+
var failures = new List<Exception>();
276+
266277
foreach (var validator in validators)
267278
{
268279
cancellationToken.ThrowIfCancellationRequested();
280+
var validatorName = validator.GetType().Name.Replace("DomainValidator", "");
281+
269282
try
270283
{
284+
_logger.LogDebug("Trying {ValidatorName} validation for domain '{DomainName}'",
285+
validatorName, domainName);
286+
271287
await validator.ValidateOwnershipAsync(authorizationContext, cancellationToken);
288+
272289
// The method above raises if validation fails. If no exception occurs, we assume validation completed successfully.
290+
_logger.LogInformation(
291+
"Domain validation succeeded using {ValidatorName} for '{DomainName}'",
292+
validatorName, domainName);
273293
return;
274294
}
275295
catch (Exception ex)
276296
{
277-
_logger.LogDebug(ex, "Validation with {validatorType} failed with error: {error}",
278-
validator.GetType().Name, ex.Message);
297+
failures.Add(ex);
298+
_logger.LogWarning(ex,
299+
"Validation with {ValidatorName} failed for domain '{DomainName}'. " +
300+
"Error: {ErrorMessage}. " +
301+
"{RemainingValidators} validation method(s) remaining.",
302+
validatorName,
303+
domainName,
304+
ex.Message,
305+
validators.Count - failures.Count);
279306
}
280307
}
281308

282-
throw new InvalidOperationException($"Failed to validate ownership of domainName '{domainName}'");
309+
// All validators failed
310+
var failureDetails = string.Join("; ", failures.Select((ex, i) =>
311+
$"{validators[i].GetType().Name.Replace("DomainValidator", "")}: {ex.Message}"));
312+
313+
_logger.LogError(
314+
"All {ValidatorCount} validation methods failed for domain '{DomainName}'. Failures: {FailureDetails}",
315+
validators.Count,
316+
domainName,
317+
failureDetails);
318+
319+
throw new AggregateException(
320+
$"Failed to validate ownership of domainName '{domainName}' using any available challenge method. " +
321+
$"Attempted: {string.Join(", ", validators.Select(v => v.GetType().Name.Replace("DomainValidator", "")))}. " +
322+
$"See inner exceptions for details.",
323+
failures);
283324
}
284325

285326
private async Task<X509Certificate2> CompleteCertificateRequestAsync(IOrderContext order,

src/LettuceEncrypt/Internal/AcmeStates/BeginCertificateCreationState.cs

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,24 @@ internal class BeginCertificateCreationState : AcmeState
1414
private readonly AcmeCertificateFactory _acmeCertificateFactory;
1515
private readonly CertificateSelector _selector;
1616
private readonly IEnumerable<ICertificateRepository> _certificateRepositories;
17+
private readonly RenewalFailureTracker _failureTracker;
1718

1819
public BeginCertificateCreationState(
19-
AcmeStateMachineContext context, ILogger<ServerStartupState> logger,
20-
IOptions<LettuceEncryptOptions> options, AcmeCertificateFactory acmeCertificateFactory,
21-
CertificateSelector selector, IEnumerable<ICertificateRepository> certificateRepositories)
20+
AcmeStateMachineContext context,
21+
ILogger<ServerStartupState> logger,
22+
IOptions<LettuceEncryptOptions> options,
23+
AcmeCertificateFactory acmeCertificateFactory,
24+
CertificateSelector selector,
25+
IEnumerable<ICertificateRepository> certificateRepositories,
26+
RenewalFailureTracker failureTracker)
2227
: base(context)
2328
{
2429
_logger = logger;
2530
_options = options;
2631
_acmeCertificateFactory = acmeCertificateFactory;
2732
_selector = selector;
2833
_certificateRepositories = certificateRepositories;
34+
_failureTracker = failureTracker;
2935
}
3036

3137
public override async Task<IAcmeState> MoveNextAsync(CancellationToken cancellationToken)
@@ -47,11 +53,25 @@ public override async Task<IAcmeState> MoveNextAsync(CancellationToken cancellat
4753
cert.Thumbprint);
4854

4955
await SaveCertificateAsync(cert, cancellationToken);
56+
57+
// Record success for all domains
58+
foreach (var domain in domainNames)
59+
{
60+
_failureTracker.RecordSuccess(domain);
61+
}
5062
}
5163
catch (Exception ex)
5264
{
65+
// Record failure for all domains
66+
foreach (var domain in domainNames)
67+
{
68+
_failureTracker.RecordFailure(domain, ex);
69+
}
70+
5371
_logger.LogError(0, ex, "Failed to automatically create a certificate for {hostname}", domainNames);
54-
throw;
72+
73+
// Don't throw - return to CheckForRenewalState to implement backoff
74+
// The exception has been logged and tracked
5575
}
5676

5777
return MoveTo<CheckForRenewalState>();

src/LettuceEncrypt/Internal/AcmeStates/CheckForRenewalState.cs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,21 @@ internal class CheckForRenewalState : AcmeState
1313
private readonly IOptions<LettuceEncryptOptions> _options;
1414
private readonly CertificateSelector _selector;
1515
private readonly IClock _clock;
16+
private readonly RenewalFailureTracker _failureTracker;
1617

1718
public CheckForRenewalState(
1819
AcmeStateMachineContext context,
1920
ILogger<CheckForRenewalState> logger,
2021
IOptions<LettuceEncryptOptions> options,
2122
CertificateSelector selector,
22-
IClock clock) : base(context)
23+
IClock clock,
24+
RenewalFailureTracker failureTracker) : base(context)
2325
{
2426
_logger = logger;
2527
_options = options;
2628
_selector = selector;
2729
_clock = clock;
30+
_failureTracker = failureTracker;
2831
}
2932

3033
public override async Task<IAcmeState> MoveNextAsync(CancellationToken cancellationToken)
@@ -53,6 +56,24 @@ public override async Task<IAcmeState> MoveNextAsync(CancellationToken cancellat
5356
|| cert == null
5457
|| cert.NotAfter <= _clock.Now.DateTime + daysInAdvance.Value)
5558
{
59+
var certExpiration = cert?.NotAfter ?? _clock.Now.DateTime;
60+
61+
// Check backoff policy before attempting renewal
62+
if (!_failureTracker.ShouldAttemptRenewal(domainName, certExpiration))
63+
{
64+
_logger.LogDebug(
65+
"Skipping renewal for '{DomainName}' due to backoff policy. {FailureInfo}",
66+
domainName,
67+
_failureTracker.GetFailureInfo(domainName));
68+
continue;
69+
}
70+
71+
_logger.LogInformation(
72+
"Certificate renewal needed for '{DomainName}'. Expiration: {Expiration}, Days until expiry: {DaysUntilExpiry:F1}",
73+
domainName,
74+
certExpiration,
75+
(certExpiration - _clock.Now.DateTime).TotalDays);
76+
5677
return MoveTo<BeginCertificateCreationState>();
5778
}
5879
}

src/LettuceEncrypt/Internal/Dns01DomainValidator.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ public Dns01DomainValidator(
1919
IHostApplicationLifetime appLifetime,
2020
AcmeClient client,
2121
ILogger logger,
22-
string domainName
23-
) : base(appLifetime, client, logger, domainName)
22+
string domainName,
23+
TimeSpan validationTimeout,
24+
TimeSpan validationPollInterval
25+
) : base(appLifetime, client, logger, domainName, validationTimeout, validationPollInterval)
2426
{
2527
_dnsChallengeProvider = dnsChallengeProvider;
2628
}

src/LettuceEncrypt/Internal/DomainOwnershipValidator.cs

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,22 @@ internal abstract class DomainOwnershipValidator
1414
protected readonly ILogger _logger;
1515
protected readonly string _domainName;
1616
protected readonly TaskCompletionSource<object?> _appStarted = new();
17-
18-
protected DomainOwnershipValidator(IHostApplicationLifetime appLifetime, AcmeClient client, ILogger logger, string domainName)
17+
protected readonly TimeSpan _validationTimeout;
18+
protected readonly TimeSpan _validationPollInterval;
19+
20+
protected DomainOwnershipValidator(
21+
IHostApplicationLifetime appLifetime,
22+
AcmeClient client,
23+
ILogger logger,
24+
string domainName,
25+
TimeSpan validationTimeout,
26+
TimeSpan validationPollInterval)
1927
{
2028
_client = client;
2129
_logger = logger;
2230
_domainName = domainName;
31+
_validationTimeout = validationTimeout;
32+
_validationPollInterval = validationPollInterval;
2333

2434
appLifetime.ApplicationStarted.Register(() => _appStarted.TrySetResult(null));
2535
if (appLifetime.ApplicationStarted.IsCancellationRequested)
@@ -32,25 +42,37 @@ protected DomainOwnershipValidator(IHostApplicationLifetime appLifetime, AcmeCli
3242

3343
protected async Task WaitForChallengeResultAsync(IAuthorizationContext authorizationContext, CancellationToken cancellationToken)
3444
{
35-
var retries = 60;
36-
var delay = TimeSpan.FromSeconds(2);
45+
var startTime = DateTimeOffset.UtcNow;
46+
var attempt = 0;
3747

38-
while (retries > 0)
48+
while (true)
3949
{
40-
retries--;
50+
attempt++;
51+
var elapsed = DateTimeOffset.UtcNow - startTime;
52+
53+
if (elapsed >= _validationTimeout)
54+
{
55+
throw new TimeoutException(
56+
$"Timed out after {elapsed.TotalSeconds:F1} seconds waiting for domain ownership validation of '{_domainName}'. " +
57+
$"Made {attempt} attempts. Consider increasing ValidationTimeout in LettuceEncryptOptions.");
58+
}
4159

4260
cancellationToken.ThrowIfCancellationRequested();
4361

4462
var authorization = await _client.GetAuthorizationAsync(authorizationContext);
4563

4664
_logger.LogAcmeAction("GetAuthorization");
65+
_logger.LogTrace("Validation attempt {Attempt} for domain '{DomainName}': status = {Status}, elapsed = {Elapsed:F1}s",
66+
attempt, _domainName, authorization.Status, elapsed.TotalSeconds);
4767

4868
switch (authorization.Status)
4969
{
5070
case AuthorizationStatus.Valid:
71+
_logger.LogInformation("Domain '{DomainName}' validated successfully after {Attempts} attempts in {Elapsed:F1}s",
72+
_domainName, attempt, elapsed.TotalSeconds);
5173
return;
5274
case AuthorizationStatus.Pending:
53-
await Task.Delay(delay, cancellationToken);
75+
await Task.Delay(_validationPollInterval, cancellationToken);
5476
continue;
5577
case AuthorizationStatus.Invalid:
5678
throw InvalidAuthorizationError(authorization);
@@ -66,8 +88,6 @@ protected async Task WaitForChallengeResultAsync(IAuthorizationContext authoriza
6688
"Unexpected response from server while validating domain ownership.");
6789
}
6890
}
69-
70-
throw new TimeoutException("Timed out waiting for domain ownership validation.");
7191
}
7292

7393
private Exception InvalidAuthorizationError(Authorization authorization)

0 commit comments

Comments
 (0)