Skip to content

Commit 9c8e212

Browse files
committed
Fix BackupRepositories becoming stale when BSL config changes while Velero is not running
- Add BSLConfigHashAnnotation to track BSL configuration changes - Implement calculateBSLConfigHash() for deterministic hash generation - Add validateBackupRepositoriesOnStartup() to check repositories on startup - Trigger validation on first reconciliation instead of using time.Sleep - Use existing repoReady/repoNotReady helper functions consistently - Add comprehensive test coverage for new functionality Fixes #8279 Signed-off-by: Tapio Kaovila <tkaovila@redhat.com>
1 parent 3be76da commit 9c8e212

File tree

3 files changed

+698
-15
lines changed

3 files changed

+698
-15
lines changed

pkg/apis/velero/v1/labels_annotations.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ const (
5454
// RepositoryTypeLabel is the label key used to identify the type of a repository
5555
RepositoryTypeLabel = "velero.io/repository-type"
5656

57+
// BSLConfigHashAnnotation is the annotation key for storing BSL config hash on BackupRepository
58+
// to detect configuration changes that occurred while Velero was not running
59+
BSLConfigHashAnnotation = "velero.io/bsl-config-hash"
60+
5761
// DataUploadLabel is the label key used to identify the dataupload for snapshot backup pod
5862
DataUploadLabel = "velero.io/data-upload"
5963

pkg/controller/backup_repository_controller.go

Lines changed: 191 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@ package controller
1919
import (
2020
"bytes"
2121
"context"
22+
"crypto/sha256"
23+
"encoding/hex"
2224
"encoding/json"
2325
"fmt"
2426
"reflect"
2527
"slices"
28+
"sort"
29+
"sync"
2630
"time"
2731

2832
"github.com/petar/GoLLRB/llrb"
@@ -66,6 +70,10 @@ type BackupRepoReconciler struct {
6670
repoMaintenanceConfig string
6771
logLevel logrus.Level
6872
logFormat *logging.FormatFlag
73+
74+
// Startup validation fields
75+
startupValidationDone bool
76+
startupMutex sync.Mutex
6977
}
7078

7179
func NewBackupRepoReconciler(
@@ -80,16 +88,18 @@ func NewBackupRepoReconciler(
8088
logFormat *logging.FormatFlag,
8189
) *BackupRepoReconciler {
8290
c := &BackupRepoReconciler{
83-
client,
84-
namespace,
85-
logger,
86-
clocks.RealClock{},
87-
maintenanceFrequency,
88-
backupRepoConfig,
89-
repositoryManager,
90-
repoMaintenanceConfig,
91-
logLevel,
92-
logFormat,
91+
Client: client,
92+
namespace: namespace,
93+
logger: logger,
94+
clock: clocks.RealClock{},
95+
maintenanceFrequency: maintenanceFrequency,
96+
backupRepoConfig: backupRepoConfig,
97+
repositoryManager: repositoryManager,
98+
repoMaintenanceConfig: repoMaintenanceConfig,
99+
logLevel: logLevel,
100+
logFormat: logFormat,
101+
startupValidationDone: false,
102+
startupMutex: sync.Mutex{},
93103
}
94104

95105
return c
@@ -149,7 +159,13 @@ func (r *BackupRepoReconciler) invalidateBackupReposForBSL(ctx context.Context,
149159
requests := []reconcile.Request{}
150160
for i := range list.Items {
151161
r.logger.WithField("BSL", bsl.Name).Infof("Invalidating Backup Repository %s", list.Items[i].Name)
152-
if err := r.patchBackupRepository(context.Background(), &list.Items[i], repoNotReady("re-establish on BSL change, create or delete")); err != nil {
162+
if err := r.patchBackupRepository(context.Background(), &list.Items[i], func(rr *velerov1api.BackupRepository) {
163+
repoNotReady("re-establish on BSL change, create or delete")(rr)
164+
// Clear the hash annotation so it will be recalculated
165+
if rr.Annotations != nil {
166+
delete(rr.Annotations, velerov1api.BSLConfigHashAnnotation)
167+
}
168+
}); err != nil {
153169
r.logger.WithField("BSL", bsl.Name).WithError(err).Errorf("fail to patch BackupRepository %s", list.Items[i].Name)
154170
continue
155171
}
@@ -159,6 +175,128 @@ func (r *BackupRepoReconciler) invalidateBackupReposForBSL(ctx context.Context,
159175
return requests
160176
}
161177

178+
// calculateBSLConfigHash generates a deterministic hash of BSL configuration fields
179+
// that affect repository connectivity
180+
func calculateBSLConfigHash(bsl *velerov1api.BackupStorageLocation) string {
181+
if bsl == nil {
182+
return ""
183+
}
184+
185+
// Create a struct with only the fields that affect repository connectivity
186+
configData := struct {
187+
Bucket string
188+
Prefix string
189+
CACert []byte
190+
Config map[string]string
191+
}{}
192+
193+
if bsl.Spec.StorageType.ObjectStorage != nil {
194+
configData.Bucket = bsl.Spec.StorageType.ObjectStorage.Bucket
195+
configData.Prefix = bsl.Spec.StorageType.ObjectStorage.Prefix
196+
configData.CACert = bsl.Spec.StorageType.ObjectStorage.CACert
197+
}
198+
199+
// Sort config keys for deterministic hash
200+
if bsl.Spec.Config != nil {
201+
configData.Config = make(map[string]string)
202+
keys := make([]string, 0, len(bsl.Spec.Config))
203+
for k := range bsl.Spec.Config {
204+
keys = append(keys, k)
205+
}
206+
sort.Strings(keys)
207+
for _, k := range keys {
208+
configData.Config[k] = bsl.Spec.Config[k]
209+
}
210+
}
211+
212+
// Calculate hash
213+
data, err := json.Marshal(configData)
214+
if err != nil {
215+
// This should never happen with our simple struct, but handle it anyway
216+
// Return empty string which will trigger repository re-validation
217+
return ""
218+
}
219+
hash := sha256.Sum256(data)
220+
return hex.EncodeToString(hash[:])
221+
}
222+
223+
// validateBackupRepositoriesOnStartup checks all BackupRepositories on controller startup
224+
// and invalidates any that have stale BSL configuration
225+
func (r *BackupRepoReconciler) validateBackupRepositoriesOnStartup(ctx context.Context) error {
226+
r.logger.Info("Validating backup repositories on startup")
227+
228+
// List all BackupRepositories
229+
repoList := &velerov1api.BackupRepositoryList{}
230+
if err := r.List(ctx, repoList); err != nil {
231+
return errors.Wrap(err, "failed to list backup repositories")
232+
}
233+
234+
// List all BSLs for hash calculation
235+
bslList := &velerov1api.BackupStorageLocationList{}
236+
if err := r.List(ctx, bslList); err != nil {
237+
return errors.Wrap(err, "failed to list backup storage locations")
238+
}
239+
240+
// Create map of BSL name to BSL for quick lookup
241+
bslMap := make(map[string]*velerov1api.BackupStorageLocation)
242+
for i := range bslList.Items {
243+
bslMap[bslList.Items[i].Name] = &bslList.Items[i]
244+
}
245+
246+
invalidatedCount := 0
247+
for i := range repoList.Items {
248+
repo := &repoList.Items[i]
249+
250+
// Skip if repository is new (not yet initialized)
251+
if repo.Status.Phase == "" || repo.Status.Phase == velerov1api.BackupRepositoryPhaseNew {
252+
continue
253+
}
254+
255+
// Get the associated BSL
256+
bsl, exists := bslMap[repo.Spec.BackupStorageLocation]
257+
if !exists {
258+
r.logger.WithField("repo", repo.Name).Warn("BSL not found for repository, skipping validation")
259+
continue
260+
}
261+
262+
// Calculate current BSL config hash
263+
currentHash := calculateBSLConfigHash(bsl)
264+
265+
// Get stored hash from annotation
266+
storedHash := ""
267+
if repo.Annotations != nil {
268+
storedHash = repo.Annotations[velerov1api.BSLConfigHashAnnotation]
269+
}
270+
271+
// If hashes don't match or annotation is missing, invalidate the repository
272+
if storedHash == "" || storedHash != currentHash {
273+
r.logger.WithFields(logrus.Fields{
274+
"repo": repo.Name,
275+
"bsl": bsl.Name,
276+
"storedHash": storedHash,
277+
"currentHash": currentHash,
278+
}).Info("Invalidating backup repository due to BSL configuration change detected on startup")
279+
280+
if err := r.patchBackupRepository(ctx, repo, func(rr *velerov1api.BackupRepository) {
281+
repoNotReady("BSL configuration changed while Velero was not running, re-establishing connection")(rr)
282+
283+
// Update the hash annotation
284+
if rr.Annotations == nil {
285+
rr.Annotations = make(map[string]string)
286+
}
287+
rr.Annotations[velerov1api.BSLConfigHashAnnotation] = currentHash
288+
}); err != nil {
289+
r.logger.WithField("repo", repo.Name).WithError(err).Error("Failed to invalidate backup repository")
290+
continue
291+
}
292+
invalidatedCount++
293+
}
294+
}
295+
296+
r.logger.WithField("invalidatedCount", invalidatedCount).Info("Completed backup repository validation on startup")
297+
return nil
298+
}
299+
162300
// needInvalidBackupRepo returns true if the BSL's storage type, bucket, prefix, CACert, or config has changed
163301
func (r *BackupRepoReconciler) needInvalidBackupRepo(oldObj client.Object, newObj client.Object) bool {
164302
oldBSL := oldObj.(*velerov1api.BackupStorageLocation)
@@ -212,6 +350,23 @@ func (r *BackupRepoReconciler) needInvalidBackupRepo(oldObj client.Object, newOb
212350
}
213351

214352
func (r *BackupRepoReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
353+
// Perform one-time startup validation
354+
r.startupMutex.Lock()
355+
if !r.startupValidationDone {
356+
r.startupValidationDone = true
357+
r.startupMutex.Unlock()
358+
359+
// Run validation asynchronously to avoid blocking this reconciliation
360+
go func() {
361+
r.logger.Info("Running startup validation for backup repositories")
362+
if err := r.validateBackupRepositoriesOnStartup(context.Background()); err != nil {
363+
r.logger.WithError(err).Error("Failed to validate backup repositories on startup")
364+
}
365+
}()
366+
} else {
367+
r.startupMutex.Unlock()
368+
}
369+
215370
log := r.logger.WithField("backupRepo", req.String())
216371
backupRepo := &velerov1api.BackupRepository{}
217372
if err := r.Get(ctx, req.NamespacedName, backupRepo); err != nil {
@@ -315,8 +470,7 @@ func (r *BackupRepoReconciler) initializeRepo(ctx context.Context, req *velerov1
315470
repoIdentifier, err = r.getIdentifierByBSL(bsl, req)
316471
if err != nil {
317472
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
318-
rr.Status.Message = err.Error()
319-
rr.Status.Phase = velerov1api.BackupRepositoryPhaseNotReady
473+
repoNotReady(err.Error())(rr)
320474

321475
if rr.Spec.MaintenanceFrequency.Duration <= 0 {
322476
rr.Spec.MaintenanceFrequency = metav1.Duration{Duration: r.getRepositoryMaintenanceFrequency(req)}
@@ -332,6 +486,9 @@ func (r *BackupRepoReconciler) initializeRepo(ctx context.Context, req *velerov1
332486
log.Infof("Init repo with config %v", config)
333487
}
334488

489+
// Calculate BSL config hash for tracking changes
490+
bslConfigHash := calculateBSLConfigHash(bsl)
491+
335492
// defaulting - if the patch fails, return an error so the item is returned to the queue
336493
if err := r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
337494
// Only set ResticIdentifier for restic repositories
@@ -344,6 +501,12 @@ func (r *BackupRepoReconciler) initializeRepo(ctx context.Context, req *velerov1
344501
}
345502

346503
rr.Spec.RepositoryConfig = config
504+
505+
// Add BSL config hash annotation
506+
if rr.Annotations == nil {
507+
rr.Annotations = make(map[string]string)
508+
}
509+
rr.Annotations[velerov1api.BSLConfigHashAnnotation] = bslConfigHash
347510
}); err != nil {
348511
return err
349512
}
@@ -353,7 +516,7 @@ func (r *BackupRepoReconciler) initializeRepo(ctx context.Context, req *velerov1
353516
}
354517

355518
return r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
356-
rr.Status.Phase = velerov1api.BackupRepositoryPhaseReady
519+
repoReady()(rr)
357520
rr.Status.LastMaintenanceTime = &metav1.Time{Time: time.Now()}
358521
})
359522
}
@@ -539,6 +702,9 @@ func dueForMaintenance(req *velerov1api.BackupRepository, now time.Time) bool {
539702
func (r *BackupRepoReconciler) checkNotReadyRepo(ctx context.Context, req *velerov1api.BackupRepository, bsl *velerov1api.BackupStorageLocation, log logrus.FieldLogger) (bool, error) {
540703
log.Info("Checking backup repository for readiness")
541704

705+
// Calculate current BSL config hash
706+
bslConfigHash := calculateBSLConfigHash(bsl)
707+
542708
// Only check and update restic identifier for restic repositories
543709
if req.Spec.RepositoryType == "" || req.Spec.RepositoryType == velerov1api.BackupRepositoryTypeRestic {
544710
repoIdentifier, err := r.getIdentifierByBSL(bsl, req)
@@ -560,7 +726,17 @@ func (r *BackupRepoReconciler) checkNotReadyRepo(ctx context.Context, req *veler
560726
if err := ensureRepo(req, r.repositoryManager); err != nil {
561727
return false, r.patchBackupRepository(ctx, req, repoNotReady(err.Error()))
562728
}
563-
err := r.patchBackupRepository(ctx, req, repoReady())
729+
730+
// Update status to ready and update BSL config hash
731+
err := r.patchBackupRepository(ctx, req, func(rr *velerov1api.BackupRepository) {
732+
repoReady()(rr)
733+
734+
// Update BSL config hash annotation
735+
if rr.Annotations == nil {
736+
rr.Annotations = make(map[string]string)
737+
}
738+
rr.Annotations[velerov1api.BSLConfigHashAnnotation] = bslConfigHash
739+
})
564740
if err != nil {
565741
return false, err
566742
}

0 commit comments

Comments
 (0)