Skip to content

Commit

Permalink
drpc: set ProgressionWaitOnUserToCleanUp along with placement update
Browse files Browse the repository at this point in the history
For discovered apps, we want the user to perform the cleanup of the
workload. We should advertise that progression to them at the same time
as we ask OCM/ACM to perform the cleanup.

Signed-off-by: Raghavendra Talur <[email protected]>
  • Loading branch information
raghavendra-talur committed Nov 11, 2024
1 parent e0e3ab3 commit c6b8f80
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 15 deletions.
52 changes: 38 additions & 14 deletions internal/controller/drplacementcontrol.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ func (d *DRPCInstance) RunFailover() (bool, error) {
return !done, nil
}

return d.ensureActionCompleted(failoverCluster)
return d.ensureFailoverActionCompleted(failoverCluster)
} else if yes, err := d.mwExistsAndPlacementUpdated(failoverCluster); yes || err != nil {
// We have to wait for the VRG to appear on the failoverCluster or
// in case of an error, try again later
Expand Down Expand Up @@ -863,7 +863,7 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
metav1.ConditionTrue, string(d.instance.Status.Phase), "Completed")

return d.ensureActionCompleted(preferredCluster)
return d.ensureRelocateActionCompleted(preferredCluster)
}

d.setStatusInitiating()
Expand Down Expand Up @@ -896,6 +896,29 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
return d.relocate(preferredCluster, preferredClusterNamespace, rmn.Relocating)
}

// ensureRelocateActionCompleted advances the DRPC progression to
// CleaningUp and then runs the common post-action completion steps
// (VRG checks, cleanup, VolSync setup) against srcCluster.
//
// Note: unlike the failover path, this does not switch to
// ProgressionWaitOnUserToCleanUp for discovered apps; the relocate flow
// advertises that earlier, in quiesceAndRunFinalSync — presumably the
// user has already been asked to clean up by this point (review note:
// confirm against the relocate state machine).
func (d *DRPCInstance) ensureRelocateActionCompleted(srcCluster string) (bool, error) {
	d.setProgression(rmn.ProgressionCleaningUp)

	return d.ensureActionCompleted(srcCluster)
}

// ensureFailoverActionCompleted publishes the cleanup progression for the
// cluster we failed over from, then runs the common post-action completion
// steps (VRG checks, cleanup, VolSync setup) against srcCluster.
//
// After a failover the workload must be removed from the previous cluster.
// For ACM-managed apps that removal happens automatically once the
// placement points at the target cluster, so we report CleaningUp. For
// discovered apps (identified by a non-empty ProtectedNamespaces list) the
// user has to remove the workload themselves, so we report
// WaitOnUserToCleanUp instead.
func (d *DRPCInstance) ensureFailoverActionCompleted(srcCluster string) (bool, error) {
	discoveredApp := d.instance.Spec.ProtectedNamespaces != nil &&
		len(*d.instance.Spec.ProtectedNamespaces) > 0

	if discoveredApp {
		d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
	} else {
		d.setProgression(rmn.ProgressionCleaningUp)
	}

	return d.ensureActionCompleted(srcCluster)
}

func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) {
const done = true

Expand All @@ -909,8 +932,6 @@ func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) {
return !done, err
}

d.setProgression(rmn.ProgressionCleaningUp)

// Cleanup and setup VolSync if enabled
err = d.ensureCleanupAndVolSyncReplicationSetup(srcCluster)
if err != nil {
Expand Down Expand Up @@ -974,8 +995,19 @@ func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error)
addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
d.getConditionStatusForTypeAvailable(), string(d.instance.Status.Phase), "Starting quiescing for relocation")

// clear current user PlacementRule's decision
d.setProgression(rmn.ProgressionClearingPlacement)
// We are going to clear the placement, this is when ACM will start
// deleting the workloads from the current cluster. In case of
// discovered apps, we have to let the user know that they need to
// clean up the apps from the current cluster. So set the progression
// to wait on user to clean up. For non-discovered apps, we can set the
// progression to clearing placement.
if d.instance.Spec.ProtectedNamespaces != nil &&
len(*d.instance.Spec.ProtectedNamespaces) > 0 {
d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
} else {
// clear current user PlacementRule's decision
d.setProgression(rmn.ProgressionClearingPlacement)
}

err := d.clearUserPlacementRuleStatus()
if err != nil {
Expand Down Expand Up @@ -2048,10 +2080,6 @@ func (d *DRPCInstance) ensureVRGManifestWorkOnClusterDeleted(clusterName string)

d.log.Info("Request not complete yet", "cluster", clusterName)

if d.instance.Spec.ProtectedNamespaces != nil && len(*d.instance.Spec.ProtectedNamespaces) > 0 {
d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
}

// IF we get here, either the VRG has not transitioned to secondary (yet) or delete didn't succeed. In either cases,
// we need to make sure that the VRG object is deleted. IOW, we still have to wait
return !done, nil
Expand All @@ -2067,10 +2095,6 @@ func (d *DRPCInstance) ensureVRGIsSecondaryEverywhere(clusterToSkip string) bool
continue
}

if d.instance.Spec.ProtectedNamespaces != nil && len(*d.instance.Spec.ProtectedNamespaces) > 0 {
d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
}

if !d.ensureVRGIsSecondaryOnCluster(clusterName) {
d.log.Info("Still waiting for VRG to transition to secondary", "cluster", clusterName)

Expand Down
12 changes: 12 additions & 0 deletions internal/controller/volsync/vshandler.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ type VSHandler struct {
destinationCopyMethod volsyncv1alpha1.CopyMethodType
volumeSnapshotClassList *snapv1.VolumeSnapshotClassList
vrgInAdminNamespace bool
workloadStatus string
}

func NewVSHandler(ctx context.Context, client client.Client, log logr.Logger, owner metav1.Object,
Expand All @@ -98,6 +99,10 @@ func NewVSHandler(ctx context.Context, client client.Client, log logr.Logger, ow
return vsHandler
}

// GetWorkloadStatus returns the workload status recorded by this handler.
// It is set to "inactive" once VolSync has verified the workload PVCs are
// not in use (see validatePVCBeforeRS and PrecreateDestPVCIfEnabled);
// until then it is the empty string.
func (v *VSHandler) GetWorkloadStatus() string {
	return v.workloadStatus
}

// returns replication destination only if create/update is successful and the RD is considered available.
// Callers should assume getting a nil replication destination back means they should retry/requeue.
//
Expand Down Expand Up @@ -364,6 +369,8 @@ func (v *VSHandler) validatePVCBeforeRS(rsSpec ramendrv1alpha1.VolSyncReplicatio
return false, nil
}

v.workloadStatus = "inactive"

return true, nil // Good to proceed - PVC is not in use, not mounted to node (or does not exist-should not happen)
}

Expand Down Expand Up @@ -1634,6 +1641,8 @@ func (v *VSHandler) IsRDDataProtected(pvcName, pvcNamespace string) (bool, error
func (v *VSHandler) PrecreateDestPVCIfEnabled(rdSpec ramendrv1alpha1.VolSyncReplicationDestinationSpec,
) (*string, error) {
if !v.IsCopyMethodDirect() {
// TODO:
// We need to check the workload status even in other cases.
v.log.Info("Using default copyMethod of Snapshot")

return nil, nil // use default copyMethod
Expand All @@ -1659,6 +1668,9 @@ func (v *VSHandler) PrecreateDestPVCIfEnabled(rdSpec ramendrv1alpha1.VolSyncRepl
util.ProtectedPVCNamespacedName(rdSpec.ProtectedPVC))
}

// At this point, we are sure that there is no active workload
v.workloadStatus = "inactive"

v.log.Info(fmt.Sprintf("Using App PVC %s for syncing directly to it",
util.ProtectedPVCNamespacedName(rdSpec.ProtectedPVC)))
// Using the application PVC for syncing from source to destination and save a snapshot
Expand Down
4 changes: 3 additions & 1 deletion internal/controller/volumereplicationgroup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1238,7 +1238,9 @@ func (v *VRGInstance) updateStatusState() {
// VRG is exclusively using volsync
if v.instance.Spec.ReplicationState == ramendrv1alpha1.Secondary &&
len(v.instance.Spec.VolSync.RDSpec) > 0 {
v.instance.Status.State = ramendrv1alpha1.SecondaryState
if v.volSyncHandler.GetWorkloadStatus() == "inactive" {
v.instance.Status.State = ramendrv1alpha1.SecondaryState
}

return
}
Expand Down
3 changes: 3 additions & 0 deletions internal/controller/vrg_volsync.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ func (v *VRGInstance) reconcileVolSyncAsSecondary() bool {
// If we are secondary, and RDSpec is not set, then we don't want to have any PVC
// flagged as a VolSync PVC.
if v.instance.Spec.VolSync.RDSpec == nil {
// This might be a case where we lose the RDSpec temporarily,
// so we don't know if workload status is truly inactive.
idx := 0

for _, protectedPVC := range v.instance.Status.ProtectedPVCs {
Expand All @@ -250,6 +252,7 @@ func (v *VRGInstance) reconcileRDSpecForDeletionOrReplication() bool {
requeue := false
rdinCGs := []ramendrv1alpha1.VolSyncReplicationDestinationSpec{}

// TODO: Set the workload status in CG code path later
for _, rdSpec := range v.instance.Spec.VolSync.RDSpec {
cg, ok := rdSpec.ProtectedPVC.Labels[ConsistencyGroupLabel]
if ok && util.IsCGEnabled(v.instance.Annotations) {
Expand Down

0 comments on commit c6b8f80

Please sign in to comment.