Skip to content

Commit

Permalink
Merge pull request rook#13511 from ushitora-anqou/set-pdb-even-if-pgs…
Browse files Browse the repository at this point in the history
…-remain-active-clean

core: set blocking PDB even if no unhealthy PGs appear
  • Loading branch information
travisn authored Feb 6, 2024
2 parents 39f4458 + 6def9c8 commit 4e0c4f6
Showing 1 changed file with 4 additions and 35 deletions.
39 changes: 4 additions & 35 deletions pkg/operator/ceph/disruption/clusterdisruption/osd.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,24 +338,9 @@ func (r *ReconcileClusterDisruption) reconcilePDBsForOSDs(
}

switch {
// osd is down but pgs are active+clean
case osdDown && pgClean:
lastDrainTimeStamp, err := getLastDrainTimeStamp(pdbStateMap, drainingFailureDomainDurationKey)
if err != nil {
return reconcile.Result{}, errors.Wrapf(err, "failed to get last drain timestamp from the configmap %q", pdbStateMap.Name)
}
timeSinceOSDDown := time.Since(lastDrainTimeStamp)
if timeSinceOSDDown > 30*time.Second {
logger.Infof("osd is down in failure domain %q is down for the last %.2f minutes, but pgs are active+clean", drainingFailureDomain, timeSinceOSDDown.Minutes())
resetPDBConfig(pdbStateMap)
} else {
logger.Infof("osd is down in the failure domain %q, but pgs are active+clean. Requeuing in case pg status is not updated yet...", drainingFailureDomain)
return reconcile.Result{Requeue: true, RequeueAfter: 15 * time.Second}, nil
}

// osd is down and pgs are not healthy
case osdDown && !pgClean:
logger.Infof("osd is down in failure domain %q and pgs are not active+clean. pg health: %q", drainingFailureDomain, pgHealthMsg)
// osd is down
case osdDown:
logger.Infof("osd is down in failure domain %q. pg health: %q", drainingFailureDomain, pgHealthMsg)
currentlyDrainingFD, ok := pdbStateMap.Data[drainingFailureDomainKey]
if !ok || drainingFailureDomain != currentlyDrainingFD {
pdbStateMap.Data[drainingFailureDomainKey] = drainingFailureDomain
Expand Down Expand Up @@ -383,7 +368,7 @@ func (r *ReconcileClusterDisruption) reconcilePDBsForOSDs(
}
}

if pdbStateMap.Data[drainingFailureDomainKey] != "" && !pgClean {
if pdbStateMap.Data[drainingFailureDomainKey] != "" {
// delete default OSD pdb and create blocking OSD pdbs
err := r.handleActiveDrains(allFailureDomains, pdbStateMap.Data[drainingFailureDomainKey], failureDomainType, clusterInfo.Namespace, pgClean)
if err != nil {
Expand Down Expand Up @@ -646,22 +631,6 @@ func getPDBName(failureDomainType, failureDomainName string) string {
return k8sutil.TruncateNodeName(fmt.Sprintf("%s-%s-%s", osdPDBAppName, failureDomainType, "%s"), failureDomainName)
}

func getLastDrainTimeStamp(pdbStateMap *corev1.ConfigMap, key string) (time.Time, error) {
var err error
var lastDrainTimeStamp time.Time
lastDrainTimeStampString, ok := pdbStateMap.Data[key]
if !ok || len(lastDrainTimeStampString) == 0 {
return time.Now(), nil
} else {
lastDrainTimeStamp, err = time.Parse(time.RFC3339, pdbStateMap.Data[key])
if err != nil {
return time.Time{}, errors.Wrapf(err, "failed to parse timestamp %q", pdbStateMap.Data[key])
}
}

return lastDrainTimeStamp, nil
}

func (r *ReconcileClusterDisruption) getAllowedDisruptions(pdbName, namespace string) (int32, error) {
usePDBV1Beta1, err := k8sutil.UsePDBV1Beta1Version(r.context.ClusterdContext.Clientset)
if err != nil {
Expand Down

0 comments on commit 4e0c4f6

Please sign in to comment.