Skip to content

Commit

Permalink
core: set blocking PDB even if no unhealthy PGs appear
Browse files Browse the repository at this point in the history
When `managePodBudgets` is enabled, the Rook operator sets a blocking
PDB by considering the failure domains of the OSDs. This functionality
is implemented by `reconcilePDBsForOSDs`, and it sets the PDB only after
unhealthy PGs appear. However, no PGs become unhealthy when an OSD that
holds no PGs goes down. In this case, the PDB is never enabled.

This PR makes the operator configure the blocking PDB without waiting
for the unhealthy PGs to appear. This PR solves the above problem
because the blocking PDB is always enabled when a down OSD is detected.

Signed-off-by: Ryotaro Banno <[email protected]>
  • Loading branch information
ushitora-anqou committed Jan 5, 2024
1 parent c10fc80 commit 6def9c8
Showing 1 changed file with 4 additions and 35 deletions.
39 changes: 4 additions & 35 deletions pkg/operator/ceph/disruption/clusterdisruption/osd.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,24 +338,9 @@ func (r *ReconcileClusterDisruption) reconcilePDBsForOSDs(
}

switch {
// osd is down but pgs are active+clean
case osdDown && pgClean:
lastDrainTimeStamp, err := getLastDrainTimeStamp(pdbStateMap, drainingFailureDomainDurationKey)
if err != nil {
return reconcile.Result{}, errors.Wrapf(err, "failed to get last drain timestamp from the configmap %q", pdbStateMap.Name)
}
timeSinceOSDDown := time.Since(lastDrainTimeStamp)
if timeSinceOSDDown > 30*time.Second {
logger.Infof("osd is down in failure domain %q is down for the last %.2f minutes, but pgs are active+clean", drainingFailureDomain, timeSinceOSDDown.Minutes())
resetPDBConfig(pdbStateMap)
} else {
logger.Infof("osd is down in the failure domain %q, but pgs are active+clean. Requeuing in case pg status is not updated yet...", drainingFailureDomain)
return reconcile.Result{Requeue: true, RequeueAfter: 15 * time.Second}, nil
}

// osd is down and pgs are not healthy
case osdDown && !pgClean:
logger.Infof("osd is down in failure domain %q and pgs are not active+clean. pg health: %q", drainingFailureDomain, pgHealthMsg)
// osd is down
case osdDown:
logger.Infof("osd is down in failure domain %q. pg health: %q", drainingFailureDomain, pgHealthMsg)
currentlyDrainingFD, ok := pdbStateMap.Data[drainingFailureDomainKey]
if !ok || drainingFailureDomain != currentlyDrainingFD {
pdbStateMap.Data[drainingFailureDomainKey] = drainingFailureDomain
Expand Down Expand Up @@ -383,7 +368,7 @@ func (r *ReconcileClusterDisruption) reconcilePDBsForOSDs(
}
}

if pdbStateMap.Data[drainingFailureDomainKey] != "" && !pgClean {
if pdbStateMap.Data[drainingFailureDomainKey] != "" {
// delete default OSD pdb and create blocking OSD pdbs
err := r.handleActiveDrains(allFailureDomains, pdbStateMap.Data[drainingFailureDomainKey], failureDomainType, clusterInfo.Namespace, pgClean)
if err != nil {
Expand Down Expand Up @@ -646,22 +631,6 @@ func getPDBName(failureDomainType, failureDomainName string) string {
return k8sutil.TruncateNodeName(fmt.Sprintf("%s-%s-%s", osdPDBAppName, failureDomainType, "%s"), failureDomainName)
}

// getLastDrainTimeStamp reads the RFC 3339 timestamp stored under key in the
// data of pdbStateMap. When the key is absent or its value is empty, the
// current time is returned instead. A value that cannot be parsed yields the
// zero time together with a wrapped error.
func getLastDrainTimeStamp(pdbStateMap *corev1.ConfigMap, key string) (time.Time, error) {
	// A missing key reads as "", so one lookup covers both the absent and
	// the empty case.
	raw := pdbStateMap.Data[key]
	if raw == "" {
		// Nothing recorded: fall back to the current time.
		return time.Now(), nil
	}

	parsed, parseErr := time.Parse(time.RFC3339, raw)
	if parseErr != nil {
		return time.Time{}, errors.Wrapf(parseErr, "failed to parse timestamp %q", raw)
	}

	return parsed, nil
}

func (r *ReconcileClusterDisruption) getAllowedDisruptions(pdbName, namespace string) (int32, error) {
usePDBV1Beta1, err := k8sutil.UsePDBV1Beta1Version(r.context.ClusterdContext.Clientset)
if err != nil {
Expand Down

0 comments on commit 6def9c8

Please sign in to comment.