From cefc2d5e33f9f3ae6f164eca2736e41ddd9d283c Mon Sep 17 00:00:00 2001
From: Blaine Gardner
Date: Thu, 3 Oct 2024 08:37:39 -0600
Subject: [PATCH 1/2] mds: fix liveness probe timeout

When the MDS liveness probe times out, the timeout should not fail the
probe. If the cluster has a network partition or another issue that
causes the Ceph mon cluster to become unstable, `ceph ...` commands can
hang and cause a timeout. In this case, the MDS should not be
restarted, so as not to cause cascading cluster disruption.

Signed-off-by: Blaine Gardner
(cherry picked from commit ad1bae9e1bcd9646013ecec79c3b67f40b2a16d0)
---
 pkg/operator/ceph/file/mds/livenessprobe.go | 5 ++---
 pkg/operator/ceph/file/mds/livenessprobe.sh | 3 ++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pkg/operator/ceph/file/mds/livenessprobe.go b/pkg/operator/ceph/file/mds/livenessprobe.go
index beec3882ac5d..6f318d6bab89 100644
--- a/pkg/operator/ceph/file/mds/livenessprobe.go
+++ b/pkg/operator/ceph/file/mds/livenessprobe.go
@@ -3,7 +3,6 @@ package mds
 import (
 	"bytes"
 	_ "embed"
-	"fmt"
 	"html/template"
 
 	"github.com/pkg/errors"
@@ -38,6 +37,7 @@ type mdsLivenessProbeConfig struct {
 	MdsId          string
 	FilesystemName string
 	Keyring        string
+	CmdTimeout     int32
 }
 
 func renderProbe(mdsLivenessProbeConfigValue mdsLivenessProbeConfig) (string, error) {
@@ -64,6 +64,7 @@ func generateMDSLivenessProbeExecDaemon(daemonID, filesystemName, keyring string
 		MdsId:          daemonID,
 		FilesystemName: filesystemName,
 		Keyring:        keyring,
+		CmdTimeout:     mdsCmdTimeout,
 	}
 
 	mdsLivenessProbeCmd, err := renderProbe(mdsLivenessProbeConfigValue)
@@ -75,8 +76,6 @@ func generateMDSLivenessProbeExecDaemon(daemonID, filesystemName, keyring string
 		ProbeHandler: v1.ProbeHandler{
 			Exec: &v1.ExecAction{
 				Command: []string{
-					"timeout",
-					fmt.Sprintf("%d", mdsCmdTimeout),
 					"sh",
 					"-c",
 					mdsLivenessProbeCmd,
diff --git a/pkg/operator/ceph/file/mds/livenessprobe.sh b/pkg/operator/ceph/file/mds/livenessprobe.sh
index 97ad139bdbeb..5cddfac15f32 100644
--- a/pkg/operator/ceph/file/mds/livenessprobe.sh
+++ b/pkg/operator/ceph/file/mds/livenessprobe.sh
@@ -5,8 +5,9 @@
 MDS_ID="{{ .MdsId }}"
 FILESYSTEM_NAME="{{ .FilesystemName }}"
 KEYRING="{{ .Keyring }}"
+CMD_TIMEOUT="{{ .CmdTimeout }}"
 
-outp="$(ceph fs dump --mon-host="$ROOK_CEPH_MON_HOST" --mon-initial-members="$ROOK_CEPH_MON_INITIAL_MEMBERS" --keyring "$KEYRING" --format json)"
+outp="$(ceph fs dump --mon-host="$ROOK_CEPH_MON_HOST" --mon-initial-members="$ROOK_CEPH_MON_INITIAL_MEMBERS" --keyring "$KEYRING" --connect-timeout="$CMD_TIMEOUT" --format json)"
 rc=$?
 if [ $rc -ne 0 ]; then
     echo "ceph MDS dump check failed with the following output:"

From b655f4dcf5764d71ad7d4c0f8ae1073081e38c4c Mon Sep 17 00:00:00 2001
From: Travis Nielsen
Date: Fri, 4 Oct 2024 14:23:05 -0600
Subject: [PATCH 2/2] mon: do not remove extra mon in middle of failover

If a mon failover is in progress, ensure the removal of an extra mon
deployment is skipped, since that code path only has one mon in the
list: the mon that was just newly started. The extra mon cleanup was
erroneously removing a random mon in that case, followed immediately by
the mon failover completing and removing the expected failed mon,
potentially causing mon quorum loss.
Signed-off-by: Travis Nielsen
(cherry picked from commit e2cadab6fe10e7d2c66a9f2981737477cc9302db)
---
 pkg/operator/ceph/cluster/mon/mon.go      |  9 +++++++++
 pkg/operator/ceph/cluster/mon/mon_test.go | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/pkg/operator/ceph/cluster/mon/mon.go b/pkg/operator/ceph/cluster/mon/mon.go
index 17258a5322d5..8375163e8d5b 100644
--- a/pkg/operator/ceph/cluster/mon/mon.go
+++ b/pkg/operator/ceph/cluster/mon/mon.go
@@ -1068,7 +1068,16 @@ func (c *Cluster) startDeployments(mons []*monConfig, requireAllInQuorum bool) e
 }
 
 func (c *Cluster) checkForExtraMonResources(mons []*monConfig, deployments []apps.Deployment) string {
+	// If there are fewer mon deployments than the desired count, no need to remove an extra.
 	if len(deployments) <= c.spec.Mon.Count || len(deployments) <= len(mons) {
+		logger.Debug("no extra mon deployments to remove")
+		return ""
+	}
+	// If there are fewer mons in the list than expected, either new mons are being created for
+	// a new cluster, or a mon failover is in progress and the list of mons only
+	// includes the single mon that was just started
+	if len(mons) < c.spec.Mon.Count {
+		logger.Debug("new cluster or mon failover in progress, not checking for extra mon deployments")
 		return ""
 	}
 
diff --git a/pkg/operator/ceph/cluster/mon/mon_test.go b/pkg/operator/ceph/cluster/mon/mon_test.go
index 9d27be4014c9..1a4d1f3d4a5a 100644
--- a/pkg/operator/ceph/cluster/mon/mon_test.go
+++ b/pkg/operator/ceph/cluster/mon/mon_test.go
@@ -718,6 +718,24 @@ func TestRemoveExtraMonDeployments(t *testing.T) {
 	c.spec.Mon.Count = 3
 	removed = c.checkForExtraMonResources(mons, deployments)
 	assert.Equal(t, "", removed)
+
+	// Do not remove a mon during failover when only a single mon is in the list, even if extra deployments exist
+	mons = []*monConfig{
+		{ResourceName: "rook-ceph-mon-d", DaemonName: "d"},
+	}
+	deployments = append(deployments, apps.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "rook-ceph-mon-c",
+			Labels: map[string]string{"ceph_daemon_id": "c"},
+		}})
+	deployments = append(deployments, apps.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "rook-ceph-mon-d",
+			Labels: map[string]string{"ceph_daemon_id": "d"},
+		}})
+	c.spec.Mon.Count = 3
+	removed = c.checkForExtraMonResources(mons, deployments)
+	assert.Equal(t, "", removed)
 }
 
 func TestStretchMonVolumeClaimTemplate(t *testing.T) {
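
Note for reviewers (not part of the patches above): a minimal sketch of the exec probe
shape that generateMDSLivenessProbeExecDaemon produces after the first patch. It assumes
k8s.io/api/core/v1; the rendered script body and the "15" timeout value are placeholders,
not the real output of renderProbe.

    package main

    import (
    	"fmt"

    	v1 "k8s.io/api/core/v1"
    )

    func main() {
    	// Hypothetical rendered script body; the real one comes from livenessprobe.sh
    	// via renderProbe, with CmdTimeout filled in from mdsCmdTimeout.
    	renderedScript := `CMD_TIMEOUT="15"
    ceph fs dump --connect-timeout="$CMD_TIMEOUT" --format json`

    	probe := &v1.Probe{
    		ProbeHandler: v1.ProbeHandler{
    			Exec: &v1.ExecAction{
    				// No external `timeout <N>` wrapper around the script: a hung
    				// `ceph fs dump` is now bounded by --connect-timeout inside the
    				// script rather than by a wrapper whose non-zero exit would
    				// fail the probe and restart the MDS.
    				Command: []string{"sh", "-c", renderedScript},
    			},
    		},
    	}
    	fmt.Printf("%+v\n", probe)
    }

The design point, per the first commit message, is that the mon connection timeout is
enforced by the ceph CLI itself, so the probe outcome is decided by the script's own
logic instead of by an external timeout process.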