Misc mqbc::StorageMgr: Always clear existing watchdog and start anew (#945)

kaikulimu · web-flow · commit cef108054953 · 2025-10-07T15:03:14.000-04:00
Signed-off-by: Yuan Jing Vincent Yan <yyan82@bloomberg.net>
diff --git a/src/groups/mqb/mqbc/mqbc_clusterstatemanager.cpp b/src/groups/mqb/mqbc/mqbc_clusterstatemanager.cpp
@@ -1144,7 +1144,7 @@ void ClusterStateManager::onWatchDogDispatched()
     }
 
     BALL_LOG_WARN << d_clusterData_p->identity().description()
-                  << ": Watch dog triggered because node startup healing "
+                  << ": Watchdog triggered because node startup healing "
                   << "sequence was not completed in the configured time of "
                   << d_watchDogTimeoutInterval.totalSeconds() << " seconds.";
 
diff --git a/src/groups/mqb/mqbc/mqbc_clusterstatemanager.t.cpp b/src/groups/mqb/mqbc/mqbc_clusterstatemanager.t.cpp
@@ -2425,19 +2425,19 @@ static void test23_selectLeaderFromFollower()
 
 static void test24_watchDogLeader()
 // ------------------------------------------------------------------------
-// WATCH DOG LEADER
+// WATCHDOG LEADER
 //
 // Concerns:
 //   Verify that the watchdog triggers upon timeout when the leader is
 //   healing.
 //
 // Testing:
-//   Watch dog for healing leader
+//   Watchdog for healing leader
 // ------------------------------------------------------------------------
 // ------------------------------------------------------------------------
 {
     bmqtst::TestHelper::printTestName("CLUSTER STATE MANAGER - "
-                                      "WATCH DOG LEADER");
+                                      "WATCHDOG LEADER");
 
     Tester tester(true);  // isLeader
     BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
@@ -2456,11 +2456,11 @@ static void test24_watchDogLeader()
     tester.verifyFollowerLSNRequestsSent();
     tester.clearChannels();
 
-    // 1.b.) Trigger watch dog via timeout
+    // 1.b.) Trigger watchdog via timeout
     tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
     tester.d_cluster_mp->waitForScheduler();
 
-    // Verify that the watch dog triggers re-transition to Leader Healing
+    // Verify that the watchdog triggers re-transition to Leader Healing
     // Stage 1, where we send follower LSN requests again.
     BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
                      mqbc::ClusterStateTableState::e_LDR_HEALING_STG1);
@@ -2494,11 +2494,11 @@ static void test24_watchDogLeader()
     BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
                     mqbc::ClusterStateTableState::e_LDR_HEALING_STG2);
 
-    // 2.b.) Trigger watch dog via timeout
+    // 2.b.) Trigger watchdog via timeout
     tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
     tester.d_cluster_mp->waitForScheduler();
 
-    // Verify that the watch dog triggers re-transition to Leader Healing
+    // Verify that the watchdog triggers re-transition to Leader Healing
     // Stage 1, where we send follower LSN requests again.
     BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
                      mqbc::ClusterStateTableState::e_LDR_HEALING_STG1);
@@ -2520,30 +2520,30 @@ static void test24_watchDogLeader()
     BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
                     mqbc::ClusterStateTableState::e_LDR_HEALED);
 
-    // 3.b.) Attempt to trigger watch dog via timeout, but should fail
+    // 3.b.) Attempt to trigger watchdog via timeout, but should fail
     tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
     tester.d_cluster_mp->waitForScheduler();
 
-    // Verify that watch dog did not trigger
+    // Verify that watchdog did not trigger
     BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
                      mqbc::ClusterStateTableState::e_LDR_HEALED);
 }
 
 static void test25_watchDogFollower()
 // ------------------------------------------------------------------------
-// WATCH DOG FOLLOWER
+// WATCHDOG FOLLOWER
 //
 // Concerns:
 //   Verify that the watchdog triggers upon timeout when the follower is
 //   healing.
 //
 // Testing:
-//   Watch dog for healing follower
+//   Watchdog for healing follower
 // ------------------------------------------------------------------------
 // ------------------------------------------------------------------------
 {
     bmqtst::TestHelper::printTestName("CLUSTER STATE MANAGER - "
-                                      "WATCH DOG FOLLOWER");
+                                      "WATCHDOG FOLLOWER");
 
     Tester tester(false);  // isLeader
 
diff --git a/src/groups/mqb/mqbc/mqbc_storagemanager.cpp b/src/groups/mqb/mqbc/mqbc_storagemanager.cpp
@@ -176,7 +176,7 @@ void StorageManager::onWatchDogDispatched(int partitionId)
     BMQTSK_ALARMLOG_ALARM("RECOVERY")
         << d_clusterData_p->identity().description() << " Partition ["
         << partitionId
-        << "]: " << "Watch dog triggered because partition startup healing "
+        << "]: " << "Watchdog triggered because partition startup healing "
         << "sequence was not completed in the configured time of "
         << d_watchDogTimeoutInterval.totalSeconds() << " seconds."
         << BMQTSK_ALARMLOG_END;
@@ -1154,14 +1154,8 @@ void StorageManager::do_startWatchDog(const PartitionFSMArgsSp& args)
 
     const int partitionId = eventDataVec[0].partitionId();
 
-    if (static_cast<const bdlmt::EventSchedulerEventHandle::Event*>(
-            d_watchDogEventHandles[partitionId]) != 0) {
-        BALL_LOG_WARN << d_clusterData_p->identity().description()
-                      << " Partition [" << partitionId << "]: "
-                      << "Not starting watchdog since it has already been "
-                      << "started.";
-        return;  // RETURN
-    }
+    // Clear any existing watchdog before starting the timer anew.
+    d_watchDogEventHandles[partitionId].release();
 
     d_clusterData_p->scheduler().scheduleEvent(
         &d_watchDogEventHandles[partitionId],
@@ -1191,8 +1185,6 @@ void StorageManager::do_stopWatchDog(const PartitionFSMArgsSp& args)
                        << " Partition [" << partitionId << "]: "
                        << "Failed to cancel WatchDog, rc: " << rc;
     }
-
-    d_watchDogEventHandles[partitionId].release();
 }
 
 void StorageManager::do_openRecoveryFileSet(const PartitionFSMArgsSp& args)
diff --git a/src/groups/mqb/mqbc/mqbc_storagemanager.h b/src/groups/mqb/mqbc/mqbc_storagemanager.h
@@ -195,14 +195,14 @@ class StorageManager BSLS_KEYWORD_FINAL
     /// Whether this StorageMgr has started.
     bsls::AtomicBool d_isStarted;
 
-    /// List of event handles for the watch dog, indexed by partitionId.
+    /// List of event handles for the watchdog, indexed by partitionId.
     ///
     /// THREAD: Except during the ctor, the i-th index of this data member
     ///         **must** be accessed in the associated Queue dispatcher thread
     ///         for the i-th partitionId.
     EventHandles d_watchDogEventHandles;
 
-    /// Timeout interval for the watch dog.
+    /// Timeout interval for the watchdog.
     const bsls::TimeInterval d_watchDogTimeoutInterval;
 
     /// Flag to denote if a low disk space warning was issued.  This flag is
@@ -403,13 +403,13 @@ class StorageManager BSLS_KEYWORD_FINAL
     void recoveredQueuesCb(int                    partitionId,
                            const QueueKeyInfoMap& queueKeyInfoMap);
 
-    /// Process the watch dog trigger event for the specified `partitionId`,
+    /// Process the watchdog trigger event for the specified `partitionId`,
     /// indicating unhealthiness in the Partition FSM.
     ///
     /// THREAD: Executed by the scheduler thread.
     void onWatchDog(int partitionId);
 
-    /// Process the watch dog trigger event for the specified `partitionId`,
+    /// Process the watchdog trigger event for the specified `partitionId`,
     /// indicating unhealthiness in the Partition FSM.
     ///
     /// THREAD: This method is invoked in the associated cluster's

Original file line number	Diff line number	Diff line change
`@@ -1144,7 +1144,7 @@ void ClusterStateManager::onWatchDogDispatched()`
`1144`	`1144`	`}`
`1145`	`1145`
`1146`	`1146`	`BALL_LOG_WARN << d_clusterData_p->identity().description()`
`1147`		`- << ": Watch dog triggered because node startup healing "`
	`1147`	`+ << ": Watchdog triggered because node startup healing "`
`1148`	`1148`	`<< "sequence was not completed in the configured time of "`
`1149`	`1149`	`<< d_watchDogTimeoutInterval.totalSeconds() << " seconds.";`
`1150`	`1150`