Skip to content

Commit cef1080

Browse files
authored
Misc mqbc::StorageMgr: Always clear existing watchdog and start anew (#945)
Signed-off-by: Yuan Jing Vincent Yan <[email protected]>
1 parent 4b54fb1 commit cef1080

File tree

4 files changed: 20 additions and 28 deletions

src/groups/mqb/mqbc/mqbc_clusterstatemanager.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1144,7 +1144,7 @@ void ClusterStateManager::onWatchDogDispatched()
11441144
}
11451145

11461146
BALL_LOG_WARN << d_clusterData_p->identity().description()
1147-
<< ": Watch dog triggered because node startup healing "
1147+
<< ": Watchdog triggered because node startup healing "
11481148
<< "sequence was not completed in the configured time of "
11491149
<< d_watchDogTimeoutInterval.totalSeconds() << " seconds.";
11501150

src/groups/mqb/mqbc/mqbc_clusterstatemanager.t.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2425,19 +2425,19 @@ static void test23_selectLeaderFromFollower()
24252425

24262426
static void test24_watchDogLeader()
24272427
// ------------------------------------------------------------------------
2428-
// WATCH DOG LEADER
2428+
// WATCHDOG LEADER
24292429
//
24302430
// Concerns:
24312431
// Verify that the watchdog triggers upon timeout when the leader is
24322432
// healing.
24332433
//
24342434
// Testing:
2435-
// Watch dog for healing leader
2435+
// Watchdog for healing leader
24362436
// ------------------------------------------------------------------------
24372437
// ------------------------------------------------------------------------
24382438
{
24392439
bmqtst::TestHelper::printTestName("CLUSTER STATE MANAGER - "
2440-
"WATCH DOG LEADER");
2440+
"WATCHDOG LEADER");
24412441

24422442
Tester tester(true); // isLeader
24432443
BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
@@ -2456,11 +2456,11 @@ static void test24_watchDogLeader()
24562456
tester.verifyFollowerLSNRequestsSent();
24572457
tester.clearChannels();
24582458

2459-
// 1.b.) Trigger watch dog via timeout
2459+
// 1.b.) Trigger watchdog via timeout
24602460
tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
24612461
tester.d_cluster_mp->waitForScheduler();
24622462

2463-
// Verify that the watch dog triggers re-transition to Leader Healing
2463+
// Verify that the watchdog triggers re-transition to Leader Healing
24642464
// Stage 1, where we send follower LSN requests again.
24652465
BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
24662466
mqbc::ClusterStateTableState::e_LDR_HEALING_STG1);
@@ -2494,11 +2494,11 @@ static void test24_watchDogLeader()
24942494
BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
24952495
mqbc::ClusterStateTableState::e_LDR_HEALING_STG2);
24962496

2497-
// 2.b.) Trigger watch dog via timeout
2497+
// 2.b.) Trigger watchdog via timeout
24982498
tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
24992499
tester.d_cluster_mp->waitForScheduler();
25002500

2501-
// Verify that the watch dog triggers re-transition to Leader Healing
2501+
// Verify that the watchdog triggers re-transition to Leader Healing
25022502
// Stage 1, where we send follower LSN requests again.
25032503
BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
25042504
mqbc::ClusterStateTableState::e_LDR_HEALING_STG1);
@@ -2520,30 +2520,30 @@ static void test24_watchDogLeader()
25202520
BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
25212521
mqbc::ClusterStateTableState::e_LDR_HEALED);
25222522

2523-
// 3.b.) Attempt to trigger watch dog via timeout, but should fail
2523+
// 3.b.) Attempt to trigger watchdog via timeout, but should fail
25242524
tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
25252525
tester.d_cluster_mp->waitForScheduler();
25262526

2527-
// Verify that watch dog did not trigger
2527+
// Verify that watchdog did not trigger
25282528
BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
25292529
mqbc::ClusterStateTableState::e_LDR_HEALED);
25302530
}
25312531

25322532
static void test25_watchDogFollower()
25332533
// ------------------------------------------------------------------------
2534-
// WATCH DOG FOLLOWER
2534+
// WATCHDOG FOLLOWER
25352535
//
25362536
// Concerns:
25372537
// Verify that the watchdog triggers upon timeout when the follower is
25382538
// healing.
25392539
//
25402540
// Testing:
2541-
// Watch dog for healing follower
2541+
// Watchdog for healing follower
25422542
// ------------------------------------------------------------------------
25432543
// ------------------------------------------------------------------------
25442544
{
25452545
bmqtst::TestHelper::printTestName("CLUSTER STATE MANAGER - "
2546-
"WATCH DOG FOLLOWER");
2546+
"WATCHDOG FOLLOWER");
25472547

25482548
Tester tester(false); // isLeader
25492549

src/groups/mqb/mqbc/mqbc_storagemanager.cpp

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ void StorageManager::onWatchDogDispatched(int partitionId)
176176
BMQTSK_ALARMLOG_ALARM("RECOVERY")
177177
<< d_clusterData_p->identity().description() << " Partition ["
178178
<< partitionId
179-
<< "]: " << "Watch dog triggered because partition startup healing "
179+
<< "]: " << "Watchdog triggered because partition startup healing "
180180
<< "sequence was not completed in the configured time of "
181181
<< d_watchDogTimeoutInterval.totalSeconds() << " seconds."
182182
<< BMQTSK_ALARMLOG_END;
@@ -1154,14 +1154,8 @@ void StorageManager::do_startWatchDog(const PartitionFSMArgsSp& args)
11541154

11551155
const int partitionId = eventDataVec[0].partitionId();
11561156

1157-
if (static_cast<const bdlmt::EventSchedulerEventHandle::Event*>(
1158-
d_watchDogEventHandles[partitionId]) != 0) {
1159-
BALL_LOG_WARN << d_clusterData_p->identity().description()
1160-
<< " Partition [" << partitionId << "]: "
1161-
<< "Not starting watchdog since it has already been "
1162-
<< "started.";
1163-
return; // RETURN
1164-
}
1157+
// Clear any existing watchdog before starting the timer anew.
1158+
d_watchDogEventHandles[partitionId].release();
11651159

11661160
d_clusterData_p->scheduler().scheduleEvent(
11671161
&d_watchDogEventHandles[partitionId],
@@ -1191,8 +1185,6 @@ void StorageManager::do_stopWatchDog(const PartitionFSMArgsSp& args)
11911185
<< " Partition [" << partitionId << "]: "
11921186
<< "Failed to cancel WatchDog, rc: " << rc;
11931187
}
1194-
1195-
d_watchDogEventHandles[partitionId].release();
11961188
}
11971189

11981190
void StorageManager::do_openRecoveryFileSet(const PartitionFSMArgsSp& args)

src/groups/mqb/mqbc/mqbc_storagemanager.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -195,14 +195,14 @@ class StorageManager BSLS_KEYWORD_FINAL
195195
/// Whether this StorageMgr has started.
196196
bsls::AtomicBool d_isStarted;
197197

198-
/// List of event handles for the watch dog, indexed by partitionId.
198+
/// List of event handles for the watchdog, indexed by partitionId.
199199
///
200200
/// THREAD: Except during the ctor, the i-th index of this data member
201201
/// **must** be accessed in the associated Queue dispatcher thread
202202
/// for the i-th partitionId.
203203
EventHandles d_watchDogEventHandles;
204204

205-
/// Timeout interval for the watch dog.
205+
/// Timeout interval for the watchdog.
206206
const bsls::TimeInterval d_watchDogTimeoutInterval;
207207

208208
/// Flag to denote if a low disk space warning was issued. This flag is
@@ -403,13 +403,13 @@ class StorageManager BSLS_KEYWORD_FINAL
403403
void recoveredQueuesCb(int partitionId,
404404
const QueueKeyInfoMap& queueKeyInfoMap);
405405

406-
/// Process the watch dog trigger event for the specified `partitionId`,
406+
/// Process the watchdog trigger event for the specified `partitionId`,
407407
/// indicating unhealthiness in the Partition FSM.
408408
///
409409
/// THREAD: Executed by the scheduler thread.
410410
void onWatchDog(int partitionId);
411411

412-
/// Process the watch dog trigger event for the specified `partitionId`,
412+
/// Process the watchdog trigger event for the specified `partitionId`,
413413
/// indicating unhealthiness in the Partition FSM.
414414
///
415415
/// THREAD: This method is invoked in the associated cluster's

0 commit comments

Comments
 (0)