Skip to content

Commit 6299bad

Browse files
committed
[#25408] DocDB: Fix MasterFailoverTestIndexCreation/MasterFailoverTestIndexCreation.TestPauseAfterCreateIndexIssued/1 in TSAN
Summary: There is an expected data race when we try to detect a stuck RPC call. However, a stuck call itself should not happen; in this particular case it was caused by TSAN slowness, so the call was not actually stuck. Fixed by increasing the stuck call detection threshold in TSAN. Also fixed an issue with a double call to Transferred during connection shutdown. Test Plan: ./yb_build.sh tsan --cxx-test integration-tests_master_failover-itest --gtest_filter MasterFailoverTestIndexCreation/MasterFailoverTestIndexCreation.TestPauseAfterCreateIndexIssued/1 -n 40 -- -p 6 Reviewers: hsunder Reviewed By: hsunder Subscribers: ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D40848
1 parent 615ae5b commit 6299bad

File tree

2 files changed

+3
-8
lines changed

2 files changed

+3
-8
lines changed

src/yb/consensus/consensus_peers.cc

+2-2
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ DEFINE_test_flag(int32, delay_removing_peer_with_failed_tablet_secs, 0,
101101
"indicating that a tablet is in the FAILED state, and before marking this peer "
102102
"as failed.");
103103

104-
DEFINE_RUNTIME_int32(consensus_stuck_peer_call_threshold_ms, 10000,
104+
DEFINE_RUNTIME_int32(consensus_stuck_peer_call_threshold_ms, 10000 * yb::kTimeMultiplier,
105105
"Time to wait after timeout before considering an RPC call as stuck.");
106106
TAG_FLAG(consensus_stuck_peer_call_threshold_ms, advanced);
107107

@@ -185,7 +185,7 @@ Status Peer::SignalRequest(RequestTriggerMode trigger_mode) {
185185
auto last_rpc_start_time = last_rpc_start_time_.load(std::memory_order_acquire);
186186
if (last_rpc_start_time != CoarseTimePoint::min() &&
187187
now > last_rpc_start_time + stuck_threshold + timeout && !controller_.finished()) {
188-
LOG_WITH_PREFIX(INFO) << Format(
188+
LOG_WITH_PREFIX(DFATAL) << Format(
189189
"Found an RPC call in stuck state - timeout: $0, last_rpc_start_time: $1, "
190190
"stuck threshold: $2, force recover: $3, call state: $4",
191191
timeout, last_rpc_start_time, stuck_threshold,

src/yb/rpc/connection.cc

+1-6
Original file line numberDiff line numberDiff line change
@@ -413,12 +413,7 @@ Result<size_t> Connection::DoQueueOutboundData(OutboundDataPtr outbound_data, bo
413413
}
414414

415415
if (!batch) {
416-
s = OutboundQueued();
417-
if (!s.ok()) {
418-
outbound_data->Transferred(s, shared_from_this());
419-
// The connection shutdown has already been triggered by OutboundQueued.
420-
return s;
421-
}
416+
RETURN_NOT_OK(OutboundQueued());
422417
}
423418

424419
return *result;

0 commit comments

Comments
 (0)