diff --git a/Cargo.lock b/Cargo.lock index b2efc0d1717..aa9679d4a2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2231,7 +2231,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#e22d8864c9579660424f83b62ca196c03ef85927" +source = "git+https://github.com/pingcap/kvproto.git#0e2f26c0a46ae7d666d6ca4410046a39e0c96f36" dependencies = [ "futures 0.3.15", "grpcio", @@ -3490,6 +3490,7 @@ dependencies = [ "fs2", "futures 0.3.15", "futures-util", + "getset", "into_other", "itertools 0.10.0", "keys", diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index eebd6fbe403..d7bf1af4ad5 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -28,6 +28,7 @@ define_error_codes!( DATA_IS_NOT_READY => ("DataIsNotReady", "", ""), DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), PENDING_PREPARE_MERGE => ("PendingPrepareMerge", "", ""), + RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -60,6 +61,8 @@ impl ErrorCodeExt for errorpb::Error { PROPOSAL_IN_MERGING_MODE } else if self.has_data_is_not_ready() { DATA_IS_NOT_READY + } else if self.has_recovery_in_progress() { + RECOVERY_IN_PROGRESS } else { UNKNOWN } diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index b9fdce8740b..9d97682302e 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -719,7 +719,7 @@ impl PdClient for RpcClient { fn store_heartbeat( &self, mut stats: pdpb::StoreStats, - report_opt: Option, + store_report: Option, dr_autosync_status: Option, ) -> PdFuture { let timer = Instant::now(); @@ -730,7 +730,7 @@ impl PdClient for RpcClient { .mut_interval() .set_end_timestamp(UnixSecs::now().into_inner()); req.set_stats(stats); - if let Some(report) = report_opt { + if let Some(report) = store_report { req.set_store_report(report); } if let Some(status) = dr_autosync_status { diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index dc58506cdbd..970fcc8feff 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -548,6 +548,8 @@ impl PdConnector { let addr_trim = trim_http_prefix(addr); let channel = { let cb = ChannelBuilder::new(self.env.clone()) + .max_send_message_len(-1) + .max_receive_message_len(-1) .keepalive_time(Duration::from_secs(10)) .keepalive_timeout(Duration::from_secs(3)); self.security_mgr.connect(cb, addr_trim) diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 935e5f59f71..d90c0b3563d 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -42,6 +42,7 @@ file_system = { path = "../file_system", default-features = false } fs2 = "0.4" futures = "0.3" futures-util = { version = "0.3.1", default-features = false, features = ["io"] } +getset = "0.1" into_other = { path = "../into_other", default-features = false } itertools = "0.10" keys = { path = "../keys", default-features = false } diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 801626ac5fc..9929f43b7f7 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -60,6 +60,9 @@ pub enum Error { #[error("store ids {0:?}, errmsg {1}")] DiskFull(Vec, String), + #[error("region {0} is in the recovery progress")] + RecoveryInProgress(u64), + #[error( "key {} is not in region 
key range [{}, {}) for region {}", log_wrappers::Value::key(.0), @@ -238,6 +241,11 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_region_not_initialized(e); } + Error::RecoveryInProgress(region_id) => { + let mut e = errorpb::RecoveryInProgress::default(); + e.set_region_id(region_id); + errorpb.set_recovery_in_progress(e); + } _ => {} }; @@ -271,6 +279,7 @@ impl ErrorCodeExt for Error { Error::RegionNotFound(_) => error_code::raftstore::REGION_NOT_FOUND, Error::NotLeader(..) => error_code::raftstore::NOT_LEADER, Error::DiskFull(..) => error_code::raftstore::DISK_FULL, + Error::RecoveryInProgress(..) => error_code::raftstore::RECOVERY_IN_PROGRESS, Error::StaleCommand => error_code::raftstore::STALE_COMMAND, Error::RegionNotInitialized(_) => error_code::raftstore::REGION_NOT_INITIALIZED, Error::KeyNotInRegion(..) => error_code::raftstore::KEY_NOT_IN_REGION, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 6b3ecdc32ad..c26c2a3ad21 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5,16 +5,14 @@ use std::borrow::Cow; use std::cell::Cell; use std::collections::Bound::{Excluded, Unbounded}; use std::collections::VecDeque; -use std::iter::Iterator; -use std::sync::{atomic::AtomicUsize, atomic::Ordering, Arc}; +use std::iter::{FromIterator, Iterator}; +use std::sync::{Arc, Mutex}; use std::time::Instant; use std::{cmp, mem, u64}; use batch_system::{BasicMailbox, Fsm}; use collections::{HashMap, HashSet}; -use engine_traits::{ - Engines, KvEngine, RaftEngine, SSTMetaInfo, WriteBatch, WriteBatchExt, WriteOptions, -}; +use engine_traits::{Engines, KvEngine, RaftEngine, SSTMetaInfo, WriteBatchExt}; use engine_traits::{CF_LOCK, CF_RAFT}; use error_code::ErrorCodeExt; use fail::fail_point; @@ -23,7 +21,7 @@ use kvproto::errorpb; use kvproto::import_sstpb::SwitchMode; use kvproto::kvrpcpb::DiskFullOpt; use kvproto::metapb::{self, Region, RegionEpoch}; -use kvproto::pdpb::{CheckPolicy, StoreStats}; +use kvproto::pdpb::{self, CheckPolicy}; use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, CmdType, PutRequest, RaftCmdRequest, RaftCmdResponse, Request, StatusCmdType, StatusResponse, @@ -58,7 +56,7 @@ use crate::store::cmd_resp::{bind_term, new_error}; use crate::store::fsm::store::{PollContext, StoreMeta}; use crate::store::fsm::{ apply, ApplyMetrics, ApplyTask, ApplyTaskRes, CatchUpLogs, ChangeObserver, ChangePeer, - ExecResult, StoreInfo, + ExecResult, }; use crate::store::hibernate_state::{GroupState, HibernateState}; use crate::store::local_metrics::RaftMetrics; @@ -66,14 +64,16 @@ use crate::store::memory::*; use crate::store::metrics::*; use crate::store::msg::{Callback, ExtCallback, InspectedRaftMessage}; use crate::store::peer::{ - ConsistencyState, Peer, PersistSnapshotResult, StaleState, TRANSFER_LEADER_COMMAND_REPLY_CTX, + ConsistencyState, ForceLeaderState, Peer, PersistSnapshotResult, StaleState, + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, + TRANSFER_LEADER_COMMAND_REPLY_CTX, }; -use crate::store::peer_storage::write_peer_state; use crate::store::transport::Transport; -use crate::store::util::{is_learner, KeysInfoFormatter}; +use crate::store::util::{is_learner, KeysInfoFormatter, LeaseState}; use crate::store::worker::{ - ConsistencyCheckTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, - SplitCheckTask, + 
new_change_peer_v2_request, ConsistencyCheckTask, RaftlogFetchTask, RaftlogGcTask, + ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, }; use crate::store::PdTask; use crate::store::RaftlogFetchResult; @@ -148,12 +148,6 @@ where /// Before actually destroying a peer, ensure all log gc tasks are finished, so we /// can start destroying without seeking. logs_gc_flushed: bool, - - /// To make sure the reported store/peer meta is up to date, each peer has to wait for the log - /// at its target commit index to be applied. The last peer does so triggers the next procedure - /// which is reporting the store/peer meta to PD. - unsafe_recovery_target_commit_index: Option, - unsafe_recovery_wait_apply_counter: Option>, } pub struct BatchRaftCmdRequestBuilder @@ -272,8 +266,6 @@ where trace: PeerMemoryTrace::default(), delayed_destroy: None, logs_gc_flushed: false, - unsafe_recovery_target_commit_index: None, - unsafe_recovery_wait_apply_counter: None, }), )) } @@ -328,8 +320,6 @@ where trace: PeerMemoryTrace::default(), delayed_destroy: None, logs_gc_flushed: false, - unsafe_recovery_target_commit_index: None, - unsafe_recovery_wait_apply_counter: None, }), )) } @@ -677,12 +667,6 @@ where } } } - PeerMsg::UpdateRegionForUnsafeRecover(region) => { - self.on_update_region_for_unsafe_recover(region) - } - PeerMsg::UnsafeRecoveryWaitApply(counter) => { - self.on_unsafe_recovery_wait_apply(counter) - } } } // Propose batch request which may be still waiting for more raft-command @@ -744,189 +728,163 @@ where } } - fn on_update_region_for_unsafe_recover(&mut self, region: Region) { - let mut new_peer_list = HashSet::default(); - for peer in region.get_peers() { - new_peer_list.insert(peer.get_id()); - } - let to_be_removed: Vec = self - .region() - .get_peers() - .iter() - .filter(|&peer| !new_peer_list.contains(&peer.get_id())) - .map(|peer| peer.get_id()) - .collect(); - if to_be_removed.is_empty() - && self.region().get_start_key() == region.get_start_key() - && self.region().get_end_key() == region.get_end_key() - { - // Nothing to be updated, return directly. 
- return; - } - info!( - "updating the reigon for unsafe recover, original: {:?}, target: {:?}", - self.region(), - region - ); - if self.fsm.peer.has_valid_leader() { - panic!("region update for unsafe recover should only occur in leaderless reigons"); - } - if self.fsm.peer.raft_group.store().applied_index() - != self.fsm.peer.raft_group.store().commit_index() - { + fn on_unsafe_recovery_pre_demote_failed_voters( + &mut self, + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + ) { + if self.fsm.peer.unsafe_recovery_state.is_some() { warn!( - "cannot proceed region update for unsafe recover, applied index is not equal to commit index" + "Unsafe recovery, demote failed voters has already been initiated"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), ); + syncer.abort(); return; } - let region_state_key = keys::region_state_key(region.get_id()); - let original_region_state = match self - .ctx - .engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) - { - Ok(Some(region_state)) => region_state, - Ok(None) => { - panic!("Can't find RegionLocalState while updating {:?}", region); - } - Err(e) => { - panic!( - "Fail to look up RegionLocalState while updating {:?} err {:?}", - region, e - ); - } - }; - let mut kv_wb = self.ctx.engines.kv.write_batch(); - write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None).unwrap_or_else(|e| { - panic!( - "fails to write RegionLocalState {:?} into write brach, err {:?}", - region, e - ) - }); - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - if let Err(e) = kv_wb.write_opt(&write_opts) { - panic!("fail to update RegionLocalstate {:?} err {:?}", region, e); + if !self.fsm.peer.is_force_leader() { + error!( + "Unsafe recovery, demoting failed voters failed, since this peer is not forced leader"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), + ); + return; } - { - let mut meta = self.ctx.store_meta.lock().unwrap(); - meta.set_region( - &self.ctx.coprocessor_host, - region.clone(), - &mut self.fsm.peer, + if self.fsm.peer.in_joint_state() { + info!( + "Unsafe recovery, already in joint state, exit first"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), ); - if meta - .region_ranges - .remove(&enc_end_key(original_region_state.get_region())) - .is_none() - { - panic!( - "{} original region does not exist in store meta", - self.fsm.peer.tag - ); - } - for (_, id) in meta.region_ranges.range(( - Excluded(keys::data_key(region.get_start_key())), - Unbounded::>, - )) { - let exist_region = &meta.regions[id]; - if enc_start_key(exist_region) >= keys::data_end_key(region.get_end_key()) { - break; + let failed = Arc::new(Mutex::new(false)); + let failed_clone = failed.clone(); + let callback = Callback::::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + *failed_clone.lock().unwrap() = true; + error!( + "Unsafe recovery, fail to exit residual joint state"; + "err" => ?resp.response.get_header().get_error(), + ); } - panic!( - "{:?} is overlapped with an existing region {:?}", - region, exist_region - ); - } - if meta - .region_ranges - .insert(enc_end_key(®ion), region.get_id()) - .is_some() - { - panic!( - "key conflicts while inserting region {:?} into store meta", - region - ); - } - } - for peer_id in to_be_removed.clone() { - let mut cc = eraftpb::ConfChangeV2::default(); - let mut ccs = eraftpb::ConfChangeSingle::default(); - 
ccs.set_change_type(eraftpb::ConfChangeType::RemoveNode); - ccs.set_node_id(peer_id); - cc.set_transition(eraftpb::ConfChangeTransition::Auto); - cc.mut_changes().push(ccs); - if let Err(e) = self.fsm.peer.raft_group.apply_conf_change(&cc) { - panic!("fail to apply conf change for unsafe recover {:?}", e); + })); + self.propose_raft_command_internal( + exit_joint_request(self.region(), &self.fsm.peer.peer), + callback, + DiskFullOpt::AllowedOnAlmostFull, + ); + + if !*failed.lock().unwrap() { + self.fsm.peer.unsafe_recovery_state = + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters, + target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), + demote_after_exit: true, + }); } + } else { + self.unsafe_recovery_demote_failed_voters(syncer, failed_voters); } - self.fsm - .peer - .peer_heartbeats - .retain(|&k, _| new_peer_list.contains(&k)); - self.fsm - .peer - .peers_start_pending_time - .retain(|&(k, _)| new_peer_list.contains(&k)); - for peer in to_be_removed { - self.fsm.peer.remove_peer_from_cache(peer); - } - self.fsm.peer.post_split(); - self.fsm.reset_hibernate_state(GroupState::Chaos); - self.register_raft_base_tick(); } - fn finish_unsafe_recovery_wait_apply(&mut self) { - if self - .fsm - .unsafe_recovery_wait_apply_counter - .as_ref() - .unwrap() - .fetch_sub(1, Ordering::Relaxed) - == 1 + fn unsafe_recovery_demote_failed_voters( + &mut self, + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + ) { + if let Some(req) = + demote_failed_voters_request(self.region(), &self.fsm.peer.peer, failed_voters) { - let mut stats = StoreStats::default(); - stats.set_store_id(self.store_id()); - let store_info = StoreInfo { - kv_engine: self.ctx.engines.kv.clone(), - raft_engine: self.ctx.engines.raft.clone(), - capacity: self.ctx.cfg.capacity.0, - }; - let task = PdTask::StoreHeartbeat { - stats, - store_info, - send_detailed_report: true, - dr_autosync_status: self - .ctx - .global_replication_state - .lock() - .unwrap() - .store_dr_autosync_status(), - }; - if let Err(e) = self.ctx.pd_scheduler.schedule(task) { - panic!("fail to send detailed report to pd {:?}", e); + info!( + "Unsafe recovery, demoting failed voters"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), + "req" => ?req); + let failed = Arc::new(Mutex::new(false)); + let failed_clone = failed.clone(); + let callback = Callback::::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + *failed_clone.lock().unwrap() = true; + error!( + "Unsafe recovery, fail to finish demotion"; + "err" => ?resp.response.get_header().get_error(), + ); + } + })); + self.propose_raft_command_internal(req, callback, DiskFullOpt::AllowedOnAlmostFull); + if !*failed.lock().unwrap() { + self.fsm.peer.unsafe_recovery_state = + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters: vec![], // No longer needed since here. 
+ target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), + demote_after_exit: false, + }); } + } else { + warn!( + "Unsafe recovery, no need to demote failed voters"; + "region" => ?self.region(), + ); } - self.fsm.unsafe_recovery_target_commit_index = None; - self.fsm.unsafe_recovery_wait_apply_counter = None; } - fn on_unsafe_recovery_wait_apply(&mut self, counter: Arc) { - self.fsm.unsafe_recovery_target_commit_index = - Some(self.fsm.peer.raft_group.store().commit_index()); - self.fsm.unsafe_recovery_wait_apply_counter = Some(counter); - // If the applied index equals to the commit index, there is nothing to wait for, proceeds - // to the next step immediately. If they are not equal, further checks will be performed in - // on_apply_res(). - if self.fsm.stopped - || self.fsm.peer.raft_group.store().applied_index() - == self.fsm.peer.raft_group.store().commit_index() - { - self.finish_unsafe_recovery_wait_apply(); + fn on_unsafe_recovery_destroy(&mut self, syncer: UnsafeRecoveryExecutePlanSyncer) { + if self.fsm.peer.unsafe_recovery_state.is_some() { + warn!( + "Unsafe recovery, can't destroy, another plan is executing in progress"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + syncer.abort(); + return; } + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Destroy(syncer)); + self.handle_destroy_peer(DestroyPeerJob { + initialized: self.fsm.peer.is_initialized(), + region_id: self.region_id(), + peer: self.fsm.peer.peer.clone(), + }); + } + + fn on_unsafe_recovery_wait_apply(&mut self, syncer: UnsafeRecoveryWaitApplySyncer) { + if self.fsm.peer.unsafe_recovery_state.is_some() { + warn!( + "Unsafe recovery, can't wait apply, another plan is executing in progress"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + syncer.abort(); + return; + } + let target_index = if self.fsm.peer.force_leader.is_some() { + // For regions that lose quorum (or regions have force leader), whatever has been + // proposed will be committed. Based on that fact, we simply use "last index" here to + // avoid implementing another "wait commit" process. 
+ self.fsm.peer.raft_group.raft.raft_log.last_index() + } else { + self.fsm.peer.raft_group.raft.raft_log.committed + }; + + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { + target_index, + syncer, + }); + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ self.fsm.stopped); + } + + fn on_unsafe_recovery_fill_out_report(&mut self, syncer: UnsafeRecoveryFillOutReportSyncer) { + let mut self_report = pdpb::PeerReport::default(); + self_report.set_raft_state(self.fsm.peer.get_store().raft_state().clone()); + let mut region_local_state = RegionLocalState::default(); + region_local_state.set_region(self.region().clone()); + self_report.set_region_state(region_local_state); + self_report.set_is_force_leader(self.fsm.peer.force_leader.is_some()); + syncer.report_for_self(self_report); } fn on_casual_msg(&mut self, msg: CasualMessage) { @@ -1034,6 +992,10 @@ where CasualMessage::SnapshotApplied => { self.fsm.has_ready = true; } + CasualMessage::Campaign => { + let _ = self.fsm.peer.raft_group.campaign(); + self.fsm.has_ready = true; + } } } @@ -1269,6 +1231,383 @@ where SignificantMsg::RaftlogFetched { context, res } => { self.on_raft_log_fetched(context, res); } + SignificantMsg::EnterForceLeaderState { + syncer, + failed_stores, + } => { + self.on_enter_pre_force_leader(syncer, failed_stores); + } + SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(), + SignificantMsg::UnsafeRecoveryDemoteFailedVoters { + syncer, + failed_voters, + } => self.on_unsafe_recovery_pre_demote_failed_voters(syncer, failed_voters), + SignificantMsg::UnsafeRecoveryDestroy(syncer) => { + self.on_unsafe_recovery_destroy(syncer) + } + SignificantMsg::UnsafeRecoveryWaitApply(syncer) => { + self.on_unsafe_recovery_wait_apply(syncer) + } + SignificantMsg::UnsafeRecoveryFillOutReport(syncer) => { + self.on_unsafe_recovery_fill_out_report(syncer) + } + } + } + + fn on_enter_pre_force_leader( + &mut self, + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + ) { + match self.fsm.peer.force_leader { + Some(ForceLeaderState::PreForceLeader { .. }) => { + self.on_force_leader_fail(); + } + Some(ForceLeaderState::ForceLeader { .. }) => { + // already is a force leader, do nothing + return; + } + Some(ForceLeaderState::WaitTicks { .. }) => { + self.fsm.peer.force_leader = None; + } + None => {} + } + + if !self.fsm.peer.is_initialized() { + warn!( + "Unsafe recovery, cannot force leader since this peer is not initialized"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + + let ticks = if self.fsm.peer.is_leader() { + if self.fsm.hibernate_state.group_state() == GroupState::Ordered { + warn!( + "Unsafe recovery, reject pre force leader due to already being leader"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + // wait two rounds of election timeout to trigger check quorum to step down the leader + // note: check quorum is triggered every `election_timeout` instead of `randomized_election_timeout` + Some( + self.fsm.peer.raft_group.raft.election_timeout() * 2 + - self.fsm.peer.raft_group.raft.election_elapsed, + ) + // When election timeout is triggered, leader_id is set to INVALID_ID. + // But learner(not promotable) is a exception here as it wouldn't tick + // election. 
+ } else if self.fsm.peer.raft_group.raft.promotable() + && self.fsm.peer.leader_id() != raft::INVALID_ID + { + if self.fsm.hibernate_state.group_state() == GroupState::Ordered + || self.fsm.hibernate_state.group_state() == GroupState::Chaos + { + warn!( + "Unsafe recovery, reject pre force leader due to leader lease may not expired"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + // wait one round of election timeout to make sure leader_id is invalid + Some( + self.fsm.peer.raft_group.raft.randomized_election_timeout() + - self.fsm.peer.raft_group.raft.election_elapsed, + ) + } else { + None + }; + + if let Some(ticks) = ticks { + info!( + "Unsafe recovery, enter wait ticks"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "ticks" => ticks, + ); + self.fsm.peer.force_leader = Some(ForceLeaderState::WaitTicks { + syncer, + failed_stores, + ticks, + }); + self.reset_raft_tick(if self.fsm.peer.is_leader() { + GroupState::Ordered + } else { + GroupState::Chaos + }); + self.fsm.has_ready = true; + return; + } + + let expected_alive_voter = self.get_force_leader_expected_alive_voter(&failed_stores); + if !expected_alive_voter.is_empty() + && self + .fsm + .peer + .raft_group + .raft + .prs() + .has_quorum(&expected_alive_voter) + { + warn!( + "Unsafe recovery, reject pre force leader due to has quorum"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + + info!( + "Unsafe recovery, enter pre force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "alive_voter" => ?expected_alive_voter, + ); + + // Do not use prevote as prevote won't set `vote` to itself. + // When PD issues force leader on two different peer, it may cause + // two force leader in same term. + self.fsm.peer.raft_group.raft.pre_vote = false; + // trigger vote request to all voters, will check the vote result in `check_force_leader` + if let Err(e) = self.fsm.peer.raft_group.campaign() { + warn!( + "Unsafe recovery, campaign failed"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => ?e, + ); + } + assert_eq!(self.fsm.peer.get_role(), StateRole::Candidate); + if !self + .fsm + .peer + .raft_group + .raft + .prs() + .votes() + .get(&self.fsm.peer.peer_id()) + .unwrap() + { + warn!( + "Unsafe recovery, pre force leader failed to campaign"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + self.on_force_leader_fail(); + return; + } + + self.fsm.peer.force_leader = Some(ForceLeaderState::PreForceLeader { + syncer, + failed_stores, + }); + self.fsm.has_ready = true; + } + + fn on_force_leader_fail(&mut self) { + self.fsm.peer.raft_group.raft.pre_vote = true; + self.fsm.peer.raft_group.raft.set_check_quorum(true); + self.fsm.peer.force_leader = None; + } + + fn on_enter_force_leader(&mut self) { + info!( + "Unsafe recovery, enter force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + assert_eq!(self.fsm.peer.get_role(), StateRole::Candidate); + + let failed_stores = match self.fsm.peer.force_leader.take() { + Some(ForceLeaderState::PreForceLeader { failed_stores, .. 
}) => failed_stores, + _ => unreachable!(), + }; + + let peer_ids: Vec<_> = self.fsm.peer.voters().iter().collect(); + for peer_id in peer_ids { + let store_id = self + .region() + .get_peers() + .iter() + .find(|p| p.get_id() == peer_id) + .unwrap() + .get_store_id(); + if !failed_stores.contains(&store_id) { + continue; + } + + // make fake vote response + let mut msg = raft::eraftpb::Message::new(); + msg.msg_type = MessageType::MsgRequestVoteResponse; + msg.reject = false; + msg.term = self.fsm.peer.term(); + msg.from = peer_id; + msg.to = self.fsm.peer.peer_id(); + self.fsm.peer.raft_group.step(msg).unwrap(); + } + + // after receiving all votes, should become leader + assert!(self.fsm.peer.is_leader()); + self.fsm.peer.raft_group.raft.set_check_quorum(false); + + // make sure it's not hibernated + self.reset_raft_tick(GroupState::Ordered); + + self.fsm.peer.force_leader = Some(ForceLeaderState::ForceLeader { + time: TiInstant::now_coarse(), + failed_stores, + }); + self.fsm.has_ready = true; + } + + fn on_exit_force_leader(&mut self) { + if self.fsm.peer.force_leader.is_none() { + return; + } + + info!( + "exit force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + self.fsm.peer.force_leader = None; + // make sure it's not hibernated + assert_eq!(self.fsm.hibernate_state.group_state(), GroupState::Ordered); + // leader lease shouldn't be renewed in force leader state. + assert_eq!( + self.fsm.peer.leader_lease().inspect(None), + LeaseState::Expired + ); + self.fsm + .peer + .raft_group + .raft + .become_follower(self.fsm.peer.term(), raft::INVALID_ID); + + self.fsm.peer.raft_group.raft.set_check_quorum(true); + self.fsm.peer.raft_group.raft.pre_vote = true; + if self.fsm.peer.raft_group.raft.promotable() { + // Do not campaign directly here, otherwise on_role_changed() won't called for follower state + let _ = self.ctx.router.send( + self.region_id(), + PeerMsg::CasualMessage(CasualMessage::Campaign), + ); + } + self.fsm.has_ready = true; + } + + #[inline] + fn get_force_leader_expected_alive_voter(&self, failed_stores: &HashSet) -> HashSet { + let region = self.region(); + self.fsm + .peer + .voters() + .iter() + .filter(|peer_id| { + let store_id = region + .get_peers() + .iter() + .find(|p| p.get_id() == *peer_id) + .unwrap() + .get_store_id(); + !failed_stores.contains(&store_id) + }) + .collect() + } + + #[inline] + fn check_force_leader(&mut self) { + if let Some(ForceLeaderState::WaitTicks { + syncer, + failed_stores, + ticks, + }) = &mut self.fsm.peer.force_leader + { + if *ticks == 0 { + let syncer_clone = syncer.clone(); + let s = mem::take(failed_stores); + self.on_enter_pre_force_leader(syncer_clone, s); + } else { + *ticks -= 1; + } + return; + }; + + let failed_stores = match &self.fsm.peer.force_leader { + None => return, + Some(ForceLeaderState::ForceLeader { .. }) => { + if self.fsm.peer.maybe_force_forward_commit_index() { + self.fsm.has_ready = true; + } + return; + } + Some(ForceLeaderState::PreForceLeader { failed_stores, .. }) => failed_stores, + Some(ForceLeaderState::WaitTicks { .. 
}) => unreachable!(), + }; + + if self.fsm.peer.raft_group.raft.election_elapsed + 1 + < self.ctx.cfg.raft_election_timeout_ticks + { + // wait as longer as it can to collect responses of request vote + return; + } + + let expected_alive_voter: HashSet<_> = + self.get_force_leader_expected_alive_voter(failed_stores); + let check = || { + if self.fsm.peer.raft_group.raft.state != StateRole::Candidate { + Err(format!( + "unexpected role {:?}", + self.fsm.peer.raft_group.raft.state + )) + } else { + let mut granted = 0; + for (id, vote) in self.fsm.peer.raft_group.raft.prs().votes() { + if expected_alive_voter.contains(id) { + if *vote { + granted += 1; + } else { + return Err(format!("receive reject response from {}", *id)); + } + } else if *id == self.fsm.peer_id() { + // self may be a learner + continue; + } else { + return Err(format!( + "receive unexpected vote from {} vote {}", + *id, *vote + )); + } + } + Ok(granted) + } + }; + + match check() { + Err(err) => { + warn!( + "Unsafe recovery, pre force leader check failed"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "alive_voter" => ?expected_alive_voter, + "reason" => err, + ); + self.on_force_leader_fail(); + } + Ok(granted) => { + info!( + "Unsafe recovery, expected live voters:"; + "voters" => ?expected_alive_voter, + "granted" => granted + ); + if granted == expected_alive_voter.len() { + self.on_enter_force_leader(); + } + } } } @@ -1382,6 +1721,19 @@ where self.register_raft_gc_log_tick(); self.register_check_leader_lease_tick(); } + + if let Some(ForceLeaderState::ForceLeader { .. }) = self.fsm.peer.force_leader { + if r != StateRole::Leader { + // for some reason, it's not leader anymore + info!( + "step down in force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "state" => ?r, + ); + self.on_force_leader_fail(); + } + } } } @@ -1512,6 +1864,8 @@ where self.fsm.peer.retry_pending_reads(&self.ctx.cfg); + self.check_force_leader(); + let mut res = None; if self.ctx.cfg.hibernate_regions { if self.fsm.hibernate_state.group_state() == GroupState::Idle { @@ -1519,14 +1873,14 @@ where // follower may tick more than an election timeout in chaos state. // Before stopping tick, `missing_tick` should be `raft_election_timeout_ticks` - 2 // - `raft_heartbeat_ticks` (default 10 - 2 - 2 = 6) - // and the follwer's `election_elapsed` in raft-rs is 1. + // and the follower's `election_elapsed` in raft-rs is 1. // After the group state becomes Chaos, the next tick will call `raft_group.tick` // `missing_tick` + 1 times(default 7). // Then the follower's `election_elapsed` will be 1 + `missing_tick` + 1 // (default 1 + 6 + 1 = 8) which is less than the min election timeout. // The reason is that we don't want let all followers become (pre)candidate if one // follower may receive a request, then becomes (pre)candidate and sends (pre)vote msg - // to others. As long as the leader can wake up and broadcast hearbeats in one `raft_heartbeat_ticks` + // to others. As long as the leader can wake up and broadcast heartbeats in one `raft_heartbeat_ticks` // time(default 2s), no more followers will wake up and sends vote msg again. if self.fsm.missing_ticks + 1 /* for the next tick after the peer isn't Idle */ + self.fsm.peer.raft_group.raft.election_elapsed @@ -1594,6 +1948,69 @@ where } } + fn check_unsafe_recovery_state(&mut self) { + match &self.fsm.peer.unsafe_recovery_state { + Some(UnsafeRecoveryState::WaitApply { .. 
}) => self + .fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false), + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters, + target_index, + demote_after_exit, + }) => { + if self.fsm.peer.raft_group.raft.raft_log.applied >= *target_index { + if *demote_after_exit { + if !self.fsm.peer.is_force_leader() { + error!( + "Unsafe recovery, lost forced leadership after exiting joint state"; + "region_id" => self.region().get_id(), + ); + return; + } + let syncer_clone = syncer.clone(); + let failed_voters_clone = failed_voters.clone(); + self.unsafe_recovery_demote_failed_voters( + syncer_clone, + failed_voters_clone, + ); + } else { + if self.fsm.peer.in_joint_state() { + info!( + "Unsafe recovery, exiting joint state"; + "region_id" => self.region().get_id() + ); + if self.fsm.peer.is_force_leader() { + self.propose_raft_command_internal( + exit_joint_request(self.region(), &self.fsm.peer.peer), + Callback::::write(Box::new(|resp| { + if resp.response.get_header().has_error() { + error!( + "Unsafe recovery, fail to exit joint state"; + "err" => ?resp.response.get_header().get_error(), + ); + } + })), + DiskFullOpt::AllowedOnAlmostFull, + ); + } else { + error!( + "Unsafe recovery, lost forced leadership while trying to exit joint state"; + "region_id" => self.region().get_id(), + ); + } + } + + self.fsm.peer.unsafe_recovery_state = None; + } + } + } + // Destroy does not need be processed, the state is cleaned up together with peer. + Some(_) | None => {} + } + } + fn on_apply_res(&mut self, res: ApplyTaskRes) { fail_point!("on_apply_res", |_| {}); match res { @@ -1658,11 +2075,8 @@ where } } } - // After a log has been applied, check if we need to trigger the unsafe recovery reporting procedure. - if let Some(target_commit_index) = self.fsm.unsafe_recovery_target_commit_index { - if self.fsm.peer.raft_group.store().applied_index() >= target_commit_index { - self.finish_unsafe_recovery_wait_apply(); - } + if self.fsm.peer.unsafe_recovery_state.is_some() { + self.check_unsafe_recovery_state(); } } @@ -1847,9 +2261,7 @@ where Either::Right(v) => v, }; - if util::is_vote_msg(msg.get_message()) - || msg.get_message().get_msg_type() == MessageType::MsgTimeoutNow - { + if util::is_vote_msg(msg.get_message()) || msg_type == MessageType::MsgTimeoutNow { if self.fsm.hibernate_state.group_state() != GroupState::Chaos { self.fsm.reset_hibernate_state(GroupState::Chaos); self.register_raft_base_tick(); @@ -1861,7 +2273,7 @@ where let from_peer_id = msg.get_from_peer().get_id(); self.fsm.peer.insert_peer_cache(msg.take_from_peer()); - let result = if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { + let result = if msg_type == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(msg.get_message(), peer_disk_usage); Ok(()) } else { @@ -2747,8 +3159,10 @@ where assert!(!self.fsm.peer.is_handling_snapshot()); // No need to wait for the apply anymore. - if self.fsm.unsafe_recovery_target_commit_index.is_some() { - self.finish_unsafe_recovery_wait_apply(); + if self.fsm.peer.unsafe_recovery_state.is_some() { + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ true); } { @@ -3011,7 +3425,7 @@ where // until new leader elected, but we can't revert this operation // because its result is already persisted in apply worker // TODO: should we transfer leader here? 
- let demote_self = is_learner(&self.fsm.peer.peer); + let demote_self = is_learner(&self.fsm.peer.peer) && !self.fsm.peer.is_force_leader(); if remove_self || demote_self { warn!( "Removing or demoting leader"; @@ -4161,6 +4575,16 @@ where let leader_id = self.fsm.peer.leader_id(); let request = msg.get_requests(); + if self.fsm.peer.force_leader.is_some() { + // in force leader state, forbid requests to make the recovery progress less error-prone + if !(msg.has_admin_request() + && (msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeer + || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2)) + { + return Err(Error::RecoveryInProgress(self.region_id())); + } + } + // ReadIndex can be processed on the replicas. let is_read_index_request = request.len() == 1 && request[0].get_cmd_type() == CmdType::ReadIndex; @@ -4839,6 +5263,18 @@ where return; } + if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { + // If the force leader state lasts a long time, it probably means PD recovery process aborts for some reasons. + // So just exit to avoid blocking the read and write requests for this peer. + if time.saturating_elapsed() > self.ctx.cfg.peer_stale_state_check_interval.0 { + warn!( + "Unsafe recovery, step down as force leader due to holding it too long"; + "duration" => ?time.saturating_elapsed(), + ); + self.on_exit_force_leader(); + } + } + if self.ctx.cfg.hibernate_regions { let group_state = self.fsm.hibernate_state.group_state(); if group_state == GroupState::Idle { @@ -5222,6 +5658,57 @@ fn new_compact_log_request( request } +fn demote_failed_voters_request( + region: &metapb::Region, + peer: &metapb::Peer, + failed_voters: Vec, +) -> Option { + let failed_voter_ids = HashSet::from_iter(failed_voters.iter().map(|voter| voter.get_id())); + let mut req = new_admin_request(region.get_id(), peer.clone()); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + let mut change_peer_reqs: Vec = region + .get_peers() + .iter() + .filter_map(|peer| { + if failed_voter_ids.contains(&peer.get_id()) + && peer.get_role() == metapb::PeerRole::Voter + { + let mut peer_clone = peer.clone(); + peer_clone.set_role(metapb::PeerRole::Learner); + let mut cp = pdpb::ChangePeer::default(); + cp.set_change_type(ConfChangeType::AddLearnerNode); + cp.set_peer(peer_clone); + return Some(cp); + } + None + }) + .collect(); + + // Promote self if it is a learner. 
+ if peer.get_role() == metapb::PeerRole::Learner { + let mut cp = pdpb::ChangePeer::default(); + cp.set_change_type(ConfChangeType::AddNode); + let mut promote = peer.clone(); + promote.set_role(metapb::PeerRole::Voter); + cp.set_peer(promote); + change_peer_reqs.push(cp); + } + if change_peer_reqs.is_empty() { + return None; + } + req.set_admin_request(new_change_peer_v2_request(change_peer_reqs)); + Some(req) +} + +fn exit_joint_request(region: &metapb::Region, peer: &metapb::Peer) -> RaftCmdRequest { + let mut req = new_admin_request(region.get_id(), peer.clone()); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.set_admin_request(new_change_peer_v2_request(vec![])); + req +} + impl<'a, EK, ER, T: Transport> PeerFsmDelegate<'a, EK, ER, T> where EK: KvEngine, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3aee7df4259..c3836807647 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -16,7 +16,10 @@ use batch_system::{ HandlerBuilder, PollHandler, Priority, }; use crossbeam::channel::{unbounded, Sender, TryRecvError, TrySendError}; -use engine_traits::{Engines, KvEngine, Mutable, PerfContextKind, WriteBatch}; +use engine_traits::{ + CompactedEvent, DeleteStrategy, Engines, KvEngine, Mutable, PerfContextKind, RaftEngine, + RaftLogBatch, Range, WriteBatch, WriteOptions, +}; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use fail::fail_point; use futures::compat::Future01CompatExt; @@ -24,8 +27,7 @@ use futures::FutureExt; use kvproto::import_sstpb::SstMeta; use kvproto::import_sstpb::SwitchMode; use kvproto::metapb::{self, Region, RegionEpoch}; -use kvproto::pdpb::QueryStats; -use kvproto::pdpb::StoreStats; +use kvproto::pdpb::{self, QueryStats, StoreStats}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest}; use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}; use kvproto::replication_modepb::{ReplicationMode, ReplicationStatus}; @@ -34,8 +36,6 @@ use raft::StateRole; use time::{self, Timespec}; use collections::HashMap; -use engine_traits::CompactedEvent; -use engine_traits::{RaftEngine, RaftLogBatch, WriteOptions}; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use pd_client::{FeatureGate, PdClient}; use sst_importer::SSTImporter; @@ -621,7 +621,11 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> inspector.record_store_wait(send_time.saturating_elapsed()); self.ctx.pending_latency_inspect.push(inspector); } - StoreMsg::CreatePeer(region) => self.on_create_peer(region), + StoreMsg::UnsafeRecoveryReport(report) => self.store_heartbeat_pd(Some(report)), + StoreMsg::UnsafeRecoveryCreatePeer { syncer, create } => { + self.on_unsafe_recovery_create_peer(create); + drop(syncer); + } } } } @@ -2157,7 +2161,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - fn store_heartbeat_pd(&mut self) { + fn store_heartbeat_pd(&mut self, report: Option) { let mut stats = StoreStats::default(); stats.set_store_id(self.ctx.store_id()); @@ -2235,7 +2239,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let task = PdTask::StoreHeartbeat { stats, store_info, - send_detailed_report: false, + report, dr_autosync_status: self .ctx .global_replication_state @@ -2252,7 +2256,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } fn 
on_pd_store_heartbeat_tick(&mut self) { - self.store_heartbeat_pd(); + self.store_heartbeat_pd(None); self.register_pd_store_heartbeat_tick(); } @@ -2639,39 +2643,35 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.router.report_status_update() } - fn on_create_peer(&self, region: Region) { - info!("creating a peer"; "peer" => ?region); - let mut kv_wb = self.ctx.engines.kv.write_batch(); - let region_state_key = keys::region_state_key(region.get_id()); - match self - .ctx - .engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) + fn on_unsafe_recovery_create_peer(&self, region: Region) { + info!("Unsafe recovery, creating a peer"; "peer" => ?region); + let mut meta = self.ctx.store_meta.lock().unwrap(); + if let Some((_, id)) = meta + .region_ranges + .range(( + Excluded(data_key(region.get_start_key())), + Unbounded::>, + )) + .next() { - Ok(Some(region_state)) => { - info!( - "target region already exists, existing region: {:?}, want to create: {:?}", - region_state, region - ); - return; - } - Ok(None) => {} - Err(e) => { - panic!("cannot determine whether {:?} exists, err {:?}", region, e) + let exist_region = &meta.regions[id]; + if enc_start_key(exist_region) < data_end_key(region.get_end_key()) { + if exist_region.get_id() == region.get_id() { + warn!( + "Unsafe recovery, region has already been created."; + "region" => ?region, + "exist_region" => ?exist_region, + ); + return; + } else { + error!( + "Unsafe recovery, region to be created overlaps with an existing region"; + "region" => ?region, + "exist_region" => ?exist_region, + ); + return; + } } - }; - peer_storage::write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None) - .unwrap_or_else(|e| { - panic!( - "fail to add peer state into write batch while creating {:?} err {:?}", - region, e - ) - }); - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - if let Err(e) = kv_wb.write_opt(&write_opts) { - panic!("fail to write while creating {:?} err {:?}", region, e); } let (sender, mut peer) = match PeerFsm::create( self.ctx.store.get_id(), @@ -2683,33 +2683,29 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER ) { Ok((sender, peer)) => (sender, peer), Err(e) => { - panic!( - "fail to create peer fsm while creating {:?} err {:?}", - region, e + error!( + "Unsafe recovery, fail to create peer fsm"; + "region" => ?region, + "err" => ?e, ); + return; } }; let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); peer.peer.init_replication_mode(&mut *replication_state); + drop(replication_state); peer.peer.activate(self.ctx); - let mut meta = self.ctx.store_meta.lock().unwrap(); - for (_, id) in meta.region_ranges.range(( - Excluded(data_key(region.get_start_key())), - Unbounded::>, - )) { - let exist_region = &meta.regions[id]; - if enc_start_key(exist_region) >= data_end_key(region.get_end_key()) { - break; - } - panic!( - "{:?} is overlapped with an existing region {:?}", - region, exist_region - ); - } + + let start_key = keys::enc_start_key(®ion); + let end_key = keys::enc_end_key(®ion); if meta - .region_ranges - .insert(enc_end_key(®ion), region.get_id()) + .regions + .insert(region.get_id(), region.clone()) .is_some() + || meta + .region_ranges + .insert(end_key.clone(), region.get_id()) + .is_some() || meta .readers .insert(region.get_id(), ReadDelegate::from_peer(peer.get_peer())) @@ -2720,10 +2716,38 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .is_some() { 
panic!( - "key conflicts while insert region {:?} into store meta", - region + "Unsafe recovery, key conflicts while inserting region {:?} into store meta", + region, ); } + drop(meta); + + if let Err(e) = self.ctx.engines.kv.delete_all_in_range( + DeleteStrategy::DeleteByKey, + &[Range::new(&start_key, &end_key)], + ) { + panic!( + "Unsafe recovery, fail to clean up stale data while creating the new region {:?}, the error is {:?}", + region, e, + ); + } + let mut kv_wb = self.ctx.engines.kv.write_batch(); + if let Err(e) = peer_storage::write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None) + { + panic!( + "Unsafe recovery, fail to add peer state for {:?} into write batch, the error is {:?}", + region, e, + ); + } + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + if let Err(e) = kv_wb.write_opt(&write_opts) { + panic!( + "Unsafe recovery, fail to write to disk while creating peer {:?}, the error is {:?}", + region, e, + ); + } + let mailbox = BasicMailbox::new(sender, peer, self.ctx.router.state_cnt().clone()); self.ctx.router.register(region.get_id(), mailbox); self.ctx diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 35e7843a0a4..7efd4e4882c 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -3,14 +3,12 @@ // #[PerformanceCriticalPath] use std::borrow::Cow; use std::fmt; -use std::sync::atomic::AtomicUsize; -use std::sync::Arc; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use kvproto::kvrpcpb::ExtraOp as TxnExtraOp; use kvproto::metapb; use kvproto::metapb::RegionEpoch; -use kvproto::pdpb::CheckPolicy; +use kvproto::pdpb::{self, CheckPolicy}; use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}; use kvproto::raft_serverpb::RaftMessage; use kvproto::replication_modepb::ReplicationStatus; @@ -18,15 +16,19 @@ use kvproto::{import_sstpb::SstMeta, kvrpcpb::DiskFullOpt}; use raft::{GetEntriesContext, SnapshotStatus}; use smallvec::{smallvec, SmallVec}; +use super::{AbstractPeer, RegionSnapshot}; use crate::store::fsm::apply::TaskRes as ApplyTaskRes; use crate::store::fsm::apply::{CatchUpLogs, ChangeObserver}; use crate::store::metrics::RaftEventDurationType; +use crate::store::peer::{ + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, +}; use crate::store::util::{KeysInfoFormatter, LatencyInspector}; use crate::store::{RaftlogFetchResult, SnapKey}; +use collections::HashSet; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; -use super::{AbstractPeer, RegionSnapshot}; - #[derive(Debug)] pub struct ReadResponse { pub response: RaftCmdResponse, @@ -313,6 +315,18 @@ where context: GetEntriesContext, res: Box, }, + EnterForceLeaderState { + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + }, + ExitForceLeaderState, + UnsafeRecoveryDemoteFailedVoters { + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + }, + UnsafeRecoveryDestroy(UnsafeRecoveryExecutePlanSyncer), + UnsafeRecoveryWaitApply(UnsafeRecoveryWaitApplySyncer), + UnsafeRecoveryFillOutReport(UnsafeRecoveryFillOutReportSyncer), } /// Message that will be sent to a peer. 
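(Reviewer note, not part of the patch.) The six new SignificantMsg variants above are driven by the RAII syncers defined in peer.rs later in this diff. As a minimal sketch of the intended flow, the force-leader phase could be kicked off roughly as below; only UnsafeRecoveryForceLeaderSyncer::new, broadcast_normal, the PeerMsg/SignificantMsg variants and the reference-counted drop behaviour come from this patch, while the helper name, the broadcast-based dispatch and the imports being in scope are illustrative assumptions (the real caller, the PD worker that receives the recovery plan, is outside this diff).

fn force_leader_phase_sketch<EK: KvEngine, ER: RaftEngine>(
    router: &RaftRouter<EK, ER>,
    report_id: u64,
    failed_stores: HashSet<u64>,
) {
    // The syncer is a reference-counted RAII handle shared with every peer that
    // receives the message; when the last clone is dropped, its closure calls
    // start_unsafe_recovery_report, which begins the wait-apply, fill-out-report
    // and store-heartbeat chain defined in peer.rs.
    let syncer = UnsafeRecoveryForceLeaderSyncer::new(report_id, router.clone());
    router.broadcast_normal(|| {
        PeerMsg::SignificantMsg(SignificantMsg::EnterForceLeaderState {
            syncer: syncer.clone(),
            failed_stores: failed_stores.clone(),
        })
    });
    // Dropping the local `syncer` here is intentional: the next phase is only
    // triggered once every clone, including this one, has been dropped.
}

The execute-plan phase follows the same pattern with UnsafeRecoveryExecutePlanSyncer, except that its drop closure exits force-leader state before reporting (the exit_force_leader_before_reporting flag of start_unsafe_recovery_report).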
@@ -392,6 +406,9 @@ pub enum CasualMessage { // Snapshot is applied SnapshotApplied, + + // Trigger raft to campaign which is used after exiting force leader + Campaign, } impl fmt::Debug for CasualMessage { @@ -450,6 +467,7 @@ impl fmt::Debug for CasualMessage { CasualMessage::RefreshRegionBuckets { .. } => write!(fmt, "RefreshRegionBuckets"), CasualMessage::RenewLease => write!(fmt, "RenewLease"), CasualMessage::SnapshotApplied => write!(fmt, "SnapshotApplied"), + CasualMessage::Campaign => write!(fmt, "Campaign"), } } } @@ -539,8 +557,6 @@ pub enum PeerMsg { /// Asks region to change replication mode. UpdateReplicationMode, Destroy(u64), - UpdateRegionForUnsafeRecover(metapb::Region), - UnsafeRecoveryWaitApply(Arc), } impl fmt::Debug for PeerMsg { @@ -569,10 +585,6 @@ impl fmt::Debug for PeerMsg { PeerMsg::HeartbeatPd => write!(fmt, "HeartbeatPd"), PeerMsg::UpdateReplicationMode => write!(fmt, "UpdateReplicationMode"), PeerMsg::Destroy(peer_id) => write!(fmt, "Destroy {}", peer_id), - PeerMsg::UpdateRegionForUnsafeRecover(region) => { - write!(fmt, "Update Region {} to {:?}", region.get_id(), region) - } - PeerMsg::UnsafeRecoveryWaitApply(counter) => write!(fmt, "WaitApply {:?}", *counter), } } } @@ -617,7 +629,11 @@ where #[cfg(any(test, feature = "testexport"))] Validate(Box), - CreatePeer(metapb::Region), + UnsafeRecoveryReport(pdpb::StoreReport), + UnsafeRecoveryCreatePeer { + syncer: UnsafeRecoveryExecutePlanSyncer, + create: metapb::Region, + }, } impl fmt::Debug for StoreMsg @@ -646,7 +662,10 @@ where StoreMsg::Validate(_) => write!(fmt, "Validate config"), StoreMsg::UpdateReplicationMode(_) => write!(fmt, "UpdateReplicationMode"), StoreMsg::LatencyInspect { .. } => write!(fmt, "LatencyInspect"), - StoreMsg::CreatePeer(_) => write!(fmt, "CreatePeer"), + StoreMsg::UnsafeRecoveryReport(..) => write!(fmt, "UnsafeRecoveryReport"), + StoreMsg::UnsafeRecoveryCreatePeer { .. 
} => { + write!(fmt, "UnsafeRecoveryCreatePeer") + } } } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9c3520ba330..d9c635a97db 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -6,7 +6,7 @@ use std::collections::VecDeque; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; -use std::{cmp, mem, u64, usize}; +use std::{cmp, fmt, mem, u64, usize}; use bitflags::bitflags; use bytes::Bytes; @@ -17,10 +17,11 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; +use getset::Getters; use kvproto::errorpb; use kvproto::kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp, LockInfo}; use kvproto::metapb::{self, PeerRole}; -use kvproto::pdpb::PeerStats; +use kvproto::pdpb::{self, PeerStats}; use kvproto::raft_cmdpb::{ self, AdminCmdType, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, PutRequest, RaftCmdRequest, RaftCmdResponse, Request, TransferLeaderRequest, TransferLeaderResponse, @@ -32,7 +33,7 @@ use kvproto::replication_modepb::{ DrAutoSyncState, RegionReplicationState, RegionReplicationStatus, ReplicationMode, }; use parking_lot::RwLockUpgradableReadGuard; -use protobuf::Message; +use protobuf::{Message, RepeatedField}; use raft::eraftpb::{self, ConfChangeType, Entry, EntryType, MessageType}; use raft::{ self, Changer, GetEntriesContext, LightReady, ProgressState, ProgressTracker, RawNode, Ready, @@ -49,11 +50,11 @@ use crate::errors::RAFTSTORE_IS_BUSY; use crate::store::async_io::write::WriteMsg; use crate::store::async_io::write_router::WriteRouter; use crate::store::fsm::apply::CatchUpLogs; -use crate::store::fsm::store::PollContext; +use crate::store::fsm::store::{PollContext, RaftRouter}; use crate::store::fsm::{apply, Apply, ApplyMetrics, ApplyTask, Proposal}; use crate::store::hibernate_state::GroupState; use crate::store::memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}; -use crate::store::msg::RaftCommand; +use crate::store::msg::{PeerMsg, RaftCommand, SignificantMsg, StoreMsg}; use crate::store::txn_ext::LocksStatus; use crate::store::util::{admin_cmd_epoch_lookup, RegionReadProgress}; use crate::store::worker::{ @@ -454,6 +455,226 @@ pub struct ReadyResult { pub has_write_ready: bool, } +#[derive(Debug)] +/// ForceLeader process would be: +/// 1. If it's hibernated, enter wait ticks state, and wake up the peer +/// 2. Enter pre force leader state, become candidate and send request vote to all peers +/// 3. Wait for the responses of the request vote, no reject should be received. +/// 4. Enter force leader state, become leader without leader lease +/// 5. Execute recovery plan(some remove-peer commands) +/// 6. After the plan steps are all applied, exit force leader state +pub enum ForceLeaderState { + WaitTicks { + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + ticks: usize, + }, + PreForceLeader { + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + }, + ForceLeader { + time: TiInstant, + failed_stores: HashSet, + }, +} + +// Following shared states are used while reporting to PD for unsafe recovery and shared among +// all the regions per their life cycle. +// The work flow is like: +// 1. 
report phase +// start_unsafe_recovery_report +// -> broadcast wait-apply commands +// -> wait for all the peers' apply indices meet their targets +// -> broadcast fill out report commands +// -> wait for all the peers fill out the reports for themselves +// -> send a store report (through store heartbeat) +// 2. force leader phase +// dispatch force leader commands +// -> wait for all the peers that received the command become force leader +// -> start_unsafe_recovery_report +// 3. plan execution phase +// dispatch recovery plans +// -> wait for all the creates, deletes and demotes to finish, for the demotes, +// procedures are: +// -> exit joint state if it is already in joint state +// -> demote failed voters, and promote self to be a voter if it is a learner +// -> exit joint state +// -> start_unsafe_recovery_report + +// Intends to use RAII to sync unsafe recovery procedures between peers, in addition to that, +// it uses a closure to avoid having a raft router as a member variable, which is statically +// dispatched, thus needs to propagate the generics everywhere. +pub struct InvokeClosureOnDrop(Box); + +impl fmt::Debug for InvokeClosureOnDrop { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "InvokeClosureOnDrop") + } +} + +impl Drop for InvokeClosureOnDrop { + fn drop(&mut self) { + self.0(); + } +} + +pub fn start_unsafe_recovery_report( + router: &RaftRouter, + report_id: u64, + exit_force_leader_before_reporting: bool, +) { + if exit_force_leader_before_reporting { + router.broadcast_normal(|| PeerMsg::SignificantMsg(SignificantMsg::ExitForceLeaderState)); + } + let wait_apply = UnsafeRecoveryWaitApplySyncer::new(report_id, router.clone()); + router.broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::UnsafeRecoveryWaitApply(wait_apply.clone())) + }); +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryForceLeaderSyncer(Arc); + +impl UnsafeRecoveryForceLeaderSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let inner = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, force leader finished."); + let router_ptr = thread_safe_router.lock().unwrap(); + start_unsafe_recovery_report(&*router_ptr, report_id, false); + })); + UnsafeRecoveryForceLeaderSyncer(Arc::new(inner)) + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryExecutePlanSyncer { + _closure: Arc, + abort: Arc>, +} + +impl UnsafeRecoveryExecutePlanSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let abort = Arc::new(Mutex::new(false)); + let abort_clone = abort.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, plan execution finished"); + if *abort_clone.lock().unwrap() { + warn!("Unsafe recovery, plan execution aborted"); + return; + } + let router_ptr = thread_safe_router.lock().unwrap(); + start_unsafe_recovery_report(&*router_ptr, report_id, true); + })); + UnsafeRecoveryExecutePlanSyncer { + _closure: Arc::new(closure), + abort, + } + } + + pub fn abort(&self) { + *self.abort.lock().unwrap() = true; + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryWaitApplySyncer { + _closure: Arc, + abort: Arc>, +} + +impl UnsafeRecoveryWaitApplySyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let abort = Arc::new(Mutex::new(false)); + let abort_clone = abort.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || 
{ + info!("Unsafe recovery, wait apply finished"); + if *abort_clone.lock().unwrap() { + warn!("Unsafe recovery, wait apply aborted"); + return; + } + let router_ptr = thread_safe_router.lock().unwrap(); + let fill_out_report = + UnsafeRecoveryFillOutReportSyncer::new(report_id, (*router_ptr).clone()); + (*router_ptr).broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::UnsafeRecoveryFillOutReport( + fill_out_report.clone(), + )) + }); + })); + UnsafeRecoveryWaitApplySyncer { + _closure: Arc::new(closure), + abort, + } + } + + pub fn abort(&self) { + *self.abort.lock().unwrap() = true; + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryFillOutReportSyncer { + _closure: Arc, + reports: Arc>>, +} + +impl UnsafeRecoveryFillOutReportSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let reports = Arc::new(Mutex::new(vec![])); + let reports_clone = reports.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, peer reports collected"); + let mut store_report = pdpb::StoreReport::default(); + { + let reports_ptr = reports_clone.lock().unwrap(); + store_report.set_peer_reports(RepeatedField::from_vec((*reports_ptr).to_vec())); + } + store_report.set_step(report_id); + let router_ptr = thread_safe_router.lock().unwrap(); + if let Err(e) = (*router_ptr).send_control(StoreMsg::UnsafeRecoveryReport(store_report)) + { + error!("Unsafe recovery, fail to schedule reporting"; "err" => ?e); + } + })); + UnsafeRecoveryFillOutReportSyncer { + _closure: Arc::new(closure), + reports, + } + } + + pub fn report_for_self(&self, report: pdpb::PeerReport) { + let mut reports_ptr = self.reports.lock().unwrap(); + (*reports_ptr).push(report); + } +} + +pub enum UnsafeRecoveryState { + // Stores the state that is necessary for the wait apply stage of unsafe recovery process. + // This state is set by the peer fsm. Once set, it is checked every time this peer applies a + // new entry or a snapshot, if the target index is met, this state is reset / droppeds. The + // syncer holds a reference counted inner object that is shared among all the peers, whose + // destructor triggers the next step of unsafe recovery report process. + WaitApply { + target_index: u64, + syncer: UnsafeRecoveryWaitApplySyncer, + }, + DemoteFailedVoters { + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + target_index: u64, + // Failed regions may be stuck in joint state, if that is the case, we need to ask the + // region to exit joint state before proposing the demotion. + demote_after_exit: bool, + }, + Destroy(UnsafeRecoveryExecutePlanSyncer), +} + +#[derive(Getters)] pub struct Peer where EK: KvEngine, @@ -478,6 +699,7 @@ where proposals: ProposalQueue, leader_missing_time: Option, + #[getset(get = "pub")] leader_lease: Lease, pending_reads: ReadIndexQueue, @@ -491,6 +713,15 @@ where /// 2. all read requests must be rejected. pub pending_remove: bool, + /// Force leader state is only used in online recovery when the majority of + /// peers are missing. In this state, it forces one peer to become leader out + /// of accordance with Raft election rule, and forbids any read/write proposals. + /// With that, we can further propose remove failed-nodes conf-change, to make + /// the Raft group forms majority and works normally later on. + /// + /// For details, see the comment of `ForceLeaderState`. + pub force_leader: Option, + /// Record the instants of peers being added into the configuration. 
/// Remove them after they are not pending any more. pub peers_start_pending_time: Vec<(u64, Instant)>, @@ -610,6 +841,7 @@ where pub region_buckets: Option, /// lead_transferee if the peer is in a leadership transferring. pub lead_transferee: u64, + pub unsafe_recovery_state: Option, } impl Peer @@ -682,6 +914,7 @@ where leader_unreachable: false, pending_remove: false, should_wake_up: false, + force_leader: None, pending_merge_state: None, want_rollback_merge_peers: HashSet::default(), pending_request_snapshot_count: Arc::new(AtomicUsize::new(0)), @@ -736,6 +969,7 @@ where apply_snap_ctx: None, region_buckets: None, lead_transferee: raft::INVALID_ID, + unsafe_recovery_state: None, }; // If this region has only one peer and I am the one, campaign directly. @@ -1079,6 +1313,10 @@ where res.reason = "transfer leader"; return res; } + if self.force_leader.is_some() { + res.reason = "force leader"; + return res; + } // Unapplied entries can change the configuration of the group. if self.get_store().applied_index() < last_index { res.reason = "unapplied"; @@ -1605,6 +1843,39 @@ where false } + pub fn maybe_force_forward_commit_index(&mut self) -> bool { + let failed_stores = match &self.force_leader { + Some(ForceLeaderState::ForceLeader { failed_stores, .. }) => failed_stores, + _ => unreachable!(), + }; + + let region = self.region(); + let mut replicated_idx = self.raft_group.raft.raft_log.persisted; + for (peer_id, p) in self.raft_group.raft.prs().iter() { + let store_id = region + .get_peers() + .iter() + .find(|p| p.get_id() == *peer_id) + .unwrap() + .get_store_id(); + if failed_stores.contains(&store_id) { + continue; + } + if replicated_idx > p.matched { + replicated_idx = p.matched; + } + } + + if self.raft_group.store().term(replicated_idx).unwrap_or(0) < self.term() { + // do not commit logs of previous term directly + return false; + } + + self.raft_group.raft.raft_log.committed = + std::cmp::max(self.raft_group.raft.raft_log.committed, replicated_idx); + true + } + pub fn check_stale_state(&mut self, ctx: &mut PollContext) -> StaleState { if self.is_leader() { // Leaders always have valid state. @@ -1950,6 +2221,11 @@ where self.term(), self.raft_group.store().region(), ); + + if self.unsafe_recovery_state.is_some() { + debug!("unsafe recovery finishes applying a snapshot"); + self.unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false); + } } // If `apply_snap_ctx` is none, it means this snapshot does not // come from the ready but comes from the unfinished snapshot task @@ -2495,6 +2771,11 @@ where let persist_index = self.raft_group.raft.raft_log.persisted; self.mut_store().update_cache_persisted(persist_index); + + if let Some(ForceLeaderState::ForceLeader { .. }) = self.force_leader { + // forward commit index, the committed entries will be applied in the next raft base tick round + self.maybe_force_forward_commit_index(); + } } if self.apply_snap_ctx.is_some() && self.unpersisted_readies.is_empty() { @@ -2533,6 +2814,10 @@ where self.report_commit_log_duration(pre_commit_index, &ctx.raft_metrics); let persist_index = self.raft_group.raft.raft_log.persisted; + if let Some(ForceLeaderState::ForceLeader { .. 
}) = self.force_leader { + // forward commit index, the committed entries will be applied in the next raft base tick round + self.maybe_force_forward_commit_index(); + } self.mut_store().update_cache_persisted(persist_index); self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics.ready); @@ -2841,6 +3126,13 @@ where "peer_id" => self.peer.get_id(), ); None + } else if self.force_leader.is_some() { + debug!( + "prevents renew lease while in force leader state"; + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + ); + None } else { self.leader_lease.renew(ts); let term = self.term(); @@ -3080,7 +3372,7 @@ where let kind = ConfChangeKind::confchange_kind(change_peers.len()); if kind == ConfChangeKind::LeaveJoint { - if self.peer.get_role() == PeerRole::DemotingVoter { + if self.peer.get_role() == PeerRole::DemotingVoter && !self.is_force_leader() { return Err(box_err!( "{} ignore leave joint command that demoting leader", self.tag @@ -3153,7 +3445,7 @@ where let promoted_commit_index = after_progress.maximal_committed_index().0; if current_progress.is_singleton() // It's always safe if there is only one node in the cluster. - || promoted_commit_index >= self.get_store().truncated_index() + || promoted_commit_index >= self.get_store().truncated_index() || self.force_leader.is_some() { return Ok(()); } @@ -3778,6 +4070,16 @@ where poll_ctx: &mut PollContext, mut req: RaftCmdRequest, ) -> Result> { + // Should not propose normal in force leader state. + // In `pre_propose_raft_command`, it rejects all the requests expect conf-change + // if in force leader state. + if self.force_leader.is_some() { + panic!( + "{} propose normal in force leader state {:?}", + self.tag, self.force_leader + ); + }; + if (self.pending_merge_state.is_some() && req.get_admin_request().get_cmd_type() != AdminCmdType::RollbackMerge) || (self.prepare_merge_fence > 0 @@ -4139,6 +4441,10 @@ where resp } + pub fn voters(&self) -> raft::util::Union<'_> { + self.raft_group.raft.prs().conf().voters().ids() + } + pub fn term(&self) -> u64 { self.raft_group.raft.term } @@ -4370,6 +4676,34 @@ where .propose_check_epoch(cmd, self.term()) .is_none() } + + #[inline] + pub fn is_force_leader(&self) -> bool { + matches!( + self.force_leader, + Some(ForceLeaderState::ForceLeader { .. }) + ) + } + + pub fn unsafe_recovery_maybe_finish_wait_apply(&mut self, force: bool) { + if let Some(UnsafeRecoveryState::WaitApply { target_index, .. 
}) = + &self.unsafe_recovery_state + { + if self.raft_group.raft.raft_log.applied >= *target_index || force { + if self.is_force_leader() { + info!( + "Unsafe recovery, finish wait apply"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "target_index" => target_index, + "applied" => self.raft_group.raft.raft_log.applied, + "force" => force, + ); + } + self.unsafe_recovery_state = None; + } + } + } } #[derive(Default, Debug)] diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index d056e0d5497..d8640fc979b 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1093,6 +1093,11 @@ where self.last_term } + #[inline] + pub fn raft_state(&self) -> &RaftLocalState { + &self.raft_state + } + #[inline] pub fn applied_index(&self) -> u64 { self.apply_state.get_applied_index() diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 9043982901c..2d0f5dd66c0 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -780,10 +780,6 @@ impl< } } -pub fn integration_on_half_fail_quorum_fn(voters: usize) -> usize { - (voters + 1) / 2 + 1 -} - #[derive(PartialEq, Eq, Debug)] pub enum ConfChangeKind { // Only contains one configuration change @@ -1935,16 +1931,6 @@ mod tests { } } - #[test] - fn test_integration_on_half_fail_quorum_fn() { - let voters = vec![1, 2, 3, 4, 5, 6, 7]; - let quorum = vec![2, 2, 3, 3, 4, 4, 5]; - for (voter_count, expected_quorum) in voters.into_iter().zip(quorum) { - let quorum = super::integration_on_half_fail_quorum_fn(voter_count); - assert_eq!(quorum, expected_quorum); - } - } - #[test] fn test_is_region_initialized() { let mut region = metapb::Region::default(); diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 9c46daf201e..1f2d357b80c 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -23,7 +23,8 @@ pub use self::cleanup_sst::{Runner as CleanupSSTRunner, Task as CleanupSSTTask}; pub use self::compact::{Runner as CompactRunner, Task as CompactTask}; pub use self::consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}; pub use self::pd::{ - FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, Task as PdTask, + new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, + Runner as PdRunner, Task as PdTask, }; pub use self::query_stats::QueryStats; pub use self::raftlog_fetch::{Runner as RaftlogFetchRunner, Task as RaftlogFetchTask}; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index bdca2d161f5..4652a35876a 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -3,12 +3,12 @@ use std::cmp::Ordering as CmpOrdering; use std::fmt::{self, Display, Formatter}; use std::sync::mpsc::{self, Receiver, Sender}; -use std::sync::{atomic::AtomicUsize, atomic::Ordering, Arc}; +use std::sync::{atomic::Ordering, Arc}; use std::thread::{Builder, JoinHandle}; use std::time::{Duration, Instant}; use std::{cmp, io}; -use engine_traits::{KvEngine, RaftEngine, CF_RAFT}; +use engine_traits::{KvEngine, RaftEngine}; #[cfg(feature = "failpoints")] use fail::fail_point; use kvproto::kvrpcpb::DiskFullOpt; @@ -16,7 +16,7 @@ use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, 
ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, SplitRequest, }; -use kvproto::raft_serverpb::{PeerState, RaftMessage, RegionLocalState}; +use kvproto::raft_serverpb::RaftMessage; use kvproto::replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}; use kvproto::{metapb, pdpb}; use ordered_float::OrderedFloat; @@ -26,31 +26,31 @@ use yatp::Remote; use crate::store::cmd_resp::new_error; use crate::store::metrics::*; -use crate::store::util::{ - is_epoch_stale, ConfChangeKind, KeysInfoFormatter, LatencyInspector, RaftstoreDuration, -}; +use crate::store::util::{is_epoch_stale, KeysInfoFormatter, LatencyInspector, RaftstoreDuration}; use crate::store::worker::query_stats::QueryStats; use crate::store::worker::split_controller::{SplitInfo, TOP_N}; use crate::store::worker::{AutoSplitController, ReadStats, WriteStats}; use crate::store::{ + peer::{UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryForceLeaderSyncer}, + transport::SignificantRouter, Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, - RegionReadProgressRegistry, SnapManager, StoreInfo, StoreMsg, TxnExt, + RegionReadProgressRegistry, SignificantMsg, SnapManager, StoreInfo, StoreMsg, TxnExt, }; use collections::HashMap; +use collections::HashSet; use concurrency_manager::ConcurrencyManager; use futures::compat::Future01CompatExt; use futures::FutureExt; use pd_client::metrics::*; use pd_client::{Error, PdClient, RegionStat}; -use protobuf::Message; use resource_metering::{Collector, CollectorGuard, CollectorRegHandle, RawRecords}; use tikv_util::metrics::ThreadInfoStatistics; use tikv_util::time::UnixSecs; use tikv_util::timer::GLOBAL_TIMER_HANDLE; use tikv_util::topn::TopN; use tikv_util::worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}; -use tikv_util::{box_err, box_try, debug, error, info, thd_name, warn}; +use tikv_util::{box_err, debug, error, info, thd_name, warn}; type RecordPairVec = Vec; @@ -135,7 +135,7 @@ where StoreHeartbeat { stats: pdpb::StoreStats, store_info: StoreInfo, - send_detailed_report: bool, + report: Option, dr_autosync_status: Option, }, ReportBatchSplit { @@ -1024,7 +1024,7 @@ where &mut self, mut stats: pdpb::StoreStats, store_info: StoreInfo, - send_detailed_report: bool, + store_report: Option, dr_autosync_status: Option, ) { let store_stats = self.engine_store_server_helper.handle_compute_store_stats(); @@ -1117,119 +1117,72 @@ where let slow_score = self.slow_score.get(); stats.set_slow_score(slow_score as u64); - let mut optional_report = None; - if send_detailed_report { - let mut store_report = pdpb::StoreReport::new(); - store_info - .kv_engine - .scan_cf( - CF_RAFT, - keys::REGION_META_MIN_KEY, - keys::REGION_META_MAX_KEY, - false, - |key, value| { - let (_, suffix) = box_try!(keys::decode_region_meta_key(key)); - if suffix != keys::REGION_STATE_SUFFIX { - return Ok(true); - } - - let mut region_local_state = RegionLocalState::default(); - region_local_state.merge_from_bytes(value)?; - if region_local_state.get_state() == PeerState::Tombstone { - return Ok(true); - } - let raft_local_state = match store_info - .raft_engine - .get_raft_state(region_local_state.get_region().get_id()) - .unwrap() - { - None => return Ok(true), - Some(value) => value, - }; - let mut peer_report = pdpb::PeerReport::new(); - peer_report.set_region_state(region_local_state); - peer_report.set_raft_state(raft_local_state); - store_report.mut_peer_reports().push(peer_report); - Ok(true) - }, - ) - .unwrap(); - optional_report = Some(store_report); - } let 
router = self.router.clone(); - let scheduler = self.scheduler.clone(); - let stats_copy = stats.clone(); - let resp = - self.pd_client - .store_heartbeat(stats, optional_report, dr_autosync_status.clone()); + let resp = self + .pd_client + .store_heartbeat(stats, store_report, dr_autosync_status); let f = async move { match resp.await { Ok(mut resp) => { if let Some(status) = resp.replication_status.take() { let _ = router.send_control(StoreMsg::UpdateReplicationMode(status)); } - if resp.get_require_detailed_report() { - // This store needs to report detailed info of hosted regions to PD. - // - // The info has to be up to date, meaning that all committed changes til now have to be applied before the report is sent. - // The entire process may include: - // 1. `broadcast_normal` "wait apply" messsages to all peers. - // 2. `on_unsafe_recovery_wait_apply` examines whether the peer have not-yet-applied entries, if so, memorize the target index. - // 3. `on_apply_res` checks whether entries before the "unsafe recovery report target commit index" have all been applied. - // The one who finally finds out the number of remaining tasks is 0 schedules an unsafe recovery reporting store heartbeat. - info!("required to send detailed report in the next heartbeat"); - // Init the counter with 1 in case the msg processing is faster than the distributing thus cause FSMs race to send a report. - let counter = Arc::new(AtomicUsize::new(1)); - let counter_clone = counter.clone(); - router.broadcast_normal(|| { - let _ = counter_clone.fetch_add(1, Ordering::Relaxed); - PeerMsg::UnsafeRecoveryWaitApply(counter_clone.clone()) - }); - // Reporting needs to be triggered here in case there is no message to be sent or messages processing finished before above function returns. 
- if counter.fetch_sub(1, Ordering::Relaxed) == 1 { - let task = Task::StoreHeartbeat { - stats: stats_copy, - store_info, - send_detailed_report: true, - dr_autosync_status, - }; - if let Err(e) = scheduler.schedule(task) { - error!("notify pd failed"; "err" => ?e); + if let Some(mut plan) = resp.recovery_plan.take() { + info!("Unsafe recovery, received a recovery plan"); + if plan.has_force_leader() { + let mut failed_stores = HashSet::default(); + for failed_store in plan.get_force_leader().get_failed_stores() { + failed_stores.insert(*failed_store); } - } - } else if resp.has_plan() { - info!("asked to execute recovery plan"); - for create in resp.get_plan().get_creates() { - info!("asked to create region"; "region" => ?create); - if let Err(e) = - router.send_control(StoreMsg::CreatePeer(create.clone())) - { - error!("fail to send creat peer message for recovery"; "err" => ?e); + let syncer = UnsafeRecoveryForceLeaderSyncer::new( + plan.get_step(), + router.clone(), + ); + for region in plan.get_force_leader().get_enter_force_leaders() { + if let Err(e) = router.significant_send( + *region, + SignificantMsg::EnterForceLeaderState { + syncer: syncer.clone(), + failed_stores: failed_stores.clone(), + }, + ) { + error!("fail to send force leader message for recovery"; "err" => ?e); + } } - } - for delete in resp.get_plan().get_deletes() { - info!("asked to delete peer"; "peer" => delete); - if let Err(e) = router.force_send(*delete, PeerMsg::Destroy(*delete)) { - error!("fail to send delete peer message for recovery"; "err" => ?e); + } else { + let syncer = UnsafeRecoveryExecutePlanSyncer::new( + plan.get_step(), + router.clone(), + ); + for create in plan.take_creates().into_iter() { + if let Err(e) = + router.send_control(StoreMsg::UnsafeRecoveryCreatePeer { + syncer: syncer.clone(), + create, + }) + { + error!("fail to send create peer message for recovery"; "err" => ?e); + } } - } - for update in resp.get_plan().get_updates() { - info!("asked to update region's range"; "region" => ?update); - if let Err(e) = router.force_send( - update.get_id(), - PeerMsg::UpdateRegionForUnsafeRecover(update.clone()), - ) { - error!("fail to send update range message for recovery"; "err" => ?e); + for delete in plan.take_tombstones().into_iter() { + if let Err(e) = router.significant_send( + delete, + SignificantMsg::UnsafeRecoveryDestroy(syncer.clone()), + ) { + error!("fail to send delete peer message for recovery"; "err" => ?e); + } + } + for mut demote in plan.take_demotes().into_iter() { + if let Err(e) = router.significant_send( + demote.get_region_id(), + SignificantMsg::UnsafeRecoveryDemoteFailedVoters { + syncer: syncer.clone(), + failed_voters: demote.take_failed_voters().into_vec(), + }, + ) { + error!("fail to send update peer list message for recovery"; "err" => ?e); + } } - } - let task = Task::StoreHeartbeat { - stats: stats_copy, - store_info, - send_detailed_report: true, - dr_autosync_status, - }; - if let Err(e) = scheduler.schedule(task) { - error!("notify pd failed"; "err" => ?e); } } } @@ -1358,7 +1311,6 @@ where "try to change peer"; "region_id" => region_id, "changes" => ?change_peer_v2.get_changes(), - "kind" => ?ConfChangeKind::confchange_kind(change_peer_v2.get_changes().len()), ); let req = new_change_peer_v2_request(change_peer_v2.take_changes().into()); send_admin_request(&router, region_id, epoch, peer, req, Callback::None, Default::default()); @@ -1787,14 +1739,9 @@ where Task::StoreHeartbeat { stats, store_info, - send_detailed_report, - dr_autosync_status, - } => 
self.handle_store_heartbeat( - stats, - store_info, - send_detailed_report, + report, dr_autosync_status, - ), + } => self.handle_store_heartbeat(stats, store_info, report, dr_autosync_status), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), Task::ValidatePeer { region, peer } => self.handle_validate_peer(region, peer), Task::ReadStats { read_stats } => self.handle_read_stats(read_stats), @@ -1889,7 +1836,7 @@ fn new_change_peer_request(change_type: ConfChangeType, peer: metapb::Peer) -> A req } -fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { +pub fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::ChangePeerV2); let change_peer_reqs = changes diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index 05d10f03dfb..067983d2805 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -89,6 +89,7 @@ impl Runner { Ok(s.and_then(|s| s.parse().ok()).unwrap_or(0)) }); let deleted = box_try!(self.engines.raft.batch_gc(regions)); + fail::fail_point!("worker_gc_raft_log_finished", |_| { Ok(deleted) }); Ok(deleted) } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 55f5b991e7c..833920be724 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1460,6 +1460,42 @@ impl Cluster { .unwrap(); } + pub fn enter_force_leader(&mut self, region_id: u64, store_id: u64, failed_stores: Vec) { + let mut plan = pdpb::RecoveryPlan::default(); + let mut force_leader = pdpb::ForceLeader::default(); + force_leader.set_enter_force_leaders([region_id].to_vec()); + force_leader.set_failed_stores(failed_stores.to_vec()); + plan.set_force_leader(force_leader); + // Triggers the unsafe recovery plan execution. + self.pd_client.must_set_unsafe_recovery_plan(store_id, plan); + self.must_send_store_heartbeat(store_id); + } + + pub fn must_enter_force_leader( + &mut self, + region_id: u64, + store_id: u64, + failed_stores: Vec, + ) { + self.enter_force_leader(region_id, store_id, failed_stores); + let mut store_report = None; + for _ in 0..20 { + store_report = self.pd_client.must_get_store_report(store_id); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + } + + pub fn exit_force_leader(&mut self, region_id: u64, store_id: u64) { + let router = self.sim.rl().get_router(store_id).unwrap(); + router + .significant_send(region_id, SignificantMsg::ExitForceLeaderState) + .unwrap(); + } + pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { let mut try_cnt = 0; let split_count = self.pd_client.get_split_count(); @@ -1700,57 +1736,6 @@ impl Cluster { StoreRouter::send(&router, StoreMsg::Tick(StoreTick::PdStoreHeartbeat)).unwrap(); } - pub fn must_update_region_for_unsafe_recover(&mut self, node_id: u64, region: &metapb::Region) { - let router = self.sim.rl().get_router(node_id).unwrap(); - let mut try_cnt = 0; - loop { - if try_cnt % 50 == 0 { - // In case the message is ignored, re-send it every 50 tries. 
- router - .force_send( - region.get_id(), - PeerMsg::UpdateRegionForUnsafeRecover(region.clone()), - ) - .unwrap(); - } - if let Ok(Some(current)) = block_on(self.pd_client.get_region_by_id(region.get_id())) { - if current.get_start_key() == region.get_start_key() - && current.get_end_key() == region.get_end_key() - { - return; - } - } - if try_cnt > 500 { - panic!("region {:?} is not updated", region); - } - try_cnt += 1; - sleep_ms(20); - } - } - - pub fn must_recreate_region_for_unsafe_recover( - &mut self, - node_id: u64, - region: &metapb::Region, - ) { - let router = self.sim.rl().get_router(node_id).unwrap(); - let mut try_cnt = 0; - loop { - if try_cnt % 50 == 0 { - // In case the message is ignored, re-send it every 50 tries. - StoreRouter::send(&router, StoreMsg::CreatePeer(region.clone())).unwrap(); - } - if let Ok(Some(_)) = block_on(self.pd_client.get_region_by_id(region.get_id())) { - return; - } - if try_cnt > 250 { - panic!("region {:?} is not created", region); - } - try_cnt += 1; - sleep_ms(20); - } - } - pub fn gc_peer( &mut self, region_id: u64, diff --git a/components/test_raftstore/src/pd.rs b/components/test_raftstore/src/pd.rs index a436dae8d82..ab494178820 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_raftstore/src/pd.rs @@ -325,7 +325,8 @@ struct PdCluster { pub check_merge_target_integrity: bool, unsafe_recovery_require_report: bool, - unsafe_recovery_store_reported: HashMap, + unsafe_recovery_store_reports: HashMap, + unsafe_recovery_plan: HashMap, } impl PdCluster { @@ -360,7 +361,8 @@ impl PdCluster { region_replication_status: HashMap::default(), check_merge_target_integrity: true, unsafe_recovery_require_report: false, - unsafe_recovery_store_reported: HashMap::default(), + unsafe_recovery_store_reports: HashMap::default(), + unsafe_recovery_plan: HashMap::default(), } } @@ -517,11 +519,11 @@ impl PdCluster { region.get_region_epoch().clone(), ); assert!(end_key > start_key); - let created_by_unsafe_recover = (!start_key.is_empty() || !end_key.is_empty()) - && incoming_epoch.get_version() == 1 - && incoming_epoch.get_conf_ver() == 1; + let created_by_unsafe_recovery = (!start_key.is_empty() || !end_key.is_empty()) + && incoming_epoch.get_version() == 0 + && incoming_epoch.get_conf_ver() == 0; let overlaps = self.get_overlap(start_key, end_key); - if created_by_unsafe_recover { + if created_by_unsafe_recovery { // Allow recreated region by unsafe recover to overwrite other regions with a "older" // epoch. 
return Ok(overlaps); @@ -727,10 +729,14 @@ impl PdCluster { self.min_resolved_ts } - fn handle_store_heartbeat(&mut self) -> Result { + fn handle_store_heartbeat(&mut self, store_id: u64) -> Result { let mut resp = pdpb::StoreHeartbeatResponse::default(); resp.set_require_detailed_report(self.unsafe_recovery_require_report); self.unsafe_recovery_require_report = false; + if let Some((_, plan)) = self.unsafe_recovery_plan.remove_entry(&store_id) { + debug!("Unsafe recovery, sending recovery plan"; "store_id" => store_id, "plan" => ?plan); + resp.set_recovery_plan(plan); + } Ok(resp) } @@ -739,19 +745,16 @@ impl PdCluster { self.unsafe_recovery_require_report = require_report; } - fn get_store_reported(&self, store_id: &u64) -> i32 { - *self - .unsafe_recovery_store_reported - .get(store_id) - .unwrap_or(&0) + fn set_unsafe_recovery_plan(&mut self, store_id: u64, recovery_plan: pdpb::RecoveryPlan) { + self.unsafe_recovery_plan.insert(store_id, recovery_plan); } - fn store_reported_inc(&mut self, store_id: u64) { - let reported = self - .unsafe_recovery_store_reported - .entry(store_id) - .or_insert(0); - *reported += 1; + fn get_store_report(&mut self, store_id: u64) -> Option { + self.unsafe_recovery_store_reports.remove(&store_id) + } + + fn set_store_report(&mut self, store_id: u64, report: pdpb::StoreReport) { + let _ = self.unsafe_recovery_store_reports.insert(store_id, report); } } @@ -1308,8 +1311,12 @@ impl TestPdClient { self.cluster.wl().set_require_report(require_report); } - pub fn must_get_store_reported(&self, store_id: &u64) -> i32 { - self.cluster.rl().get_store_reported(store_id) + pub fn must_get_store_report(&self, store_id: u64) -> Option { + self.cluster.wl().get_store_report(store_id) + } + + pub fn must_set_unsafe_recovery_plan(&self, store_id: u64, plan: pdpb::RecoveryPlan) { + self.cluster.wl().set_unsafe_recovery_plan(store_id, plan) } } @@ -1462,11 +1469,6 @@ impl PdClient for TestPdClient { { let cluster1 = Arc::clone(&self.cluster); let timer = self.timer.clone(); - { - if let Err(e) = self.cluster.try_write() { - println!("try write {:?}", e); - } - } let mut cluster = self.cluster.wl(); let store = cluster .stores @@ -1600,11 +1602,11 @@ impl PdClient for TestPdClient { cluster.store_stats.insert(store_id, stats); - if report.is_some() { - cluster.store_reported_inc(store_id); + if let Some(store_report) = report { + cluster.set_store_report(store_id, store_report); } - let mut resp = cluster.handle_store_heartbeat().unwrap(); + let mut resp = cluster.handle_store_heartbeat(store_id).unwrap(); if let Some(ref status) = cluster.replication_status { resp.set_replication_status(status.clone()); diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index 293e3620f5c..d3a9d2a6d43 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -1,16 +1,18 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
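The rewritten failpoint tests below replace the old Mutex/Condvar handshake with a pair of bounded channels: the hooked code signals that it reached the failpoint, then blocks on a receive until the test drops the sender. A standalone sketch of that handshake follows, using std::sync::mpsc instead of tikv_util::mpsc and a plain thread instead of fail::cfg_callback.

use std::sync::mpsc;
use std::thread;
use std::time::Duration;

fn main() {
    let (triggered_tx, triggered_rx) = mpsc::sync_channel::<()>(1);
    let (released_tx, released_rx) = mpsc::sync_channel::<()>(1);

    // Stands in for the closure registered via fail::cfg_callback("on_handle_apply_store_1", ...).
    let blocked_worker = thread::spawn(move || {
        let _ = triggered_tx.send(()); // tell the test we reached the blocked section
        let _ = released_rx.recv(); // returns Err(RecvError) once released_tx is dropped
    });

    // Test side: wait until the worker hits the "failpoint" ...
    triggered_rx
        .recv_timeout(Duration::from_secs(1))
        .expect("worker should reach the blocked section");
    // ... here the real tests assert that no store report shows up while apply is blocked ...
    drop(released_tx); // unblock the worker
    blocked_worker.join().unwrap();
}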
use std::iter::FromIterator; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::Arc; +use std::time::Duration; use futures::executor::block_on; +use kvproto::{metapb, pdpb}; use pd_client::PdClient; use raftstore::store::util::find_peer; use test_raftstore::*; +use tikv_util::{config::ReadableDuration, mpsc}; -#[allow(clippy::mutex_atomic)] #[test] -fn test_unsafe_recover_send_report() { +fn test_unsafe_recovery_send_report() { let mut cluster = new_server_cluster(0, 3); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); @@ -27,68 +29,438 @@ fn test_unsafe_recover_send_report() { cluster.put(b"random_key1", b"random_val1").unwrap(); // Blocks the raft apply process on store 1 entirely . - let apply_triggered_pair = Arc::new((Mutex::new(false), Condvar::new())); - let apply_triggered_pair2 = Arc::clone(&apply_triggered_pair); - let apply_released_pair = Arc::new((Mutex::new(false), Condvar::new())); - let apply_released_pair2 = Arc::clone(&apply_released_pair); + let (apply_triggered_tx, apply_triggered_rx) = mpsc::bounded::<()>(1); + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); fail::cfg_callback("on_handle_apply_store_1", move || { - { - let (lock, cvar) = &*apply_triggered_pair2; - let mut triggered = lock.lock().unwrap(); - *triggered = true; - cvar.notify_one(); - } - { - let (lock2, cvar2) = &*apply_released_pair2; - let mut released = lock2.lock().unwrap(); - while !*released { - released = cvar2.wait(released).unwrap(); - } - } + let _ = apply_triggered_tx.send(()); + let _ = apply_released_rx.recv(); }) .unwrap(); // Mannually makes an update, and wait for the apply to be triggered, to simulate "some entries are commited but not applied" scenario. cluster.put(b"random_key2", b"random_val2").unwrap(); - { - let (lock, cvar) = &*apply_triggered_pair; - let mut triggered = lock.lock().unwrap(); - while !*triggered { - triggered = cvar.wait(triggered).unwrap(); + apply_triggered_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + + // Triggers the unsafe recovery store reporting process. + let plan = pdpb::RecoveryPlan::default(); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..20 { + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); + sleep_ms(100); + } + + // Unblocks the apply process. + drop(apply_released_tx); + + // Store reports are sent once the entries are applied. + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; } + sleep_ms(100); } + assert_ne!(store_report, None); + fail::remove("on_handle_apply_store_1"); +} + +#[test] +fn test_unsafe_recovery_execution_result_report() { + let mut cluster = new_server_cluster(0, 3); + // Prolong force leader time. 
+ cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::minutes(5); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::minutes(10); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(2); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.put(b"random_key1", b"random_val1").unwrap(); + + // Split the region into 2, and remove one of them, so that we can test both region peer list + // update and region creation. + pd_client.must_split_region( + region, + pdpb::CheckPolicy::Usekey, + vec![b"random_key1".to_vec()], + ); + let region1 = pd_client.get_region(b"random_key".as_ref()).unwrap(); + let region2 = pd_client.get_region(b"random_key1".as_ref()).unwrap(); + let region1_store0_peer = find_peer(®ion1, nodes[0]).unwrap().to_owned(); + pd_client.must_remove_peer(region1.get_id(), region1_store0_peer); + cluster.must_remove_region(nodes[0], region1.get_id()); // Makes the group lose its quorum. cluster.stop_node(nodes[1]); cluster.stop_node(nodes[2]); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region2.get_id(), + region2.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + } - // Triggers the unsafe recovery store reporting process. - pd_client.must_set_require_report(true); + cluster.must_enter_force_leader(region2.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let to_be_removed: Vec = region2 + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region2.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + + let mut create = metapb::Region::default(); + create.set_id(101); + create.set_end_key(b"random_key1".to_vec()); + let mut peer = metapb::Peer::default(); + peer.set_id(102); + peer.set_store_id(nodes[0]); + create.mut_peers().push(peer); + plan.mut_creates().push(create); + + // Blocks the raft apply process on store 1 entirely . + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("on_handle_apply_store_1", move || { + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + // Triggers the unsafe recovery plan execution. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); cluster.must_send_store_heartbeat(nodes[0]); // No store report is sent, since there are peers have unapplied entries. for _ in 0..20 { - assert_eq!(pd_client.must_get_store_reported(&nodes[0]), 0); + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); + sleep_ms(100); + } + + // Unblocks the apply process. + drop(apply_released_tx); + + // Store reports are sent once the entries are applied. 
+ let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + for peer_report in store_report.unwrap().get_peer_reports() { + let region = peer_report.get_region_state().get_region(); + if region.get_id() == 101 { + assert_eq!(region.get_end_key(), b"random_key1".to_vec()); + } else { + assert_eq!(region.get_id(), region2.get_id()); + for peer in region.get_peers() { + if peer.get_store_id() != nodes[0] { + assert_eq!(peer.get_role(), metapb::PeerRole::Learner); + } + } + } + } + fail::remove("on_handle_apply_store_1"); +} + +#[test] +fn test_unsafe_recover_wait_for_snapshot_apply() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = 8; + cluster.cfg.raft_store.merge_max_log_gap = 3; + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.stop_node(nodes[1]); + let (raft_gc_triggered_tx, raft_gc_triggered_rx) = mpsc::bounded::<()>(1); + let (raft_gc_finished_tx, raft_gc_finished_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("worker_gc_raft_log", move || { + let _ = raft_gc_triggered_rx.recv(); + }) + .unwrap(); + fail::cfg_callback("worker_gc_raft_log_finished", move || { + let _ = raft_gc_finished_tx.send(()); + }) + .unwrap(); + // Add at least 4m data + (0..10).for_each(|_| cluster.must_put(b"random_k", b"random_v")); + // Unblock raft log GC. + drop(raft_gc_triggered_tx); + // Wait until logs are GCed. + raft_gc_finished_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); + // Makes the group lose its quorum. + cluster.stop_node(nodes[2]); + + // Blocks the raft snap apply process. + let (apply_triggered_tx, apply_triggered_rx) = mpsc::bounded::<()>(1); + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("region_apply_snap", move || { + let _ = apply_triggered_tx.send(()); + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + cluster.run_node(nodes[1]).unwrap(); + + apply_triggered_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); + + // Triggers the unsafe recovery store reporting process. + let plan = pdpb::RecoveryPlan::default(); + pd_client.must_set_unsafe_recovery_plan(nodes[1], plan); + cluster.must_send_store_heartbeat(nodes[1]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..20 { + assert_eq!(pd_client.must_get_store_report(nodes[1]), None); + sleep_ms(100); + } + + // Unblocks the snap apply process. + drop(apply_released_tx); + + // Store reports are sent once the entries are applied. 
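Several tests in this file repeat the poll-until-report loop that follows. A small helper that could factor it out is sketched below; wait_for_store_report is hypothetical and not part of this patch, and it assumes only the TestPdClient::must_get_store_report method introduced above plus the usual test_raftstore imports.

use kvproto::pdpb;
use test_raftstore::*;

// Hypothetical helper: poll the mock PD until the store has sent a StoreReport,
// or give up after `retries` rounds of 100 ms.
fn wait_for_store_report(
    pd_client: &TestPdClient,
    store_id: u64,
    retries: usize,
) -> Option<pdpb::StoreReport> {
    for _ in 0..retries {
        if let Some(report) = pd_client.must_get_store_report(store_id) {
            return Some(report);
        }
        sleep_ms(100);
    }
    None
}

// Usage in the tests above would then shrink to:
// let report = wait_for_store_report(&pd_client, nodes[0], 20).expect("store report");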
+ let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[1]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + fail::remove("worker_gc_raft_log"); + fail::remove("worker_gc_raft_log_finished"); + fail::remove("raft_before_apply_snap_callback"); +} + +#[test] +fn test_unsafe_recovery_demotion_reentrancy() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[2]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + } + + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + + // Blocks the raft apply process on store 1 entirely . + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("on_handle_apply_store_1", move || { + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + // Triggers the unsafe recovery plan execution. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..10 { + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); sleep_ms(100); } + // Send the plan again. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + // Unblocks the apply process. + drop(apply_released_tx); + + let mut demoted = false; + for _ in 0..10 { + let region_in_pd = block_on(pd_client.get_region_by_id(region.get_id())) + .unwrap() + .unwrap(); + assert_eq!(region_in_pd.get_peers().len(), 3); + demoted = region_in_pd + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + sleep_ms(100); + } + assert_eq!(demoted, true); + fail::remove("on_handle_apply_store_1"); +} + +#[test] +fn test_unsafe_recovery_create_destroy_reentrancy() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. 
+ let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.put(b"random_key1", b"random_val1").unwrap(); + + // Split the region into 2, and remove one of them, so that we can test both region peer list + // update and region creation. + pd_client.must_split_region( + region, + pdpb::CheckPolicy::Usekey, + vec![b"random_key1".to_vec()], + ); + let region1 = pd_client.get_region(b"random_key".as_ref()).unwrap(); + let region2 = pd_client.get_region(b"random_key1".as_ref()).unwrap(); + let region1_store0_peer = find_peer(®ion1, nodes[0]).unwrap().to_owned(); + pd_client.must_remove_peer(region1.get_id(), region1_store0_peer); + cluster.must_remove_region(nodes[0], region1.get_id()); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); { - let (lock2, cvar2) = &*apply_released_pair; - let mut released = lock2.lock().unwrap(); - *released = true; - cvar2.notify_all(); + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region2.get_id(), + region2.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); } + cluster.must_enter_force_leader(region2.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let mut create = metapb::Region::default(); + create.set_id(101); + create.set_end_key(b"random_key1".to_vec()); + let mut peer = metapb::Peer::default(); + peer.set_id(102); + peer.set_store_id(nodes[0]); + create.mut_peers().push(peer); + plan.mut_creates().push(create); + + plan.mut_tombstones().push(region2.get_id()); + + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + sleep_ms(100); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + // Store reports are sent once the entries are applied. - let mut reported = false; + let mut store_report = None; for _ in 0..20 { - if pd_client.must_get_store_reported(&nodes[0]) > 0 { - reported = true; + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { break; } sleep_ms(100); } - assert_eq!(reported, true); + assert_ne!(store_report, None); + let report = store_report.unwrap(); + let peer_reports = report.get_peer_reports(); + assert_eq!(peer_reports.len(), 1); + let reported_region = peer_reports[0].get_region_state().get_region(); + assert_eq!(reported_region.get_id(), 101); + assert_eq!(reported_region.get_peers().len(), 1); + assert_eq!(reported_region.get_peers()[0].get_id(), 102); fail::remove("on_handle_apply_store_1"); } diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index decc5ea1ab7..efa118fb8f1 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -28,5 +28,5 @@ mod test_status_command; mod test_tombstone; mod test_transfer_leader; mod test_transport; -mod test_unsafe_recover; +mod test_unsafe_recovery; mod test_update_region_size; diff --git a/tests/integrations/raftstore/test_unsafe_recover.rs b/tests/integrations/raftstore/test_unsafe_recover.rs deleted file mode 100644 index 5e7b774a6d1..00000000000 --- a/tests/integrations/raftstore/test_unsafe_recover.rs +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2021 TiKV Project Authors. 
Licensed under Apache-2.0. - -use std::iter::FromIterator; -use std::sync::Arc; - -use futures::executor::block_on; -use kvproto::metapb; -use pd_client::PdClient; -use test_raftstore::*; - -#[test] -fn test_unsafe_recover_update_region() { - let mut cluster = new_server_cluster(0, 3); - cluster.run(); - let nodes = Vec::from_iter(cluster.get_node_ids()); - assert_eq!(nodes.len(), 3); - - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer number check. - pd_client.disable_default_operator(); - - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - - configure_for_lease_read(&mut cluster, None, None); - cluster.stop_node(nodes[1]); - cluster.stop_node(nodes[2]); - cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); - - let mut update = metapb::Region::default(); - update.set_id(1); - update.set_end_key(b"anykey2".to_vec()); - for p in region.get_peers() { - if p.get_store_id() == nodes[0] { - update.mut_peers().push(p.clone()); - } - } - update.mut_region_epoch().set_version(1); - update.mut_region_epoch().set_conf_ver(1); - // Removes the boostrap region, since it overlaps with any regions we create. - cluster.must_update_region_for_unsafe_recover(nodes[0], &update); - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - assert_eq!(region.get_end_key(), b"anykey2"); -} - -#[test] -fn test_unsafe_recover_create_region() { - let mut cluster = new_server_cluster(0, 3); - cluster.run(); - let nodes = Vec::from_iter(cluster.get_node_ids()); - assert_eq!(nodes.len(), 3); - - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer number check. - pd_client.disable_default_operator(); - - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - - configure_for_lease_read(&mut cluster, None, None); - cluster.stop_node(nodes[1]); - cluster.stop_node(nodes[2]); - cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); - - let mut update = metapb::Region::default(); - update.set_id(1); - update.set_end_key(b"anykey".to_vec()); - for p in region.get_peers() { - if p.get_store_id() == nodes[0] { - update.mut_peers().push(p.clone()); - } - } - update.mut_region_epoch().set_version(1); - update.mut_region_epoch().set_conf_ver(1); - // Removes the boostrap region, since it overlaps with any regions we create. - cluster.must_update_region_for_unsafe_recover(nodes[0], &update); - block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - let mut create = metapb::Region::default(); - create.set_id(101); - create.set_start_key(b"anykey".to_vec()); - let mut peer = metapb::Peer::default(); - peer.set_id(102); - peer.set_store_id(nodes[0]); - create.mut_peers().push(peer); - cluster.must_recreate_region_for_unsafe_recover(nodes[0], &create); - let region = pd_client.get_region(b"anykey1").unwrap(); - assert_eq!(create.get_id(), region.get_id()); -} diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs new file mode 100644 index 00000000000..487389e407f --- /dev/null +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -0,0 +1,1070 @@ +// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
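As a condensed reference for the tests in this file: a recovery plan is just a pdpb::RecoveryPlan message whose sections map onto the plan-execution phase described at the top of peer.rs. The sketch below strings together the kvproto setters used throughout this patch; example_recovery_plan, the region/peer ids 101/102, and the key values are placeholders, and a real plan is produced by PD from the collected store reports.

use kvproto::{metapb, pdpb};

fn example_recovery_plan(
    region_id: u64,
    stale_region_id: u64,
    failed_voters: Vec<metapb::Peer>,
) -> pdpb::RecoveryPlan {
    let mut plan = pdpb::RecoveryPlan::default();

    // Demote the failed voters of a surviving region to learners.
    let mut demote = pdpb::DemoteFailedVoters::default();
    demote.set_region_id(region_id);
    demote.set_failed_voters(failed_voters.into());
    plan.mut_demotes().push(demote);

    // Recreate a region whose peers were all lost.
    let mut create = metapb::Region::default();
    create.set_id(101);
    create.set_start_key(b"anykey".to_vec());
    let mut peer = metapb::Peer::default();
    peer.set_id(102);
    peer.set_store_id(1);
    create.mut_peers().push(peer);
    plan.mut_creates().push(create);

    // Tombstone a region that should be dropped entirely.
    plan.mut_tombstones().push(stale_region_id);

    // Note: a plan carrying a ForceLeader section is dispatched separately by the pd worker
    // (see cluster.enter_force_leader in test_raftstore), before any of the sections above.
    plan
}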
+ +use std::iter::FromIterator; +use std::sync::Arc; +use std::time::Duration; + +use futures::executor::block_on; +use kvproto::{metapb, pdpb}; +use pd_client::PdClient; +use raft::eraftpb::{ConfChangeType, MessageType}; +use raftstore::store::util::find_peer; +use test_raftstore::*; +use tikv_util::config::ReadableDuration; +use tikv_util::HandyRwLock; + +fn confirm_quorum_is_lost(cluster: &mut Cluster, region: &metapb::Region) { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); +} + +#[test] +fn test_unsafe_recovery_demote_failed_voters() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + + confirm_quorum_is_lost(&mut cluster, ®ion); + + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut demoted = true; + for _ in 0..10 { + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + demoted = true; + for peer in region.get_peers() { + if peer.get_id() != nodes[0] && peer.get_role() == metapb::PeerRole::Voter { + demoted = false; + } + } + if demoted { + break; + } + sleep_ms(200); + } + assert_eq!(demoted, true); +} + +// Demote non-exist voters will not work, but TiKV should still report to PD. +#[test] +fn test_unsafe_recovery_demote_non_exist_voters() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. 
+ pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + + confirm_quorum_is_lost(&mut cluster, ®ion); + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + let mut peer = metapb::Peer::default(); + peer.set_id(12345); + peer.set_store_id(region.get_id()); + peer.set_role(metapb::PeerRole::Voter); + demote.mut_failed_voters().push(peer); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + let report = store_report.unwrap(); + let peer_reports = report.get_peer_reports(); + assert_eq!(peer_reports.len(), 1); + let reported_region = peer_reports[0].get_region_state().get_region(); + assert_eq!(reported_region.get_id(), region.get_id()); + assert_eq!(reported_region.get_peers().len(), 3); + let demoted = reported_region + .get_peers() + .iter() + .any(|peer| peer.get_role() != metapb::PeerRole::Voter); + assert_eq!(demoted, false); + + let region_in_pd = block_on(pd_client.get_region_by_id(region.get_id())) + .unwrap() + .unwrap(); + assert_eq!(region_in_pd.get_peers().len(), 3); + let demoted = region_in_pd + .get_peers() + .iter() + .any(|peer| peer.get_role() != metapb::PeerRole::Voter); + assert_eq!(demoted, false); +} + +#[test] +fn test_unsafe_recovery_auto_promote_learner() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store0 = find_peer(®ion, nodes[0]).unwrap(); + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + // replace one peer with learner + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store0.clone()); + cluster.pd_client.must_add_peer( + region.get_id(), + new_learner_peer(nodes[0], peer_on_store0.get_id()), + ); + // Sleep 100 ms to wait for the new learner to be initialized. 
+ sleep_ms(100); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + + confirm_quorum_is_lost(&mut cluster, ®ion); + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut demoted = true; + let mut promoted = false; + for _ in 0..10 { + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + promoted = region + .get_peers() + .iter() + .find(|peer| peer.get_store_id() == nodes[0]) + .unwrap() + .get_role() + == metapb::PeerRole::Voter; + + demoted = region + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + if demoted && promoted { + break; + } + sleep_ms(100); + } + assert_eq!(demoted, true); + assert_eq!(promoted, true); +} + +#[test] +fn test_unsafe_recovery_already_in_joint_state() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store0 = find_peer(®ion, nodes[0]).unwrap(); + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store2.clone()); + cluster.pd_client.must_add_peer( + region.get_id(), + new_learner_peer(nodes[2], peer_on_store2.get_id()), + ); + // Wait the new learner to be initialized. 
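The learner-promotion and joint-state tests around this point exercise the demote step described at the top of peer.rs: demote the failed voters to learners and, in the same joint change, promote the force leader if it is still a learner. A hypothetical illustration of that change set follows; demote_request is not part of this patch, the actual proposal logic lives inside raftstore and is not shown in this diff, and the sketch only reuses kvproto setters plus the new_change_peer_v2_request helper exported above.

use kvproto::metapb;
use kvproto::raft_cmdpb::{AdminRequest, ChangePeerRequest};
use raft::eraftpb::ConfChangeType;
use raftstore::store::worker::new_change_peer_v2_request;

// Hypothetical: build the ChangePeerV2 change set for the demote step of a recovery plan.
fn demote_request(failed_voters: &[metapb::Peer], self_peer: &metapb::Peer) -> AdminRequest {
    let mut changes: Vec<ChangePeerRequest> = failed_voters
        .iter()
        .map(|p| {
            let mut ch = ChangePeerRequest::default();
            ch.set_change_type(ConfChangeType::AddLearnerNode);
            ch.set_peer(p.clone());
            ch
        })
        .collect();
    // Promote self in the same joint change if the force leader is still a learner.
    if self_peer.get_role() == metapb::PeerRole::Learner {
        let mut ch = ChangePeerRequest::default();
        ch.set_change_type(ConfChangeType::AddNode);
        ch.set_peer(self_peer.clone());
        changes.push(ch);
    }
    new_change_peer_v2_request(changes)
}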
+ sleep_ms(100); + pd_client.must_joint_confchange( + region.get_id(), + vec![ + ( + ConfChangeType::AddLearnerNode, + new_learner_peer(nodes[0], peer_on_store0.get_id()), + ), + ( + ConfChangeType::AddNode, + new_peer(nodes[2], peer_on_store2.get_id()), + ), + ], + ); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); + + confirm_quorum_is_lost(&mut cluster, ®ion); + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut demoted = true; + let mut promoted = false; + for _ in 0..10 { + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + promoted = region + .get_peers() + .iter() + .find(|peer| peer.get_store_id() == nodes[0]) + .unwrap() + .get_role() + == metapb::PeerRole::Voter; + + demoted = region + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + if demoted && promoted { + break; + } + sleep_ms(100); + } + assert_eq!(demoted, true); + assert_eq!(promoted, true); +} + +#[test] +fn test_unsafe_recovery_create_region() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let store0_peer = find_peer(®ion, nodes[0]).unwrap().to_owned(); + + // Removes the boostrap region, since it overlaps with any regions we create. 
+ pd_client.must_remove_peer(region.get_id(), store0_peer); + cluster.must_remove_region(nodes[0], region.get_id()); + + configure_for_lease_read(&mut cluster, None, None); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); + + let mut create = metapb::Region::default(); + create.set_id(101); + create.set_start_key(b"anykey".to_vec()); + let mut peer = metapb::Peer::default(); + peer.set_id(102); + peer.set_store_id(nodes[0]); + create.mut_peers().push(peer); + let mut plan = pdpb::RecoveryPlan::default(); + plan.mut_creates().push(create); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + let mut created = false; + for _ in 1..11 { + let region = pd_client.get_region(b"anykey1").unwrap(); + if region.get_id() == 101 { + created = true; + } + sleep_ms(200); + } + assert_eq!(created, true); +} + +fn must_get_error_recovery_in_progress( + cluster: &mut Cluster, + region: &metapb::Region, + cmd: kvproto::raft_cmdpb::Request, +) { + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![cmd], + true, + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + } + ); +} + +// Test the case that two of three nodes fail and force leader on the rest node. +#[test] +fn test_force_leader_three_nodes() { + let mut cluster = new_node_cluster(0, 3); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store3 = find_peer(®ion, 3).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store3.clone()); + + cluster.stop_node(2); + cluster.stop_node(3); + + // quorum is lost, can't propose command successfully. + confirm_quorum_is_lost(&mut cluster, ®ion); + + cluster.must_enter_force_leader(region.get_id(), 1, vec![2, 3]); + // remove the peers on failed nodes + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 2).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + // forbid writes in force leader state + let put = new_put_cmd(b"k3", b"v3"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, put); + // forbid reads in force leader state + let get = new_get_cmd(b"k1"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, get); + // forbid read index in force leader state + let read_index = new_read_index_cmd(); + must_get_error_recovery_in_progress(&mut cluster, ®ion, read_index); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), None); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); +} + +// Test the case that three of five nodes fail and force leader on one of the +// rest nodes. 
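must_get_error_recovery_in_progress above checks the error a client sees while a region is still in force leader state. On the server side this is just an errorpb::Error with its recovery_in_progress field set; a hypothetical constructor is sketched below, where recovery_in_progress is not part of this patch and the mut_recovery_in_progress accessor is assumed from the usual kvproto codegen.

use kvproto::errorpb;

// Hypothetical helper mirroring the error shape the tests above assert on: a region under
// unsafe recovery answers reads and writes with RecoveryInProgress carrying its region id.
fn recovery_in_progress(region_id: u64) -> errorpb::Error {
    let mut err = errorpb::Error::default();
    err.mut_recovery_in_progress().set_region_id(region_id);
    err
}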
+#[test]
+fn test_force_leader_five_nodes() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    // quorum is lost, can't propose command successfully.
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    // forbid writes in force leader state
+    let put = new_put_cmd(b"k3", b"v3");
+    must_get_error_recovery_in_progress(&mut cluster, &region, put);
+    // forbid reads in force leader state
+    let get = new_get_cmd(b"k1");
+    must_get_error_recovery_in_progress(&mut cluster, &region, get);
+    // forbid read index in force leader state
+    let read_index = new_read_index_cmd();
+    must_get_error_recovery_in_progress(&mut cluster, &region, read_index);
+
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on the remaining
+// node, which is a learner.
+#[test]
+fn test_force_leader_for_learner() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 5;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    let peer_on_store1 = find_peer(&region, 1).unwrap();
+    // replace one peer with learner
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), peer_on_store1.clone());
+    cluster.pd_client.must_add_peer(
+        region.get_id(),
+        new_learner_peer(peer_on_store1.get_store_id(), peer_on_store1.get_id()),
+    );
+    // Sleep 100 ms to wait for the new learner to be initialized.
+    sleep_ms(100);
+
+    must_get_equal(&cluster.get_engine(1), b"k1", b"v1");
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    // wait election timeout
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 2,
+    ));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // promote the learner first and remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_add_peer(region.get_id(), find_peer(&region, 1).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+    cluster.must_transfer_leader(region.get_id(), find_peer(&region, 1).unwrap().clone());
+}
+
+// Test the case that three of five nodes fail and force leader on a hibernated
+// previous leader.
+#[test]
+fn test_force_leader_on_hibernated_leader() {
+    let mut cluster = new_node_cluster(0, 5);
+    configure_for_hibernate(&mut cluster);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store1 = find_peer(&region, 1).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone());
+
+    // wait a while to hibernate
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 3,
+    ));
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on a hibernated
+// previous follower.
+#[test]
+fn test_force_leader_on_hibernated_follower() {
+    test_util::init_log_for_test();
+    let mut cluster = new_node_cluster(0, 5);
+    configure_for_hibernate(&mut cluster);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    // wait a while to hibernate
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 3,
+    ));
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on the remaining
+// node, triggering a snapshot.
+#[test]
+fn test_force_leader_trigger_snapshot() {
+    let mut cluster = new_node_cluster(0, 5);
+    configure_for_snapshot(&mut cluster);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 10;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(90);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store1 = find_peer(&region, 1).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone());
+
+    // Isolate node 2
+    cluster.add_send_filter(IsolationFilterFactory::new(2));
+
+    // Compact logs to force requesting snapshot after clearing send filters.
+    let state = cluster.truncated_state(region.get_id(), 1);
+    // Write some data to trigger snapshot.
+    for i in 100..150 {
+        let key = format!("k{}", i);
+        let value = format!("v{}", i);
+        cluster.must_put(key.as_bytes(), value.as_bytes());
+    }
+    cluster.wait_log_truncated(region.get_id(), 1, state.get_index() + 40);
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    // Recover the isolation of 2, but still don't permit snapshot
+    let recv_filter = Box::new(
+        RegionPacketFilter::new(region.get_id(), 2)
+            .direction(Direction::Recv)
+            .msg_type(MessageType::MsgSnapshot),
+    );
+    cluster.sim.wl().add_recv_filter(2, recv_filter);
+    cluster.clear_send_filters();
+
+    // wait election timeout
+    sleep_ms(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 5,
+    );
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+
+    sleep_ms(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 3,
+    );
+    let cmd = new_change_peer_request(
+        ConfChangeType::RemoveNode,
+        find_peer(&region, 3).unwrap().clone(),
+    );
+    let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd);
+    // Even though there is a force leader now, the command can't be committed because the
+    // log is not replicated to all the alive peers.
+    assert!(
+        cluster
+            .call_command_on_leader(req, Duration::from_millis(1000))
+            .unwrap()
+            .get_header()
+            .has_error() // error "there is a pending conf change" indicating no committed log after being the leader
+    );
+
+    // Permit snapshot message, snapshot should be applied and advance commit index now.
+    cluster.sim.wl().clear_recv_filters(2);
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+    cluster.must_transfer_leader(region.get_id(), find_peer(&region, 1).unwrap().clone());
+}
+
+// Test the case that three of five nodes fail and force leader on the remaining
+// node with an uncommitted conf change.
+#[test]
+fn test_force_leader_with_uncommitted_conf_change() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 10;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(90);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store1 = find_peer(&region, 1).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    // an uncommitted conf-change
+    let cmd = new_change_peer_request(
+        ConfChangeType::RemoveNode,
+        find_peer(&region, 2).unwrap().clone(),
+    );
+    let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd);
+    assert!(
+        cluster
+            .call_command_on_leader(req, Duration::from_millis(10))
+            .is_err()
+    );
+
+    // wait election timeout
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 2,
+    ));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // the uncommitted conf-change is committed successfully after entering the force leader state
+    cluster
+        .pd_client
+        .must_none_peer(region.get_id(), find_peer(&region, 2).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), Some(b"v2".to_vec()));
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that none of the five nodes fails and force leader is requested on one
+// of the nodes.
+// Note: it still can't defend against extreme misuse cases. For example, take a group of
+// a, b and c, where c is isolated from a and a is the leader. If c has increased its
+// term by 2 somehow (for example, false prevote succeeded twice) and force leader is
+// sent to b, breaking the lease constraint, then b will reject a's heartbeat while it
+// can still vote for c. So c becomes leader and there are two leaders in the group.
+#[test]
+fn test_force_leader_on_healthy_region() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(30);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 5;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    // try to enter force leader, it can't succeed because quorum isn't lost
+    cluster.enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // make sure it leaves the pre force leader state.
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 3,
+    ));
+    // put and get can be proposed successfully.
+    assert_eq!(cluster.must_get(b"k1"), Some(b"v1".to_vec()));
+    cluster.must_put(b"k2", b"v2");
+
+    // try to exit force leader, it will be ignored silently as it's not in the force leader state
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on the one that
+// doesn't have the latest log.
+#[test]
+fn test_force_leader_on_wrong_leader() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    // peer on node2 doesn't have the latest committed log
+    cluster.stop_node(2);
+    cluster.must_put(b"k2", b"v2");
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+    cluster.run_node(2).unwrap();
+
+    // restart to clean lease
+    cluster.stop_node(1);
+    cluster.run_node(1).unwrap();
+
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    // try to force leader on the peer of node2, which is stale
+    cluster.must_enter_force_leader(region.get_id(), 2, vec![3, 4, 5]);
+    // can't propose conf-change as it's not in force leader state
+    let cmd = new_change_peer_request(
+        ConfChangeType::RemoveNode,
+        find_peer(&region, 3).unwrap().clone(),
+    );
+    let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd);
+    assert!(
+        cluster
+            .call_command_on_leader(req, Duration::from_millis(10))
+            .is_err()
+    );
+    cluster.exit_force_leader(region.get_id(), 2);
+
+    // peer on node2 still doesn't have the latest committed log.
+    must_get_none(&cluster.get_engine(2), b"k2");
+}
+
+// Test the case that three of five nodes fail and force leader twice on
+// peers on different nodes.
+#[test]
+fn test_force_leader_twice_on_different_peers() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    // restart to clean lease
+    cluster.stop_node(1);
+    cluster.run_node(1).unwrap();
+    cluster.stop_node(2);
+    cluster.run_node(2).unwrap();
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // enter force leader on a different peer
+    cluster.must_enter_force_leader(region.get_id(), 2, vec![3, 4, 5]);
+    // leader is the peer of store 2
+    assert_eq!(
+        cluster.leader_of_region(region.get_id()).unwrap(),
+        *find_peer(&region, 2).unwrap()
+    );
+
+    // peer of store 1 should exit force leader state, and proposing a conf-change on it should fail
+    let conf_change = new_change_peer_request(ConfChangeType::RemoveNode, new_peer(3, 3));
+    let mut req = new_admin_request(region.get_id(), region.get_region_epoch(), conf_change);
+    req.mut_header()
+        .set_peer(find_peer(&region, 1).unwrap().clone());
+    let resp = cluster
+        .call_command(req, Duration::from_millis(10))
+        .unwrap();
+    let mut not_leader = kvproto::errorpb::NotLeader {
+        region_id: region.get_id(),
+        ..Default::default()
+    };
+    not_leader.set_leader(find_peer(&region, 2).unwrap().clone());
+    assert_eq!(resp.get_header().get_error().get_not_leader(), &not_leader);
+
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 2);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader twice on
+// a peer on the same node.
+#[test]
+fn test_force_leader_twice_on_same_peer() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    // restart to clean lease
+    cluster.stop_node(1);
+    cluster.run_node(1).unwrap();
+    cluster.stop_node(2);
+    cluster.run_node(2).unwrap();
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader doesn't finish
+// in one election round due to network partition.
+#[test]
+fn test_force_leader_multiple_election_rounds() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(30);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 5;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    cluster.add_send_filter(IsolationFilterFactory::new(1));
+    cluster.add_send_filter(IsolationFilterFactory::new(2));
+
+    // wait election timeout
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 2,
+    ));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // wait multiple election rounds
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 6,
+    ));
+
+    cluster.clear_send_filters();
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}