From 8415ad1ca8c7e128534681ea5ade3f7ad2c7d9ee Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Mon, 30 May 2022 14:27:09 +0800 Subject: [PATCH] Manually merge raftstore-proxy-6.0-try into raftstore-proxy (#69) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * storage: Add API V2 check for RawKV and TxnKV requests (txn part) (#11216) * wip Signed-off-by: pingyu * wip Signed-off-by: pingyu * ref #10974: Add API V2 check for RawKV and TxnKV requests (txn) Signed-off-by: pingyu * ref #10974: Add API V2 check for RawKV and TxnKV requests (txn) Signed-off-by: pingyu * ref #10974: Add API V2 check for RawKV and TxnKV requests (txn) Signed-off-by: pingyu * ref #10974: Add API V2 check for RawKV and TxnKV requests (txn) Signed-off-by: pingyu * ref #10974: Add API V2 check for RawKV and TxnKV requests (txn) Signed-off-by: pingyu * ref #10974: Add API V2 check for RawKV and TxnKV requests (txn) Signed-off-by: pingyu * fix enable_ttl Signed-off-by: andylokandy * simplify test Signed-off-by: andylokandy * ref #10974: Add API V2 check for RawKV and TxnKV requests (txn) Signed-off-by: pingyu Co-authored-by: andylokandy Co-authored-by: Ti Chi Robot * cdc: load uninlined value more effectively (#11615) Signed-off-by: qupeng * cdc: load old value with prefix seek if possible (#11643) * close #11642 Signed-off-by: qupeng * a more comment Signed-off-by: qupeng * ctl: split into multiple modules (#11658) * ctl: split into multiple modules ref #10938 Signed-off-by: andylokandy * refactor if let to match Signed-off-by: andylokandy * address comment Signed-off-by: andylokandy * address comment Signed-off-by: andylokandy * address comment Signed-off-by: andylokandy Co-authored-by: Ti Chi Robot * Fix build issues on non-linux platforms (#11671) * close #11676 Signed-off-by: devillove084 <786537003@qq.com> * all: Migrate from std:: types to primitives. (#11677) * Deprecation is planned for MIN,MAX,EPSLION,INFINITY,NEG_INFINITY of the (i|u|f)(8|16|32|64) types in standard. ref #11678 ```release-note NONE ``` Signed-off-by: Harold Dost * metrics alert: fix apply cpu alert (#11672) Signed-off-by: zhangjinpeng1987 Co-authored-by: Ti Chi Robot * resrc_mtr: introduce scanned keys statistics (#11085) close #11690 Signed-off-by: mornyx * file_system: support collect the io flow by proc (#11667) * file_system: support collect the read flow by proc Signed-off-by: MuZhou233 * fix lint Signed-off-by: MuZhou233 * fix clippy Signed-off-by: MuZhou233 * Delete unused content & Adjust copyright text Signed-off-by: MuZhou233 * Add bench_fetch_io_bytes Signed-off-by: MuZhou233 * fix lint Signed-off-by: MuZhou233 * Integrate thread_io Signed-off-by: MuZhou233 * record bytes in request Signed-off-by: MuZhou233 * fix logical bug Signed-off-by: MuZhou233 * Update test write_bytes Signed-off-by: MuZhou233 * Delete unused & Rename func Signed-off-by: MuZhou233 * Add test for read & Update test for write Signed-off-by: MuZhou233 * Add buffered thread io Signed-off-by: MuZhou233 * Replace DashMap with ThreadLocal Signed-off-by: MuZhou233 * Rename variables and function & Remove unused call Signed-off-by: MuZhou233 * Change fetch logic. 
close #10867 Signed-off-by: MuZhou233 * Add AtomicIOBytes Signed-off-by: MuZhou233 * Split ThreadIOSentinel Signed-off-by: MuZhou233 * add THREAD_IO_TOTAL for metrics bytes fetcher Signed-off-by: MuZhou233 * Adjust code style & fix Signed-off-by: MuZhou233 * improve write_bytes test Signed-off-by: MuZhou233 Co-authored-by: Ti Chi Robot * *: check epoch and validness of lock table when writing pessimistic locks (#11591) ref #11452 Signed-off-by: Yilin Chen * rsmeter: introduce datasink (#11688) * add datasink trait Signed-off-by: Zhenchi * RAII removes on_reporter_closing Signed-off-by: Zhenchi * fmt Signed-off-by: Zhenchi * fix build ref #11691 Signed-off-by: Zhenchi * datasink -> data_sink Signed-off-by: Zhenchi * fmt Signed-off-by: Zhenchi * fix build Signed-off-by: Zhenchi Co-authored-by: Ti Chi Robot * clippy: Fixing up clippy errors. (#11193) * clippy: Fixing up clippy errors. * Fix crate references * Fix anonymous lifetimes references. ref #4301 Signed-off-by: Harold Dost * Test Revert of Iterations ref #4301 Signed-off-by: Harold Dost * clippy: Enable check for rust-2018-idioms ref #4301 Signed-off-by: Harold Dost * rsmeter: add centralized place to construct protobuf data (#11696) ref #11691 Signed-off-by: Zhenchi Co-authored-by: Ti Chi Robot * tikv_util: update procfs to 0.12.0 (#11703) Signed-off-by: linning * Use generic for api version to reduce runtime branching (#11687) * Use generic for api version to reduce runtime branching ref #10938 Signed-off-by: andylokandy * fix test Signed-off-by: andylokandy * add test Signed-off-by: andylokandy * fix clippy Signed-off-by: andylokandy * address comment Signed-off-by: andylokandy * fmt Signed-off-by: andylokandy * address comment Signed-off-by: andylokandy * address comment Signed-off-by: andylokandy * rename vars Signed-off-by: andylokandy * improve vars Signed-off-by: andylokandy Co-authored-by: Ti Chi Robot * sst_importer: check api version when importing (#11664) * sst_importer: check api version when importing ref #10938 Signed-off-by: Peng Guanwen * Fix bugs Signed-off-by: Peng Guanwen * fix bug of Iterator::{scan,scan_cf} Signed-off-by: Peng Guanwen * remove redundant log Signed-off-by: Peng Guanwen * Show the entire key Signed-off-by: Peng Guanwen * Fix lint issue Signed-off-by: Peng Guanwen * resolve conflict Signed-off-by: andylokandy Co-authored-by: Andy Lok * backup: pipeline scan and save step (#11528) * br: sperate io and scan threads Signed-off-by: Yu Juncen * br/stream: make clippy happy Signed-off-by: yujuncen * br/stream: fix tests Signed-off-by: Yu Juncen * backup: fix a dummy bug Signed-off-by: Yu Juncen * backup: add config of io-threads Signed-off-by: yujuncen * backup: ref #11350: some minior change Signed-off-by: yujuncen * backup: added some metrics Signed-off-by: yujuncen * backup: better tuning concurrency, and added a metric Signed-off-by: yujuncen * backup: added some slow log and metrics Signed-off-by: yujuncen * backup: address comments Signed-off-by: yujuncen * backup: fix build Signed-off-by: yujuncen * Revert "backup: fix build" This reverts commit 74537be56c410d0f91af2f48b7e65356ab53720e. Signed-off-by: yujuncen * Revert "backup: address comments" This reverts commit 77d75756028bea3496f878f0ad8bbc562f5d00aa. 
Signed-off-by: yujuncen * backup: address comments Signed-off-by: yujuncen * backup: always set IO type to Export Signed-off-by: yujuncen * backup:make clippy happy Signed-off-by: yujuncen * backup: address comments Signed-off-by: yujuncen * metrics: fix the metrics cannot be displayed (#11710) ref #11662 Signed-off-by: Ryan Leung Co-authored-by: Ti Chi Robot * backup: return api-version to BR when backup (#11704) * Update kvproto Signed-off-by: Peng Guanwen * Return api-version for br ref #10938 Signed-off-by: Peng Guanwen * Reformat code Signed-off-by: Peng Guanwen * Update components/external_storage/export/src/export.rs Signed-off-by: Peng Guanwen Co-authored-by: Andy Lok * format code Signed-off-by: Peng Guanwen Co-authored-by: Andy Lok * file_system: limit thread_io target os (#11715) * limit thread_io target os Signed-off-by: MuZhou233 * limit thread_io target os. typo. close #11698 Signed-off-by: MuZhou233 Co-authored-by: Yilin Chen * file_system: bypass file allocate implementation in fs2 (#11700) Close #10688 Patch fs2 to https://github.com/tabokie/fs2-rs/tree/tikv. In which, `posix_fallocate` is replaced with `fallocate` (https://github.com/danburkert/fs2-rs/pull/42). * bypass file allocate implementation in fs2 Signed-off-by: tabokie * metrics: remove min legend of some Grafana panels (#11487) Minimum legend isn't very useful, and it occupied valuable visual real estate. Remove some min legends from Grafana metrics, except for the following: - Level 0 chance - Number files at each level - File Count - Entry Count - Allocator Stats - Encrypted files - Encryption meta files size Ref #11119 Signed-off-by: tabokie * coprocessor: not return rows when there is no input for simple aggregation (#11708) * not return rows when there is no input for simple aggregation Signed-off-by: xufei * close #11735, and address comments Signed-off-by: xufei * backup: reduce the default thread pool size of backup, enable auto tune by default (#11699) * backup: clamp auto tune values Signed-off-by: yujuncen * ref #11000: change the default backup threads. 
Signed-off-by: yujuncen * backup: run tasks in background Signed-off-by: yujuncen * backup: run rustfmt Signed-off-by: yujuncen * backup: set remain threads to 20% of vcpu Signed-off-by: yujuncen * br-stream: fix build Signed-off-by: yujuncen * Backup: add S3 metrics && add s3_multi_part_size config (#11666) * s3: add request metrics for s3 storage Signed-off-by: 3pointer * s3: add grafana json Signed-off-by: 3pointer * br: add config for s3 multi part upload Signed-off-by: 3pointer * update comment Signed-off-by: 3pointer * update comment Signed-off-by: 3pointer * address comment Signed-off-by: 3pointer * hidden the new config Signed-off-by: 3pointer * only hidden config in config-template.toml Signed-off-by: 3pointer * address comment Signed-off-by: 3pointer * close #11727 && format Signed-off-by: 3pointer * fix Signed-off-by: 3pointer Co-authored-by: Ti Chi Robot * rsmeter: support multiple datasinks (#11707) * rsmeter: support multiple datasinks Signed-off-by: Zhenchi * wrap config notifier Signed-off-by: Zhenchi * add unit tests Signed-off-by: Zhenchi * polish ref #11691 Signed-off-by: Zhenchi * address comments Signed-off-by: Zhenchi * fix test Signed-off-by: Zhenchi * fix outdated comments Signed-off-by: Zhenchi Co-authored-by: Ti Chi Robot * engine: properly estimate store size (#11728) * properly estimate store size, ref #11119 Signed-off-by: tabokie * address comments Signed-off-by: tabokie * implement global config client for tikv (#11685) * implement global config client close #11686 Signed-off-by: lemonhx * implement unit test for global config in pd client Signed-off-by: lemonhx * resolved formatting issues Signed-off-by: lemonhx * impl pd client asyncly Signed-off-by: lemonhx * according to reviewer's opinion removing store method implementation Signed-off-by: lemonhx * Merge branch 'master' of https://github.com/tikv/tikv into global_conf Signed-off-by: lemonhx * write test case for watch global config when grpc server is closed. Signed-off-by: lemonhx * resove issues Signed-off-by: lemonhx * clippy Signed-off-by: lemonhx * watch global config rentry future in test deadlock just removed Signed-off-by: lemonhx Co-authored-by: Zhenchi * storage: skip scanning lock when using rc (#11701) * skip scanning lock when using rc, ref #11485 Signed-off-by: you06 * format code Signed-off-by: you06 * add test Signed-off-by: you06 * address comment Signed-off-by: you06 * add an integration test Signed-off-by: you06 * format code Signed-off-by: you06 Co-authored-by: Ti Chi Robot * copr: pushdown substring to tikv (#11494) * corp: pushdown substring to tikv Signed-off-by: guo-shaoge * fix case. 
close #11495 Signed-off-by: guo-shaoge * fix comment Signed-off-by: guo-shaoge * rsmeter: distinguish between collectors and observers within recorder (#11712) * rsmeter: support multiple datasinks Signed-off-by: Zhenchi * wrap config notifier Signed-off-by: Zhenchi * add unit tests Signed-off-by: Zhenchi * polish ref #11691 Signed-off-by: Zhenchi * rsmeter: distinguish between collectors and observers within recorder Signed-off-by: Zhenchi * fmt & try to resolve #11689 Signed-off-by: Zhenchi * add unit tests ref #11691 Signed-off-by: Zhenchi * retrigger test Signed-off-by: Zhenchi * address comments Signed-off-by: Zhenchi * fix test Signed-off-by: Zhenchi * fix outdated comments Signed-off-by: Zhenchi * Update components/resource_metering/src/recorder/sub_recorder/mod.rs Signed-off-by: Zhenchi Co-authored-by: Yexiang Zhang * address comment Signed-off-by: Zhenchi * remove enabled Signed-off-by: Zhenchi * fix Signed-off-by: Zhenchi Co-authored-by: Ti Chi Robot Co-authored-by: Wenxuan Co-authored-by: Yexiang Zhang * config: validate online configurable thread pools (tikv#11159) (#11714) ref #11159 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot * res_meter: upgrade kvproto (#11749) * Upgrade kvproto close #11748 Signed-off-by: mornyx * Fix test compilation Signed-off-by: mornyx * Fix test compilation Signed-off-by: mornyx * Fix test compilation Signed-off-by: mornyx * Use master kvproto Signed-off-by: mornyx Co-authored-by: Ti Chi Robot * rsmeter: add pubsub datasink (#11719) * rsmeter: add pubsub datasink close #11691 Signed-off-by: Zhenchi * fix build Signed-off-by: Zhenchi * Update components/resource_metering/src/reporter/pubsub.rs Signed-off-by: Zhenchi Co-authored-by: Wenxuan * fix build Signed-off-by: Zhenchi Co-authored-by: Wenxuan * status_server: Support online config update for configuration hosting platform (#11693) close #11692 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot * res_meter: optimize top k (#11753) * Keep top k on recorder Signed-off-by: mornyx * Modify comments Signed-off-by: mornyx * Fix pd.rs Signed-off-by: mornyx * Refactor Signed-off-by: mornyx * Remove import Signed-off-by: mornyx * max_resource_groups 200 -> 100 Signed-off-by: mornyx * Fix ut Signed-off-by: mornyx * Use iterator to avoid clone Signed-off-by: mornyx * Optimize close #11754 Signed-off-by: mornyx Co-authored-by: Ti Chi Robot * raftclient: delay flush (#11705) * raftclient: delay flush Signed-off-by: Jay Lee * remove raftstore wait Signed-off-by: Jay Lee * Ref #11309. tikv#11310 introduces a delay in the raftstore thread to reduce the flush rate. It can only reduce the flush rate from one thread. If there are many raftstore threads or IO threads, messages can still be flushed frequently. This PR reduces the flush rate by introducing a delay in the raft client, so the delay works at the connection level and achieves the maximum batching effect.
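To make the delay-flush idea just described concrete, here is a minimal, self-contained sketch (not TiKV's actual raft client code; all names and numbers are illustrative): the sender buffers outgoing messages and flushes only once a short delay has elapsed or the buffer is full, so one flush covers many messages from all callers sharing the connection.

```rust
use std::time::{Duration, Instant};

/// Hypothetical message type standing in for one raft message.
struct Msg(Vec<u8>);

/// Minimal sketch of delay-based batching: instead of flushing after every
/// message, the writer waits up to `delay` (or until `max_batch` messages are
/// buffered) so that a single flush covers many messages.
struct DelayedFlusher {
    buf: Vec<Msg>,
    first_buffered_at: Option<Instant>,
    delay: Duration,
    max_batch: usize,
}

impl DelayedFlusher {
    fn new(delay: Duration, max_batch: usize) -> Self {
        DelayedFlusher { buf: Vec::new(), first_buffered_at: None, delay, max_batch }
    }

    fn push(&mut self, m: Msg) {
        if self.buf.is_empty() {
            self.first_buffered_at = Some(Instant::now());
        }
        self.buf.push(m);
        if self.should_flush() {
            self.flush();
        }
    }

    fn should_flush(&self) -> bool {
        self.buf.len() >= self.max_batch
            || self.first_buffered_at.map_or(false, |t| t.elapsed() >= self.delay)
    }

    fn flush(&mut self) {
        // A real client would write the whole batch to the gRPC stream in one
        // go; here we only report how much was batched together.
        let bytes: usize = self.buf.iter().map(|m| m.0.len()).sum();
        println!("flushing {} messages ({} bytes) in one batch", self.buf.len(), bytes);
        self.buf.clear();
        self.first_buffered_at = None;
    }
}

fn main() {
    let mut f = DelayedFlusher::new(Duration::from_millis(2), 128);
    for i in 0..300u32 {
        f.push(Msg(i.to_le_bytes().to_vec()));
    }
    f.flush(); // flush whatever remains
}
```

Note that, per the follow-up listed later (#11772, "raftclient: enable delay only in high load"), an unconditional delay regressed normal-load latency, so the delay was later gated on load.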
Signed-off-by: Jay Lee * remove useless field Signed-off-by: Jay Lee * address comment Signed-off-by: Jay Lee * fix unstable raw get case Signed-off-by: Jay Lee Co-authored-by: Liqi Geng Co-authored-by: Ti Chi Robot * cloud: support azure blob storage as external storage backend (#11732) * support azure storage blob for external storage Signed-off-by: Leavrth * modify the priority of the azure login methods Signed-off-by: Leavrth * fix feature match branch Signed-off-by: Leavrth * implement token update retriable Signed-off-by: Leavrth * fix some problems Signed-off-by: Leavrth * close #11731 Signed-off-by: Leavrth * rustfmt Signed-off-by: Leavrth * make clippy success Signed-off-by: Leavrth * simplify uploader Signed-off-by: Leavrth * commit some suggestions Signed-off-by: Leavrth * coprocesser: update encoding lib (#11506) Signed-off-by: xiongjiwei Co-authored-by: Ti Chi Robot * coprocesser: implement upper and lower function on GBK charset (#11756) Signed-off-by: xiongjiwei Co-authored-by: Ti Chi Robot * logger: support archiving and rotation (#11651) (#11657) * logger: support archiving and rotation (#11651) close #11651 Signed-off-by: Wenbo Zhang * logger: add testcases to test dt_from_file_name (#11651) ref #11651 Signed-off-by: Wenbo Zhang * logger: simplify DateTime format (#11651) ref #11651 Signed-off-by: Wenbo Zhang * logger: do a little refactoring (#11651) ref #11651 Signed-off-by: Wenbo Zhang * logger: do a little refactoring (#11651) ref #11651 Signed-off-by: Wenbo Zhang * logger: do a little refactoring (#11651) ref #11651 Signed-off-by: Wenbo Zhang * logger: do a little refactoring (#11651) ref #11651 Signed-off-by: Wenbo Zhang * logger: do a little refactoring (#11651) ref #11651 Signed-off-by: Wenbo Zhang * diagnostics: refactor (#11651) ref #11651 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot * *: run clippy for integration tests (#11768) * run clippy --fix with nightly-2021-12-17 Signed-off-by: tabokie * manual clippy Signed-off-by: tabokie * format and clippy Signed-off-by: tabokie * enable default features when running clippy, ref #11119 Signed-off-by: tabokie * fix build Signed-off-by: tabokie * revert box_collection lint Signed-off-by: tabokie * Update tests/integrations/raftstore/test_bootstrap.rs Signed-off-by: Xinye Tao Co-authored-by: Lei Zhao Co-authored-by: Lei Zhao * logger: fix config-template's log unit (#11651) (#11777) ref #11651 Signed-off-by: Wenbo Zhang * logger: improve testcase (#11651) (#11778) ref #11651 Signed-off-by: Wenbo Zhang * raftstore: propose in-memory pessimistic locks before leader transfer (#11577) * *: check epoch and validness of lock table when writing pessimistic locks ref #11452 Signed-off-by: Yilin Chen * raftstore: propose in-memory pessimistic locks before leader transfer ref #11452 Signed-off-by: Yilin Chen * mark delete in scheduler and delete after apply Signed-off-by: Yilin Chen * Add tests for two different orders between proposing locks and write command Signed-off-by: Yilin Chen * do not clear pessimistic locks before transfer leader Signed-off-by: Yilin Chen * delete memory locks in the apply thread and add more term and version check Signed-off-by: Yilin Chen * remove potentially incorrect assert Signed-off-by: Yilin Chen * check term and version when reading the lock table Signed-off-by: Yilin Chen * revert a log change Signed-off-by: Yilin Chen * change term when becoming follower Signed-off-by: Yilin Chen * fix lint Signed-off-by: Yilin Chen * do not check version when reading locks (temporarily) 
Signed-off-by: Yilin Chen * fix another test Signed-off-by: Yilin Chen * only treat MsgTransferLeader with context with valid msg Signed-off-by: Yilin Chen * fix a tiny comment error Signed-off-by: Yilin Chen * make clippy happy Signed-off-by: Yilin Chen * res_meter: ignore the read keys test temporarily (#11779) ref #11765 Signed-off-by: mornyx Co-authored-by: Ti Chi Robot * logger: change critical to fatal (#11651) (#11780) * logger: change critical to fatal (#11651) ref #11651 Signed-off-by: Wenbo Zhang * logger: fix test_parse_log_level (#11651) ref #11651 Signed-off-by: Wenbo Zhang * update CHANGELOG.md (#11790) * update from tikv-server v5.0.1(20210423) from https://github.com/tikv/tikv/releases Signed-off-by: xiejiandong * update CHANGELOG.md close #11167 Signed-off-by: xiejiandong * config: relax the bounds of online configurable worker number (#11651) (#11798) close #11776 Signed-off-by: Wenbo Zhang * logger: invoke logger_compatible_adjust before run_tikv (#11651) (#11792) close #11789 Signed-off-by: Wenbo Zhang Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot * *: unify thread related native APIs (#11785) * *: unify thread related native APIs Now there are duplicated codes calling syscalls to fetch thread informations. This PR provides unified APIs and remove duplicated codes. Close #11784. Signed-off-by: Jay Lee * address comment Signed-off-by: Jay Lee * declare platform compatability Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot * Fix method on_tick will not update metric when enable_auto_tune is false (#11791) * Fix method on_tick will not update metric when enable_auto_tune is false ref #11787 Signed-off-by: yuqi1129 * resolve format problem in discussion Signed-off-by: yuqi1129 * resolve format problem in discussion again Signed-off-by: yuqi1129 Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot * raftstore: add a new tick to renew lease in advance (#6427) * renew lease in raft tick Signed-off-by: 5kbpers * check lease before sending read index Signed-off-by: 5kbpers * renew lease on raft tick to cover lease till next tick Signed-off-by: linning * refactor Signed-off-by: linning * don't use inspect_lease Signed-off-by: linning * fix ci Signed-off-by: linning * add test Signed-off-by: linning * remove unneed comment Signed-off-by: linning * close #5388 Signed-off-by: 5kbpers * check pending_read_count before hibernating Signed-off-by: 5kbpers * increase the range to check lease expired Signed-off-by: 5kbpers * renew leader lease after not hibernate Signed-off-by: 5kbpers * fix hibernate states metrics Signed-off-by: 5kbpers * check writes and pre_read_index before proposing read index Signed-off-by: 5kbpers * fix test Signed-off-by: 5kbpers * use a individual tick to renewing lease Signed-off-by: 5kbpers * fix condition Signed-off-by: 5kbpers * stablize test_node_renew_lease Signed-off-by: 5kbpers * cleanup unused changes Signed-off-by: 5kbpers * add config item for interval of tick Signed-off-by: 5kbpers * rename tick Signed-off-by: 5kbpers * disable the tick for test_inconsistent_configuration Signed-off-by: 5kbpers * change deafault value of check_leader_lease_interval Signed-off-by: 5kbpers * address comments Signed-off-by: 5kbpers * disable the tick for test_read_hibernated_region Signed-off-by: 5kbpers * address comments Signed-off-by: 5kbpers * count renew lease read index into raft metrics Signed-off-by: 5kbpers * address comments Signed-off-by: 5kbpers * disable the tick for test_renew_lease Signed-off-by: 5kbpers * address comments Signed-off-by: 
5kbpers * address comments Signed-off-by: 5kbpers * fix lint error Signed-off-by: 5kbpers * add a 100ms buffer for renew_bound Signed-off-by: 5kbpers * address comments Signed-off-by: 5kbpers * address comments Signed-off-by: 5kbpers * make clippy happy Signed-off-by: 5kbpers Co-authored-by: linning Co-authored-by: Ti Chi Robot * cloud: Add retry for azure blob server busy error (#11813) * add retry for azure blob server busy error Signed-off-by: Leavrth * close#11812 Signed-off-by: Leavrth * use to_string Signed-off-by: Leavrth Co-authored-by: Ti Chi Robot * tikv_util: make cgroup parsing more robust (#11786) * make cgroup parsing more robust Signed-off-by: tabokie * fix error message Signed-off-by: tabokie * parse cpu quota in floats Signed-off-by: tabokie * *: make it compile on macos (#11825) * *: make it compile on macos close #11823 Signed-off-by: Jay Lee * remove extra comment Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot * resolved_ts: add more metrics (#11809) * add metrics Signed-off-by: 5kbpers * add slow log Signed-off-by: 5kbpers * remove duplicate metric Signed-off-by: 5kbpers * add resolved ts metrics for leader peers Signed-off-by: 5kbpers * close #11767 Signed-off-by: 5kbpers * init endpoint with store meta Signed-off-by: 5kbpers Co-authored-by: Ti Chi Robot * rsmeter: fix perf regression caused by arc swap (#11833) * rsmeter: fix perf regression caused by arc swap Signed-off-by: Zhenchi * use atomic cell instead Signed-off-by: Zhenchi * debug assert Signed-off-by: Zhenchi Co-authored-by: Wenxuan * load base split: more accurate sample for large key ranges batch (#11039) * more accurate sample for large key ranges Signed-off-by: lhy1024 * update Signed-off-by: lhy1024 * ref #11521 Signed-off-by: lhy1024 * address comment Signed-off-by: lhy1024 * fix test Signed-off-by: lhy1024 Co-authored-by: Ti Chi Robot * raftclient: enable delay only in high load (#11772) and improve CPU efficiency under high load. But it turns out this can lead to significant regression in normal load. So this PR adds CPU stats and only enabling delay in high load. This is also the same strategy used in v4.x. Close #11769. Signed-off-by: Jay Lee * tikv-ctl: fix tikv-ctl's output is incomplete on calling process::exit (#11231) Signed-off-by: Yao Zongyou Co-authored-by: Xinye Tao Co-authored-by: Connor Co-authored-by: Ti Chi Robot * raftstore: skip deleting snapshot files in peer pending_remove is true (#11782) * raftstore: skip deleting snapshot files in peer when the peer is pending removal and the snapshot is being applied and canceled -- close #11746 This is to avoid the potential panic when the snapshot files are deleted, but the peer's status (Tombstone) is not persisted in disk due to tikv crash. Signed-off-by: tonyxuqqi * address code review feedback -- close #11746 Signed-off-by: qi.xu * address code review feedback 2 -- close #11746 Signed-off-by: qi.xu * address code review feedback 2 -- close #11746 Signed-off-by: qi.xu * address code review feedback 2 -- close #11746 Signed-off-by: qi.xu * address code review feedback 2 -- close #11746 Signed-off-by: qi.xu Co-authored-by: qi.xu * load_statis: fix compilation on macos (#11851) close #11772 Signed-off-by: Jay Lee * config: override rocksdb config when flow control enabled (#11840) Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot * rocksdb: consider env sharing when configuring background threads (#11760) #9649 increased the default setting of kvdb background flushes (3 for 8c and 4 for 16c). 
This PR takes another approach: use smaller concurrency for individual dbs, but configure the global thread pool size with the sum of kvdb and raftdb concurrency. This way we won't over-configure flush workers when raft-engine is enabled. * consider env sharing when configuring background threads ref #11119 Signed-off-by: tabokie * server/config: keep compatible using option (#11862) * server/config: keep compatible using option Using `Option` for `heavy_load_wait_duration` to keep compatible with versions prior to v5.3. Close #11861. Signed-off-by: Jay Lee * fix format Signed-off-by: Jay Lee * fix clippy Signed-off-by: Jay Lee Co-authored-by: zhouqiang * raftstore: fix missing workers' stop on shutdown (#11864) ref #11159 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot * raftstore: move in-memory pessimistic locks to split regions (#11655) ref #11452 Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot * *: Remove the entrance of enabling io snooper. (#11394) * *: Remove the entrance of enabling io snooper. close #10867 Signed-off-by: MuZhou233 * fix Signed-off-by: MuZhou233 * fix test Signed-off-by: MuZhou233 * fix test Signed-off-by: MuZhou233 Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot * raftstore: change PeerTicks to enum (#11849) * raftstore: change PeerTicks to enum close #11848 Signed-off-by: Yilin Chen * use bool array for registry Signed-off-by: Yilin Chen * rename PeerTicks to PeerTick Signed-off-by: Yilin Chen * add associated const for number of PeerTick types Signed-off-by: Yilin Chen * change naming Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot * *: check memory limit when inserting in-memory pessimistic locks (#11841) * *: check memory limit when inserting in-memory pessimistic locks ref #11452 Signed-off-by: Yilin Chen * fix merged upstream Signed-off-by: Yilin Chen * fix test Signed-off-by: Yilin Chen * fix typo Signed-off-by: Yilin Chen * *: fix invalid failpoint caused by typo (#11709) * fix invalid failpoint caused by typo close #11734 Signed-off-by: Ryan Leung * address the comment Signed-off-by: Ryan Leung * address the comment Signed-off-by: Ryan Leung Co-authored-by: Ti Chi Robot * raftstore: Extract significant router (#11750) * ref#11409 add significant router Signed-off-by: Connor1996 * make format Signed-off-by: Connor1996 * address comment Signed-off-by: Connor1996 * raftstore: move scan delete to raft log gc worker (#11853) * raftstore: move scan delete to raft log gc worker When clearing raft metas, raftstore will scan raft logs and delete them one by one. Seeking can be slow if there are a lot of tombstone keys. This PR moves the operation to raft log gc worker to reduce the impact. The final solution should be also moving remaining IO operations to async write IO threads. Close #10210. 
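As a rough illustration of moving that scan-and-delete work off the raftstore thread (a generic sketch, not the actual raftlog_gc worker API), the raftstore thread can simply enqueue a task describing the range to purge and let a background worker perform the slow deletions:

```rust
use std::ops::Range;
use std::sync::mpsc;
use std::thread;

/// Hypothetical task type: ask the background worker to purge raft log
/// entries of one region in the given index range.
enum Task {
    PurgeRaftLog { region_id: u64, indexes: Range<u64> },
    Stop,
}

fn main() {
    let (tx, rx) = mpsc::channel::<Task>();

    // The "raft log gc worker": deletions (and the slow seeks over tombstone
    // keys) happen here instead of blocking the raftstore thread.
    let worker = thread::spawn(move || {
        for task in rx {
            match task {
                Task::PurgeRaftLog { region_id, indexes } => {
                    // A real worker would issue the engine deletions here.
                    println!("purging raft log of region {} in range {:?}", region_id, indexes);
                }
                Task::Stop => break,
            }
        }
    });

    // The raftstore thread only enqueues the work and moves on.
    tx.send(Task::PurgeRaftLog { region_id: 1, indexes: 0..1024 }).unwrap();
    tx.send(Task::Stop).unwrap();
    worker.join().unwrap();
}
```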
Signed-off-by: Jay Lee * address comment Signed-off-by: Jay Lee * speed up destroy Signed-off-by: Jay Lee * fix compile Signed-off-by: Jay Lee * further speed up Signed-off-by: Jay Lee * revert test case configuration Signed-off-by: Jay Lee * address comment Signed-off-by: Jay Lee * address comment Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot * doc: update new rules for linking issue and commit message (#11832) close #11097, close #11831 Signed-off-by: zhangyangyu Co-authored-by: Mini256 Co-authored-by: Mini256 Co-authored-by: Xiaoguang Sun * *: update rust toolchain to 2022-01-07 (#11875) * update rust toolchain to 2022-01-17 Signed-off-by: tabokie * address comment: clean up unnecessary `as_ref()` Signed-off-by: tabokie * try fixing the underflow Signed-off-by: tabokie * mute underflow warning for raft metrics Signed-off-by: tabokie * clean up unused data members Signed-off-by: tabokie * format Signed-off-by: tabokie * step back to 2022-01-07 Signed-off-by: tabokie * readme: change images based on github theme (#11795) Signed-off-by: sloorush Co-authored-by: Ti Chi Robot * raftstore: don't remove other peer's read delegate (#11882) * don't remove read delegate besides peer_destroy Signed-off-by: linning * add test case Signed-off-by: linning * make clippy happy Signed-off-by: linning * address comment Signed-off-by: linning Co-authored-by: Ti Chi Robot * logger: use eprintln! if the logger is not initialized (#11869) ref #11651 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot * copr: fix the calculation of total column size in analyze (#11884) Signed-off-by: xuyifan Co-authored-by: Ti Chi Robot * engine: update raft-engine for data consistency fix (#11885) Fix https://github.com/tikv/raft-engine/issues/142. When encountering this bug, TiKV will panic with message "applied index > max(commit index, recorded commit index)" after restart. 
* update raft-engine Signed-off-by: tabokie * update one more commit Signed-off-by: tabokie * check docker build during clippy, ref #11312 (#11819) Signed-off-by: tabokie * gc_worker: fix incorrect scheduled_tasks counting (#11904) * gc_worker: fix incorrect scheduled_tasks counting close #11903 Signed-off-by: Yilin Chen * remove check_is_busy totally Signed-off-by: Yilin Chen * do not use wildcard match in error handling Signed-off-by: Yilin Chen * raftstore: Introduce raft log fetcher (#11900) * ref#11409 introduce raft log fetcher Signed-off-by: Connor1996 * update kvproto Signed-off-by: Connor1996 * address comment Signed-off-by: Connor1996 * fix test build Signed-off-by: Connor1996 * fix clippy Signed-off-by: Connor1996 * limit capacity Signed-off-by: Connor1996 * update kvproto Signed-off-by: Connor1996 * call stop on worker Signed-off-by: Connor1996 * rename worker Signed-off-by: Connor1996 * status_server: add pprof flamegraph response header (#10951) * tweak(status_server): add pprof flamegraph response header Signed-off-by: Suhaha * test(status_server): add Content-Type asset to test_pprof_profile_service Signed-off-by: Suhaha * tweak(status_server): add pprof flamegraph response header Signed-off-by: Suhaha * tweak(status_server): add pprof flamegraph response header close #11917 Signed-off-by: Suhaha * close #11917 Signed-off-by: Suhaha Co-authored-by: goroutine Co-authored-by: Connor * raftstore: renew leader lease in advance when handle read request (#9307) * renew lease advance Signed-off-by: linning * add log Signed-off-by: linning * make clippy happy Signed-off-by: linning * ref #11579 Signed-off-by: 5kbpers * reset raft tick Signed-off-by: 5kbpers * set has ready Signed-off-by: 5kbpers * address comments Signed-off-by: 5kbpers * address comment Signed-off-by: 5kbpers * address comment Signed-off-by: 5kbpers * address comment Signed-off-by: 5kbpers * add renew_leader_lease_advance_duration config Signed-off-by: 5kbpers * address comment Signed-off-by: 5kbpers * fix panic Signed-off-by: 5kbpers * address comment Signed-off-by: 5kbpers * address comment Signed-off-by: 5kbpers * disable renewing for test_lease_unsafe_during_leader_transfers Signed-off-by: 5kbpers Co-authored-by: 5kbpers * pprof: ignore cpu profiling on non-x86 arch (#11925) * Disable cpu profiling on non-x86_64 arch Signed-off-by: mornyx * Fix warns Signed-off-by: mornyx Co-authored-by: Ti Chi Robot * github: add new pr requirement for linking issue and commit message (#11887) Signed-off-by: zhangyangyu Co-authored-by: Xiaoguang Sun * engine: enable raft engine by default (#11928) * enable raft engine by default and synchronize docs Signed-off-by: tabokie * update raft engine Signed-off-by: tabokie * update raft-engine and fix tests Signed-off-by: tabokie * gc_worker: Limit the key range to scan for GcKeys tasks (#11922) close tikv/tikv#11752, close tikv/tikv#11902 Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot * raftstore: TransferLeader support multiple target peers (#11063) ref tikv/tikv#822, ref tikv/tikv#4214, close tikv/tikv#10602 Signed-off-by: MrCroxx * raftstore: remove the leaders field of `StoreMeta` (#11934) close tikv/tikv#11933 Remove `StoreMeta.leaders` ``` ### Related changes ### Check List Tests - Manual test (add detailed scripts or steps below) `cargo check` ### Release note ```release-note None Signed-off-by: linning Co-authored-by: Ti Chi Robot * raftstore: Fetch raft log in async manner (#11409) close tikv/tikv#10408, close tikv/tikv#11320 Signed-off-by: Connor1996 
Co-authored-by: Ti Chi Robot * copr: add quarter function (#11935) ref tikv/tikv#5751 Signed-off-by: zhongyong jin * Enable full debug info for dev and test (#11949) ref tikv/tikv#5049, ref tikv/tikv#5572, close tikv/tikv#5572 ``` ### Related changes - PR to update `pingcap/docs`/`pingcap/docs-cn`: - PR to update `pingcap/tidb-ansible`: - Need to cherry-pick to the release branch --> ### Check List Tests - Manual test (add detailed scripts or steps below) - No code Manual test: Linux box with 8 cores (3:50 -> 4:00): ``` # Before this PR: $ make clean cargo clean rm -rf bin dist $ time make build cargo build --no-default-features --features "jemalloc mem-profiling portable sse test-engines-rocksdb cloud-aws cloud-gcp cloud-azure" Compiling libc v0.2.106 ... Compiling server v0.0.1 (tikv/components/server) Finished dev [unoptimized + debuginfo] target(s) in 3m 50s real 3m50,487s user 38m11,859s sys 3m9,540s # After this PR: $ git diff diff --git a/Cargo.toml b/Cargo.toml index 71f5329d3..67cb9d183 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -267,7 +267,7 @@ default-members = ["cmd/tikv-server", "cmd/tikv-ctl"] [profile.dev] opt-level = 0 -debug = 1 # required for line numbers in tests, see tikv #5049 +debug = true codegen-units = 4 lto = false incremental = true @@ -293,7 +293,7 @@ codegen-units = 4 [profile.test] opt-level = 0 -debug = 1 # enable line numbers by default for easy test debugging +debug = true codegen-units = 16 lto = false incremental = true mattias@msig:~/repos/tikv$ time make clean cargo clean rm -rf bin dist real 0m0,975s user 0m0,148s sys 0m0,828s mattias@msig:~/repos/tikv$ time make build cargo build --no-default-features --features "jemalloc mem-profiling portable sse test-engines-rocksdb cloud-aws cloud-gcp cloud-azure" Compiling libc v0.2.106 ... Compiling server v0.0.1 (tikv/components/server) Finished dev [unoptimized + debuginfo] target(s) in 4m 00s real 4m0,201s user 39m45,037s sys 3m16,397s ``` Macbook Air M1: ``` # Before: % time make build cargo build --no-default-features --features " jemalloc test-engines-rocksdb cloud-aws cloud-gcp cloud-azure" Compiling libc v0.2.106 .... Compiling server v0.0.1 (tikv/components/server) Finished dev [unoptimized + debuginfo] target(s) in 4m 01s make build 1107.42s user 116.20s system 506% cpu 4:01.46 total # After: % time make build cargo build --no-default-features --features " jemalloc test-engines-rocksdb cloud-aws cloud-gcp cloud-azure" Compiling libc v0.2.106 .... Compiling server v0.0.1 (tikv/components/server) Finished dev [unoptimized + debuginfo] target(s) in 4m 10s make build 1179.39s user 120.74s system 518% cpu 4:10.98 total ``` Side effects - Performance regression, Only when building, to the benefit of full debug info by default. ### Release note ```release-note None Signed-off-by: Mattias Jonsson Co-authored-by: Ti Chi Robot * correct a metric about compaction filter (#11938) correct a metric about compaction filter Signed-off-by: qupeng * raft: Fix possible panic on entries fetched callback (#11961) close tikv/tikv#11951 Signed-off-by: Connor1996 * split_controller: refine the sample function and add some comments (#11952) Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot * metrics: fix grid position in TiKV Details (#11936) ref tikv/tikv#11119 Fix grid positions in TiKV Details dashboard. 
Signed-off-by: tabokie Co-authored-by: Ti Chi Robot * cdc: capture old value from txn layer dynamically (#11896) close tikv/tikv#10091 Signed-off-by: hi-rustin Signed-off-by: qupeng Co-authored-by: hi-rustin * split_controller: reorganize the structs and add more comments (#11967) Reorganize the structs and add more comments. Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot * copr: support Bit column push downBit column (#11968) close tikv/tikv#11893, ref tikv/tikv#11893, ref pingcap/tidb#31884 Signed-off-by: xiejiandong * copr: support push down mod, sysdate to TiKV (#11970) close tikv/tikv#11916, ref tikv/tikv#11916 Signed-off-by: xiejiandong * raftstore: propose in-memory pessimistic locks before prepare merge (#11758) ref tikv/tikv#11452 Signed-off-by: Yilin Chen * stats_monitor: reformat stats_monitor tick (#11972) Signed-off-by: lhy1024 * copr: support extract scalar value from bit (#11980) close tikv/tikv#11893 Signed-off-by: yisaer Co-authored-by: Ti Chi Robot * diagnosis: support get cpu time of threads for macOS (#11978) close tikv/tikv#11977 Add the variant for macOS into `components/tikv_util/src/sys/thread.rs` Signed-off-by: Zhenchi Co-authored-by: Ti Chi Robot Co-authored-by: Jay * rust: update toolchain to fix missing rls (#11954) close tikv/tikv#11953 Signed-off-by: Connor1996 Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot Co-authored-by: Yilin Chen * cdc: separate resolved region outliers (#11991) Separate broadcasing outlier regions and normal regions, so 1) downstreams know where they should send resolve lock requests, and 2) resolved ts of normal regions does not fallback. close pingcap/tiflow#4516 close pingcap/tiflow#4311 ref pingcap/tiflow#4146 Signed-off-by: Neil Shen * *: collect key ranges for the read request in ResourceMeteringTag (#11995) close tikv/tikv#11988 *: collect key ranges for the read request in ResourceMeteringTag Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot * split_controller: move sample_threshold check out of the split_key iteration (#11986) split_controller: move sample_threshold check out of the split_key iteration Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot * copr: support insertutf8 and lowerutf8 (#11987) ref tikv/tikv#5751 Signed-off-by: yisaer Co-authored-by: Ti Chi Robot * copr: support rest greatest/least functions (#12003) ref tikv/tikv#5751 Signed-off-by: yisaer Co-authored-by: Ti Chi Robot * raftstore: force compact with gentleness (#12006) * force compact with gentleness Signed-off-by: tabokie * address comment Signed-off-by: tabokie * split_controller: refine the LOAD_BASE_SPLIT_EVENT metrics label definitions (#12010) split_controller: refine the LOAD_BASE_SPLIT_EVENT metrics label definitions Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot * split_controller: fix the incorrect sampled key ranges number check (#12013) close tikv/tikv#12012 split_controller: fix the incorrect sampled key ranges number check Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot * raftstore: Support adjusting max_inflight_msgs dynamically (#11866) close tikv/tikv#11865 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot * copr: fix missing flag solvement for bit column in decoding (#12016) ref tikv/tikv#7, close pingcap/tidb#32506 Signed-off-by: yisaer Co-authored-by: Ti Chi Robot * file_system: overhaul proc-based io tracing utilities (#11873) ref tikv/tikv#10867, fix tikv/tikv#11775 Signed-off-by: tabokie * raftstore: Wait for the apply index equals to the commit index (#11716) ref tikv/tikv#10483 Signed-off-by: 
v01dstar * copr: revert 11968/11980/12016 (#12030) ref tikv/tikv#11968, ref tikv/tikv#11980, ref tikv/tikv#12016, ref pingcap/tidb#32506 Signed-off-by: yisaer * build: fix a Mac build issue (#12027) ref tikv/tikv#10867, ref tikv/tikv#11873 Signed-off-by: tabokie Co-authored-by: Ti Chi Robot * copr: Extra physical table id column (#11931) close tikv/tikv#11888 Added EXTRA_PHYSICAL_TABLE_ID_COL_ID support, to support TiDB's table partition dynamic prune mode, where a single request includes multiple partitions, but when Pessimistic lock (like `SELECT FOR UPDATE`) or ongoing transaction (having something in the tidb session local transaction buffer) each row needs its partition id / physical table ID to either lock that row or to check against the transaction buffer. Signed-off-by: Mattias Jonsson Co-authored-by: Ti Chi Robot * *: hotfix panic from tokio-timer (#12004) close tikv/tikv#11940, ref tikv/tikv#11940 *: hotfix panic from tokio-timer by enlarging the length of the level vector Signed-off-by: you06 * build: add arm64 check on SSE in makefile (#12035) close tikv/tikv#12034 Signed-off-by: Jin Dong Co-authored-by: Yilin Chen Co-authored-by: Ti Chi Robot * replication mode: sync state to pd (#11751) Signed-off-by: disksing Co-authored-by: Ti Chi Robot * raftstore: revert pessimistic locks status when failing to propose PrepareMerge (#11985) ref tikv/tikv#11452 Signed-off-by: Yilin Chen * copr: support push bit column down (#12037) ref pingcap/tidb#30738 Signed-off-by: yisaer Co-authored-by: Ti Chi Robot * split_controller: add more sample function test cases (#12058) split_controller: add more sample function test cases Signed-off-by: JmPotato * copr: fix greatest/least time/date args type (#12056) Signed-off-by: yisaer * cdc: advancing resolved ts correctly for clusters with tiflash (#12050) close pingcap/tiflow#4461 cdc: advancing resolved ts correctly for clusters with tiflash Signed-off-by: qupeng Co-authored-by: Ti Chi Robot * *: bump master branch version to v6.0.0-alpha (#12077) close tikv/tikv#12075 Signed-off-by: Yilin Chen * raftstore: check uninitialized destroy for merge (#12055) close tikv/tikv#12048 When a peer is destroyed without being initialized, it will store itself to peer list and the region epoch is missing. In merge if such state is detected, it should abort merging instead of panicking. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot * *: Add TruncateManager and TruncateWorker (#11553) ref tikv/rfcs#81 Signed-off-by: longfangsong Co-authored-by: Ti Chi Robot * TiKV supports buckets (#11763) ref tikv/tikv#11759 Signed-off-by: qi.xu Co-authored-by: qi.xu Co-authored-by: Ti Chi Robot * api_version: Codec for RawKV key (#12036) ref tikv/tikv#11965 Key of RawKV is encoded as `user-key + memcomparable-padding + timestamp` in API v2. 
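To illustrate the `user-key + memcomparable-padding + timestamp` layout mentioned above, here is a hedged sketch of the common memcomparable byte encoding (8-byte groups, zero padding, and a marker byte recording how many bytes of the group are real) with an 8-byte timestamp suffix. The exact constants, the keyspace prefix, and whether the timestamp is stored bit-inverted for descending order are not asserted here; the actual `api_version` codec is authoritative.

```rust
const GROUP_SIZE: usize = 8;
const PAD: u8 = 0;
const MARKER_BASE: u8 = 0xF7; // marker = MARKER_BASE + number of real bytes in the group

/// Sketch of memcomparable encoding: byte-wise comparison of encoded keys
/// matches comparison of the original user keys.
fn encode_memcomparable(user_key: &[u8]) -> Vec<u8> {
    let mut out = Vec::new();
    let mut idx = 0;
    loop {
        let remain = &user_key[idx..];
        let n = remain.len().min(GROUP_SIZE);
        // Copy up to 8 real bytes, pad the group to 8 bytes, then record how
        // many bytes were real so ordering and decoding stay unambiguous.
        out.extend_from_slice(&remain[..n]);
        out.extend(std::iter::repeat(PAD).take(GROUP_SIZE - n));
        out.push(MARKER_BASE + n as u8);
        idx += n;
        if n < GROUP_SIZE {
            break; // last (partial or empty) group written
        }
        if idx == user_key.len() {
            // Key length is an exact multiple of 8: emit one trailing
            // all-padding group so that a key sorts before its extensions.
            out.extend(std::iter::repeat(PAD).take(GROUP_SIZE));
            out.push(MARKER_BASE);
            break;
        }
    }
    out
}

/// Appends an 8-byte timestamp suffix (big-endian here purely for illustration).
fn encode_raw_key(user_key: &[u8], ts: u64) -> Vec<u8> {
    let mut key = encode_memcomparable(user_key);
    key.extend_from_slice(&ts.to_be_bytes());
    key
}

fn main() {
    println!("{:x?}", encode_raw_key(b"hello", 42));
    assert!(encode_memcomparable(b"ab") < encode_memcomparable(b"abc"));
    assert!(encode_memcomparable(b"abc") < encode_memcomparable(b"abd"));
}
```

The fixed-width timestamp suffix keeps all versions of one user key adjacent while the padding scheme preserves the user-key ordering.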
Signed-off-by: pingyu Co-authored-by: Andy Lok Co-authored-by: Ti Chi Robot * *: fix some typos (#12066) Signed-off-by: cuishuang Co-authored-by: Ti Chi Robot * raftstore: fix stale message cause panic (#12054) close tikv/tikv#12023 raftstore: fix stale message cause panic Signed-off-by: linning * raftstore: ignore async fetch result when the peer is pending removed (#12038) close tikv/tikv#11973, close tikv/tikv#12026 Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot * *: remove part about wechat in doc (#12101) close tikv/tikv#12100 Signed-off-by: jackwener * engine: upgrade raft engine (#12095) ref tikv/tikv#165, ref tikv/raft-engine#165, ref tikv/tikv#11119 Signed-off-by: Randy Co-authored-by: Xinye Tao * *: Fix possible undefined behavior for transmuting vec (#12096) close tikv/tikv#12070 Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot * github: Remove the pingcap/tidb-ansible from the PR template (#12102) close tikv/tikv#12103 Signed-off-by: hi-rustin Co-authored-by: Ti Chi Robot * copr: extract reschedule to inject a fail point (#12108) Signed-off-by: Zhenchi * raftstore: reactivate in-memory pessimistic locking when leader transfer fails (#11883) ref tikv/tikv#11452 Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot * *: check memory locks for replica read only on the leader (#12115) close tikv/tikv#12109 Consider a reader that sends a read index to the peer it supposes to be the leader, but by the time that peer receives the read index message, it has stepped down to a follower. Without this commit, a peer will check the locks and re-fill the read index context with the result regardless of its role. So, when the read index request is redirected to the new leader again, it no longer carries the context of lock checking. This commit changes it to only do the check when the peer is the leader. Then, the read index request will remain unchanged before being redirected to the leader. If the lease of the leader expires, it is still safe to check on the uncertain leader. If the heartbeat check passes and no new leader is elected, then the check works fine. If there is a new leader, when the old leader becomes a follower, it will clear its pending read index. Then, the reader has to resend the read index again to the correct leader. So, generally it is safe as long as we guarantee the check only happens on the leader. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot * apiv2 rawkv: logical deletion flag (#12090) ref tikv/tikv#11965 Signed-off-by: haojinming Co-authored-by: Ti Chi Robot * copr: add projection (#10689) close tikv/tikv#12114 Signed-off-by: ichn-hu Co-authored-by: Ti Chi Robot Co-authored-by: Alex Chi * raftstore: adjust raft max msg size dynamically (#12018) close tikv/tikv#12017 Signed-off-by: glorv Co-authored-by: Ti Chi Robot * *: update deps (#12098) - Upgrade grpcio to 0.10 - Upgrade openssl to 1.1.1m - Remove dependency on prost Signed-off-by: Jay Lee * tests: make raftstore tests about transactions more stable (#12122) close tikv/tikv#12120 A new fail point "after_propose" is added to insert a callback from the test. Then we can ensure we release other key fail points after they take effect. The other change is to retry requests automatically to avoid stale command errors.
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot * metrics: support multi k8s in grafana dashboards (#12008) close tikv/tikv#12104 Signed-off-by: just1900 * storage: set in-memory pessimistic lock feature gate to 6.0.0 (#12078) ref tikv/tikv#11452 The in-memory pessimistic lock feature should be only enabled after all TiKV instances have been upgraded to the new version. Otherwise, transferring leader may be blocked because of the new protocol and procedure when in-memory pessimistic locks exist. So, we need to use a feature gate to make sure this feature is only enabled after all TiKVs have been upgraded to the new version. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot * raftstore,cdc: pass leader transferee to cdc observer (#12124) Signed-off-by: qupeng Co-authored-by: Ti Chi Robot * config: simplify handling of nullable configurations (#12133) Support deriving `OnlineConfig` on any `Option`. One caveat is such `T` must implement `Default`. * simplify handling of nullable configurations Signed-off-by: tabokie * add credit for copied code Signed-off-by: tabokie * patch unit test Signed-off-by: tabokie Co-authored-by: Ti Chi Robot * raftstore: support dynamic adjust max_batch_size (#11974) close tikv/tikv#11982 Signed-off-by: glorv * server: disallow coexist of raft engine and raftdb (#12047) * disallow coexist of raft engine and raftdb Signed-off-by: tabokie * add defer block Signed-off-by: tabokie * minor cleanup Signed-off-by: tabokie * check not clear Signed-off-by: tabokie * print out rm failure Signed-off-by: tabokie * print log for non-empty target engine Signed-off-by: tabokie * even more logs Signed-off-by: tabokie * use a state machine to monitor the switch Signed-off-by: tabokie * fix and add test Signed-off-by: tabokie * clippy Signed-off-by: tabokie * address comments Signed-off-by: tabokie * address comments Signed-off-by: tabokie * remove tempfile from engine_traits Signed-off-by: tabokie * address comments Signed-off-by: tabokie Co-authored-by: Ti Chi Robot * *: add more metrics and logs for in-memory pessimistic locking (#12089) ref tikv/tikv#11452 It adds logs when proposing locks during transferring leader and moving locks to new split regions. The metrics about the memory used by pessimistic locks and how many locks fail to be inserted due to reaching the memory limit are displayed in the grafana. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot * *: notify snapshot apply (#12126) close tikv/tikv#12057 In the past, it relies on base tick to check if a snapshot is applied. If the base tick is set to a very large value, it will take a long time to detect snapshot is finished. This PR adds a casual message to notify peer actively. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot * txn: Fix assertion fail on non-pessimistic key with conflict (#12123) ref tikv/tikv#12113 txn: Fix assertion fail on non-pessimistic key with conflict (ref #12113) Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot * cmd: link c++ explicitly (#12145) close tikv/tikv#12144 In the past, we rely on librocksdb-sys to include the required stdc++ library. This is not reliable after upgrading grpcio, which also needs the same library. I suspect it doesn't include all the required symbols. Anyway, this PR chooses to link to stdc++ explicitly in binary. 
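For reference, linking the C++ standard library explicitly from a Rust binary crate is typically done with a Cargo build-script directive like the one below (a sketch only; where TiKV actually emits the directive is not shown in this message):

```rust
// build.rs (sketch): ask rustc to link libstdc++ into the final binary so that
// C++ symbols needed by native dependencies such as RocksDB and gRPC are present.
fn main() {
    println!("cargo:rustc-link-lib=dylib=stdc++");
}
```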
Signed-off-by: Jay Lee * Fix logic of error string match in `bad-ssts` (#12049) Signed-off-by: Xintao Co-authored-by: Ti Chi Robot * APIV2: Implement RawMvccSnapshot & RawMvccIterator (#12087) ref tikv/tikv#11965 Signed-off-by: haojinming * *: add tip in contributing.md (#12129) close tikv/tikv#12142 polish contributing.md Signed-off-by: jackwener Co-authored-by: Ti Chi Robot * dr-ausosync: report min resolved ts (#12052) ref tikv/pd#4686 Signed-off-by: lhy1024 Co-authored-by: Ti Chi Robot * txn: support read-consistency read with tso checking (#12097) Signed-off-by: cfzjywxk * raftstore: collect statistics of buckets (#12080) ref tikv/tikv#11718 Collect read/write flow of buckets. Signed-off-by: 5kbpers Co-authored-by: Ti Chi Robot * config: enable in-memory pessimistic locks by default (#12153) close tikv/tikv#11452 Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot * rsmeter: add integration tests for cpu (#12140) Signed-off-by: Zhenchi Co-authored-by: Ti Chi Robot * cdc: don't emit resolved timestamps before scan (#12156) Signed-off-by: qupeng * resolved_ts: fix failpoints (#12170) Signed-off-by: lhy1024 Co-authored-by: Ti Chi Robot * cdc: add a metrics for count of pending tasks (#12169) Signed-off-by: qupeng * raft_log_engine: fix managedfs create encrypted file (#12163) close tikv/tikv#12162 Signed-off-by: glorv Co-authored-by: Xinye Tao * config: limit backup workers cap (#12141) ref #11804 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot * split_controller: refine the sample function (#12177) split_controller: refine the sample function Signed-off-by: JmPotato * cdc: remove useless deps and fix typos (#12165) close tikv/tikv#12183 Remove useless deps and fix typos Signed-off-by: hi-rustin Co-authored-by: Ti Chi Robot * storage: support scale store scheduler pool size (#12068) close tikv/tikv#12067 Signed-off-by: glorv * *: update jemalloc version to fix profiling problem (#12179) close tikv/tikv#12180 *: upgrade jemalloc version to fix sampling problem during profiling Signed-off-by: cosven Co-authored-by: Jay * split_controller: refine and add some test cases for sample function (#12189) ref tikv/tikv#12185 split_controller: refine and add some test cases for sample function Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot * Foreground Quota Limiter: limit the cpu, bandwidth to provide stable QPS (#12151) close tikv/tikv#11855, ref tikv/tikv#11856 Add a global foreground quota limiter (a speed limiter) to record the CPU resource overhead occupied by various requests. When the limiter reaches its quota, the request is forced to sleep for a period of time to compensate. Statistics only cover part of the processing and are therefore approximate.
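A minimal sketch of the sleep-to-compensate mechanism just described (illustrative only; the names, units, and numbers are not TiKV's): each request reports an approximate cost, and once the accumulated cost exceeds the per-interval quota the caller sleeps proportionally to the excess.

```rust
use std::thread;
use std::time::Duration;

/// Token-bucket style limiter: consuming more than the per-interval quota
/// makes the caller sleep long enough to bring the rate back under the limit.
struct QuotaLimiter {
    /// Allowed "cost" (e.g. CPU time in microseconds) per refill interval.
    quota_per_interval: u64,
    refill_interval: Duration,
    consumed: u64,
}

impl QuotaLimiter {
    fn new(quota_per_interval: u64, refill_interval: Duration) -> Self {
        QuotaLimiter { quota_per_interval, refill_interval, consumed: 0 }
    }

    /// Records `cost` for the current request and returns how long the caller
    /// should sleep to stay within the quota.
    fn consume(&mut self, cost: u64) -> Duration {
        self.consumed += cost;
        if self.consumed <= self.quota_per_interval {
            return Duration::ZERO;
        }
        // Over quota: sleep proportionally to the excess.
        let excess = self.consumed - self.quota_per_interval;
        self.refill_interval.mul_f64(excess as f64 / self.quota_per_interval as f64)
    }

    /// Called once per refill interval to forget past consumption.
    fn refill(&mut self) {
        self.consumed = 0;
    }
}

fn main() {
    let mut limiter = QuotaLimiter::new(1_000, Duration::from_millis(100));
    for &cost in [400u64, 400, 400].iter() {
        let delay = limiter.consume(cost);
        if !delay.is_zero() {
            thread::sleep(delay); // compensate by delaying the request
        }
    }
    limiter.refill();
}
```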
Signed-off-by: tonyxuqqi Signed-off-by: qi.xu Signed-off-by: zhangjinpeng1987 Signed-off-by: Xintao Co-authored-by: tonyxuqqi Co-authored-by: qi.xu Co-authored-by: zhangjinpeng1987 * UnifiedReadPool: support adjusting the pool size dynamically (#11783) close tikv/tikv#11781 Signed-off-by: Wenbo Zhang * raftstore: drop raft message by replication state (#12076) close tikv/tikv#12074 Signed-off-by: disksing * copr: return buckets version for unary call (#12184) Signed-off-by: youjiali1995 * tikv-ctl: support raft engine ctl (#12019) * tikv-ctl: support raft engine ctl Signed-off-by: tabokie * kebab case Signed-off-by: tabokie * fix build Signed-off-by: tabokie * switch to 0.1.x branch Signed-off-by: tabokie Co-authored-by: Ti Chi Robot * dr-ausosync: make report interval configurable (#12157) ref tikv/pd#4686 Signed-off-by: lhy1024 Co-authored-by: Ti Chi Robot * resolved_ts: fix oom and add more metrics (#12190) * add metrics Signed-off-by: 5kbpers * fix Signed-off-by: 5kbpers * remove unused code Signed-off-by: 5kbpers * add timeout Signed-off-by: 5kbpers Co-authored-by: qupeng Co-authored-by: Ti Chi Robot * avoid to banish region leaders when rolling upgrade (#12167) Signed-off-by: qupeng Co-authored-by: Ti Chi Robot * rocksdb: use new pipelined-write to replace multibatch-write (#12060) close tikv/tikv#12059 Signed-off-by: Little-Wallace * add vdso to the pprof blocklist (#12201) (#12209) close tikv/tikv#9765, ref tikv/tikv#12201 Signed-off-by: ti-srebot Co-authored-by: YangKeao * raftstore:Fix memory metrics overflow on raft msgs (#12186) (#12224) close tikv/tikv#12160, ref tikv/tikv#12186 Fix raft msg memory metrics overflow. Signed-off-by: ti-srebot Co-authored-by: tier-cap Co-authored-by: Ti Chi Robot * do not force compact unreplicated entries (#12181) (#12228) close tikv/tikv#12161, ref tikv/tikv#12181 Signed-off-by: ti-srebot Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot * resolved_ts: fix shutdown panic (#12244) (#12245) close tikv/tikv#12231, ref tikv/tikv#12244 resolved_ts: fix shutdown panic Signed-off-by: ti-srebot Co-authored-by: NingLin-P * Quota limiter: make method to delay cpu more precise (#12219) (#12242) close tikv/tikv#12218, ref tikv/tikv#12219 Use the results returned by the cpu limiter directly without extra processing Signed-off-by: ti-srebot Co-authored-by: Xintao Co-authored-by: you06 * raftstore: validate bucket config only when it's enabled (#12237) (#12239) close tikv/tikv#12235, ref tikv/tikv#12236, ref tikv/tikv#12237 Shouldn't report config related errors when bucket is disabled. Signed-off-by: ti-srebot Co-authored-by: Lei Zhao Co-authored-by: Ti Chi Robot * res_meter: fix assertion failure (#12252) (#12261) close tikv/tikv#12234, ref tikv/tikv#12252 Signed-off-by: ti-srebot Co-authored-by: Yexiang Zhang * cdc: remove downstreams correctly when uninitialized (#12262) (#12268) ref tikv/tikv#12262 cdc: remove downstreams correctly when uninitialized Signed-off-by: ti-srebot Co-authored-by: qupeng * Update procinfo-rs to support pid status parse in kernel 5.x (#12263) (#12265) ref tikv/tikv#12263 Update procinfo-rs to support pid status parse in kernel 5.x. Signed-off-by: ti-srebot Co-authored-by: Xintao Co-authored-by: Ti Chi Robot * Quota limiter: increase refill duration to avoid much context switch cost (#12264) (#12266) ref tikv/tikv#12264 Quota limiter: increase refill duration to avoid much context switch cost. 
Signed-off-by: ti-srebot Co-authored-by: Xintao Co-authored-by: Ti Chi Robot * raftstore: only persist merge target if the merge is known to be succeeded (#12251) (#12271) close tikv/tikv#12232, ref tikv/tikv#12251 raftstore: only persist merge target if the merge is known to be succeeded Signed-off-by: ti-srebot Co-authored-by: NingLin-P Co-authored-by: you06 * storage: send old value to CDC after in-memory pessimistic locking (#12280) (#12282) close tikv/tikv#12279, ref tikv/tikv#12280 Send old values through the TxnExtraScheduler after in-memory pessimisitc locking successfully. This will help improve the old value hit rate of CDC when enabling in-memory pessimistic locks. Signed-off-by: ti-srebot Co-authored-by: Yilin Chen * server: fix issues with critical config check (#12250) (#12284) close tikv/tikv#12238, ref tikv/tikv#12238, ref tikv/tikv#12250 Signed-off-by: ti-srebot Co-authored-by: Xinye Tao * metrics: remove the incorrect rate() for worker pending tasks (#12152) (#12249) close tikv/tikv#11915, ref tikv/tikv#12152 tikv_worker_pending_task_total is a gauge. We should directly use its value to represent the pending tasks instead of using rate(). This PR removes the rate() on these metrics. Signed-off-by: ti-srebot Co-authored-by: Yilin Chen Co-authored-by: Ti Chi Robot * server: do not rename directories when migrating raft data (#12273) (#12298) close tikv/tikv#12269, ref tikv/tikv#12269, ref tikv/tikv#12273 Signed-off-by: ti-srebot Co-authored-by: Xinye Tao * *: update grpcio (#12203) (#12299) close tikv/tikv#12198, close tikv/tikv#12202, ref tikv/tikv#12203 update grpcio to fix UAF Signed-off-by: ti-srebot Co-authored-by: Jay * refine some versions Signed-off-by: CalvinNeo * add except exec_ctx Signed-off-by: CalvinNeo * use exec_log_index/term Signed-off-by: CalvinNeo * remove exec_ctc Signed-off-by: CalvinNeo * before handle cdc Signed-off-by: CalvinNeo * it compiles Signed-off-by: CalvinNeo * fmt Signed-off-by: CalvinNeo * Default disable raft engine (#12317) * disable raft engine Signed-off-by: tabokie * fix lint and test Signed-off-by: tabokie * bump version v6.0.0 Signed-off-by: purelind * fix linux Signed-off-by: CalvinNeo * address test deps Signed-off-by: CalvinNeo * Fix test Signed-off-by: CalvinNeo * f Signed-off-by: CalvinNeo * fix linux Signed-off-by: CalvinNeo * fix Signed-off-by: CalvinNeo * fix some files Signed-off-by: CalvinNeo * fix Signed-off-by: CalvinNeo * fix Signed-off-by: CalvinNeo * ignore some unsupported tests Signed-off-by: CalvinNeo * Fix test_delegate_subscribe_unsubscribe flaking test (#12309) (#12315) close tikv/tikv#12278, ref tikv/tikv#12309 Fix test_delegate_subscribe_unsubscribe flaking test. Use the corresponding id, otherwise there will be failures due to concurrent runs of the test. (We will assign downstream ids concurrently). 
Signed-off-by: ti-srebot
Co-authored-by: 二手掉包工程师
Co-authored-by: qupeng
Co-authored-by: Ti Chi Robot
* fix fmt
Signed-off-by: CalvinNeo
* disable more disk_full tests
Signed-off-by: CalvinNeo
* f
Signed-off-by: CalvinNeo
* disable
Signed-off-by: CalvinNeo
* remove stale peer
Signed-off-by: CalvinNeo
* pick pr 12301
Signed-off-by: CalvinNeo
* Fix build when openssl v1.1 and v3 are both installed (#59)
Signed-off-by: Wish
* Enable frame pointer (#60)
Signed-off-by: mornyx
* Upgrade pprof-rs to v0.9 (use frame-pointer), and enable CPU Profiling on ARM (#61)
Signed-off-by: mornyx
* reduce log
Signed-off-by: CalvinNeo
* raftstore: Fix append log entry oom (#12247)
close tikv/tikv#11379, close tikv/tikv#11598, close tikv/tikv#12107
When memory usage is at the high-water mark, add flow control on the append log entries and disable the log entry cache evict functions.
Signed-off-by: tier-cap
Co-authored-by: Ti Chi Robot
Signed-off-by: CalvinNeo
* fix pd.rs of reporting to PD
Signed-off-by: CalvinNeo
* fix
Signed-off-by: CalvinNeo
* raftstore: Introduce force leader state (#11932)
close tikv/tikv#6107, ref tikv/tikv#10483
Signed-off-by: Connor1996
* raftstore: Wait ticks for hibernated peer when doing force leader (#12364)
ref tikv/tikv#10483
Force leader is rejected on a peer who is already a leader. For the hibernated leader, it doesn't step down to follower when quorum is missing due to not ticking election. So wait ticks in the force leader process for hibernated peers to make sure election ticking is performed.
Signed-off-by: Connor1996
Co-authored-by: Ti Chi Robot
* raftstore: Make unsafe recovery wait apply cover snapshot apply cases ref #10483 (#12308)
ref tikv/tikv#10483
Signed-off-by: v01dstar
* raftstore: Execute recovery plan via raft (#12022)
Signed-off-by: Connor1996
* Raftstore proxy 6.0 try (#63)
* raftstore: Introduce force leader state (#11932)
close tikv/tikv#6107, ref tikv/tikv#10483
Signed-off-by: Connor1996
* raftstore: Wait ticks for hibernated peer when doing force leader (#12364)
ref tikv/tikv#10483
Force leader is rejected on a peer who is already a leader. For the hibernated leader, it doesn't step down to follower when quorum is missing due to not ticking election. So wait ticks in the force leader process for hibernated peers to make sure election ticking is performed.
Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot * raftstore: Make unsafe recovery wait apply cover snapshot apply cases ref #10483 (#12308) ref tikv/tikv#10483 Signed-off-by: v01dstar * raftstore: Execute recovery plan via raft (#12022) Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot Co-authored-by: Yang Zhang * Hotfix pprof (#65) Signed-off-by: mornyx * remove log Signed-off-by: CalvinNeo Co-authored-by: Ping Yu Co-authored-by: andylokandy Co-authored-by: Ti Chi Robot Co-authored-by: qupeng Co-authored-by: Devillove084 Co-authored-by: Harold Dost Co-authored-by: zhangjinpeng1987 Co-authored-by: Yexiang Zhang Co-authored-by: 沐 Co-authored-by: Yilin Chen Co-authored-by: Zhenchi Co-authored-by: Harold Dost Co-authored-by: NingLin-P Co-authored-by: Peng Guanwen Co-authored-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: Ryan Leung Co-authored-by: Xinye Tao Co-authored-by: xufei Co-authored-by: 3pointer Co-authored-by: LemonHX Co-authored-by: you06 Co-authored-by: guo-shaoge Co-authored-by: Wenxuan Co-authored-by: Zwb Co-authored-by: Wenxuan Co-authored-by: Jay Co-authored-by: Liqi Geng Co-authored-by: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> Co-authored-by: xiongjiwei Co-authored-by: Lei Zhao Co-authored-by: xiejiandong Co-authored-by: Qi Yu Co-authored-by: 5kbpers Co-authored-by: lhy1024 Co-authored-by: Yao Zongyou Co-authored-by: Connor Co-authored-by: tonyxuqqi Co-authored-by: qi.xu Co-authored-by: zhouqiang Co-authored-by: Xiang Zhang Co-authored-by: Mini256 Co-authored-by: Xiaoguang Sun Co-authored-by: Aarush Bhat Co-authored-by: Yifan Xu <30385241+xuyifangreeneyes@users.noreply.github.com> Co-authored-by: Suhaha Co-authored-by: goroutine Co-authored-by: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Co-authored-by: Croxx Co-authored-by: zhongyong jin Co-authored-by: Mattias Jonsson Co-authored-by: JmPotato Co-authored-by: hi-rustin Co-authored-by: Song Gao Co-authored-by: Neil Shen Co-authored-by: Yang Zhang Co-authored-by: Jin Dong Co-authored-by: disksing Co-authored-by: 龙方淞 Co-authored-by: cui fliter Co-authored-by: jakevin <30525741+jackwener@users.noreply.github.com> Co-authored-by: randy Co-authored-by: haojinming Co-authored-by: 虎 Co-authored-by: Alex Chi Co-authored-by: glorv Co-authored-by: Justin Huang Co-authored-by: Xintao Co-authored-by: haojinming Co-authored-by: cfzjywxk Co-authored-by: cosven Co-authored-by: Wallace Co-authored-by: ti-srebot <66930949+ti-srebot@users.noreply.github.com> Co-authored-by: YangKeao Co-authored-by: tier-cap Co-authored-by: purelind Co-authored-by: Wenxuan --- Cargo.lock | 3 +- components/error_code/src/raftstore.rs | 3 + components/pd_client/src/client.rs | 4 +- components/pd_client/src/util.rs | 2 + components/raftstore/Cargo.toml | 1 + components/raftstore/src/errors.rs | 9 + components/raftstore/src/store/fsm/peer.rs | 899 ++++++++++---- components/raftstore/src/store/fsm/store.rs | 146 ++- components/raftstore/src/store/msg.rs | 45 +- components/raftstore/src/store/peer.rs | 348 +++++- .../raftstore/src/store/peer_storage.rs | 5 + components/raftstore/src/store/util.rs | 14 - components/raftstore/src/store/worker/mod.rs | 3 +- components/raftstore/src/store/worker/pd.rs | 191 ++- .../raftstore/src/store/worker/raftlog_gc.rs | 1 + components/test_raftstore/src/cluster.rs | 87 +- components/test_raftstore/src/pd.rs | 58 +- .../failpoints/cases/test_unsafe_recovery.rs | 444 ++++++- tests/integrations/raftstore/mod.rs | 2 +- .../raftstore/test_unsafe_recover.rs | 86 -- 
.../raftstore/test_unsafe_recovery.rs | 1070 +++++++++++++++++ 21 files changed, 2792 insertions(+), 629 deletions(-) delete mode 100644 tests/integrations/raftstore/test_unsafe_recover.rs create mode 100644 tests/integrations/raftstore/test_unsafe_recovery.rs diff --git a/Cargo.lock b/Cargo.lock index b2efc0d1717..aa9679d4a2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2231,7 +2231,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#e22d8864c9579660424f83b62ca196c03ef85927" +source = "git+https://github.com/pingcap/kvproto.git#0e2f26c0a46ae7d666d6ca4410046a39e0c96f36" dependencies = [ "futures 0.3.15", "grpcio", @@ -3490,6 +3490,7 @@ dependencies = [ "fs2", "futures 0.3.15", "futures-util", + "getset", "into_other", "itertools 0.10.0", "keys", diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index eebd6fbe403..d7bf1af4ad5 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -28,6 +28,7 @@ define_error_codes!( DATA_IS_NOT_READY => ("DataIsNotReady", "", ""), DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), PENDING_PREPARE_MERGE => ("PendingPrepareMerge", "", ""), + RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -60,6 +61,8 @@ impl ErrorCodeExt for errorpb::Error { PROPOSAL_IN_MERGING_MODE } else if self.has_data_is_not_ready() { DATA_IS_NOT_READY + } else if self.has_recovery_in_progress() { + RECOVERY_IN_PROGRESS } else { UNKNOWN } diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index b9fdce8740b..9d97682302e 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -719,7 +719,7 @@ impl PdClient for RpcClient { fn store_heartbeat( &self, mut stats: pdpb::StoreStats, - report_opt: Option, + store_report: Option, dr_autosync_status: Option, ) -> PdFuture { let timer = Instant::now(); @@ -730,7 +730,7 @@ impl PdClient for RpcClient { .mut_interval() .set_end_timestamp(UnixSecs::now().into_inner()); req.set_stats(stats); - if let Some(report) = report_opt { + if let Some(report) = store_report { req.set_store_report(report); } if let Some(status) = dr_autosync_status { diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index dc58506cdbd..970fcc8feff 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -548,6 +548,8 @@ impl PdConnector { let addr_trim = trim_http_prefix(addr); let channel = { let cb = ChannelBuilder::new(self.env.clone()) + .max_send_message_len(-1) + .max_receive_message_len(-1) .keepalive_time(Duration::from_secs(10)) .keepalive_timeout(Duration::from_secs(3)); self.security_mgr.connect(cb, addr_trim) diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 935e5f59f71..d90c0b3563d 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -42,6 +42,7 @@ file_system = { path = "../file_system", default-features = false } fs2 = "0.4" futures = "0.3" futures-util = { version = "0.3.1", default-features = false, features = ["io"] } +getset = "0.1" into_other = { path = "../into_other", default-features = false } itertools = "0.10" keys = { path = "../keys", default-features = false } diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 801626ac5fc..9929f43b7f7 100644 --- 
a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -60,6 +60,9 @@ pub enum Error { #[error("store ids {0:?}, errmsg {1}")] DiskFull(Vec, String), + #[error("region {0} is in the recovery progress")] + RecoveryInProgress(u64), + #[error( "key {} is not in region key range [{}, {}) for region {}", log_wrappers::Value::key(.0), @@ -238,6 +241,11 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_region_not_initialized(e); } + Error::RecoveryInProgress(region_id) => { + let mut e = errorpb::RecoveryInProgress::default(); + e.set_region_id(region_id); + errorpb.set_recovery_in_progress(e); + } _ => {} }; @@ -271,6 +279,7 @@ impl ErrorCodeExt for Error { Error::RegionNotFound(_) => error_code::raftstore::REGION_NOT_FOUND, Error::NotLeader(..) => error_code::raftstore::NOT_LEADER, Error::DiskFull(..) => error_code::raftstore::DISK_FULL, + Error::RecoveryInProgress(..) => error_code::raftstore::RECOVERY_IN_PROGRESS, Error::StaleCommand => error_code::raftstore::STALE_COMMAND, Error::RegionNotInitialized(_) => error_code::raftstore::REGION_NOT_INITIALIZED, Error::KeyNotInRegion(..) => error_code::raftstore::KEY_NOT_IN_REGION, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 6b3ecdc32ad..c26c2a3ad21 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5,16 +5,14 @@ use std::borrow::Cow; use std::cell::Cell; use std::collections::Bound::{Excluded, Unbounded}; use std::collections::VecDeque; -use std::iter::Iterator; -use std::sync::{atomic::AtomicUsize, atomic::Ordering, Arc}; +use std::iter::{FromIterator, Iterator}; +use std::sync::{Arc, Mutex}; use std::time::Instant; use std::{cmp, mem, u64}; use batch_system::{BasicMailbox, Fsm}; use collections::{HashMap, HashSet}; -use engine_traits::{ - Engines, KvEngine, RaftEngine, SSTMetaInfo, WriteBatch, WriteBatchExt, WriteOptions, -}; +use engine_traits::{Engines, KvEngine, RaftEngine, SSTMetaInfo, WriteBatchExt}; use engine_traits::{CF_LOCK, CF_RAFT}; use error_code::ErrorCodeExt; use fail::fail_point; @@ -23,7 +21,7 @@ use kvproto::errorpb; use kvproto::import_sstpb::SwitchMode; use kvproto::kvrpcpb::DiskFullOpt; use kvproto::metapb::{self, Region, RegionEpoch}; -use kvproto::pdpb::{CheckPolicy, StoreStats}; +use kvproto::pdpb::{self, CheckPolicy}; use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, CmdType, PutRequest, RaftCmdRequest, RaftCmdResponse, Request, StatusCmdType, StatusResponse, @@ -58,7 +56,7 @@ use crate::store::cmd_resp::{bind_term, new_error}; use crate::store::fsm::store::{PollContext, StoreMeta}; use crate::store::fsm::{ apply, ApplyMetrics, ApplyTask, ApplyTaskRes, CatchUpLogs, ChangeObserver, ChangePeer, - ExecResult, StoreInfo, + ExecResult, }; use crate::store::hibernate_state::{GroupState, HibernateState}; use crate::store::local_metrics::RaftMetrics; @@ -66,14 +64,16 @@ use crate::store::memory::*; use crate::store::metrics::*; use crate::store::msg::{Callback, ExtCallback, InspectedRaftMessage}; use crate::store::peer::{ - ConsistencyState, Peer, PersistSnapshotResult, StaleState, TRANSFER_LEADER_COMMAND_REPLY_CTX, + ConsistencyState, ForceLeaderState, Peer, PersistSnapshotResult, StaleState, + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, + TRANSFER_LEADER_COMMAND_REPLY_CTX, }; -use crate::store::peer_storage::write_peer_state; use 
crate::store::transport::Transport; -use crate::store::util::{is_learner, KeysInfoFormatter}; +use crate::store::util::{is_learner, KeysInfoFormatter, LeaseState}; use crate::store::worker::{ - ConsistencyCheckTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, - SplitCheckTask, + new_change_peer_v2_request, ConsistencyCheckTask, RaftlogFetchTask, RaftlogGcTask, + ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, }; use crate::store::PdTask; use crate::store::RaftlogFetchResult; @@ -148,12 +148,6 @@ where /// Before actually destroying a peer, ensure all log gc tasks are finished, so we /// can start destroying without seeking. logs_gc_flushed: bool, - - /// To make sure the reported store/peer meta is up to date, each peer has to wait for the log - /// at its target commit index to be applied. The last peer does so triggers the next procedure - /// which is reporting the store/peer meta to PD. - unsafe_recovery_target_commit_index: Option, - unsafe_recovery_wait_apply_counter: Option>, } pub struct BatchRaftCmdRequestBuilder @@ -272,8 +266,6 @@ where trace: PeerMemoryTrace::default(), delayed_destroy: None, logs_gc_flushed: false, - unsafe_recovery_target_commit_index: None, - unsafe_recovery_wait_apply_counter: None, }), )) } @@ -328,8 +320,6 @@ where trace: PeerMemoryTrace::default(), delayed_destroy: None, logs_gc_flushed: false, - unsafe_recovery_target_commit_index: None, - unsafe_recovery_wait_apply_counter: None, }), )) } @@ -677,12 +667,6 @@ where } } } - PeerMsg::UpdateRegionForUnsafeRecover(region) => { - self.on_update_region_for_unsafe_recover(region) - } - PeerMsg::UnsafeRecoveryWaitApply(counter) => { - self.on_unsafe_recovery_wait_apply(counter) - } } } // Propose batch request which may be still waiting for more raft-command @@ -744,189 +728,163 @@ where } } - fn on_update_region_for_unsafe_recover(&mut self, region: Region) { - let mut new_peer_list = HashSet::default(); - for peer in region.get_peers() { - new_peer_list.insert(peer.get_id()); - } - let to_be_removed: Vec = self - .region() - .get_peers() - .iter() - .filter(|&peer| !new_peer_list.contains(&peer.get_id())) - .map(|peer| peer.get_id()) - .collect(); - if to_be_removed.is_empty() - && self.region().get_start_key() == region.get_start_key() - && self.region().get_end_key() == region.get_end_key() - { - // Nothing to be updated, return directly. 
- return; - } - info!( - "updating the reigon for unsafe recover, original: {:?}, target: {:?}", - self.region(), - region - ); - if self.fsm.peer.has_valid_leader() { - panic!("region update for unsafe recover should only occur in leaderless reigons"); - } - if self.fsm.peer.raft_group.store().applied_index() - != self.fsm.peer.raft_group.store().commit_index() - { + fn on_unsafe_recovery_pre_demote_failed_voters( + &mut self, + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + ) { + if self.fsm.peer.unsafe_recovery_state.is_some() { warn!( - "cannot proceed region update for unsafe recover, applied index is not equal to commit index" + "Unsafe recovery, demote failed voters has already been initiated"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), ); + syncer.abort(); return; } - let region_state_key = keys::region_state_key(region.get_id()); - let original_region_state = match self - .ctx - .engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) - { - Ok(Some(region_state)) => region_state, - Ok(None) => { - panic!("Can't find RegionLocalState while updating {:?}", region); - } - Err(e) => { - panic!( - "Fail to look up RegionLocalState while updating {:?} err {:?}", - region, e - ); - } - }; - let mut kv_wb = self.ctx.engines.kv.write_batch(); - write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None).unwrap_or_else(|e| { - panic!( - "fails to write RegionLocalState {:?} into write brach, err {:?}", - region, e - ) - }); - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - if let Err(e) = kv_wb.write_opt(&write_opts) { - panic!("fail to update RegionLocalstate {:?} err {:?}", region, e); + if !self.fsm.peer.is_force_leader() { + error!( + "Unsafe recovery, demoting failed voters failed, since this peer is not forced leader"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), + ); + return; } - { - let mut meta = self.ctx.store_meta.lock().unwrap(); - meta.set_region( - &self.ctx.coprocessor_host, - region.clone(), - &mut self.fsm.peer, + if self.fsm.peer.in_joint_state() { + info!( + "Unsafe recovery, already in joint state, exit first"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), ); - if meta - .region_ranges - .remove(&enc_end_key(original_region_state.get_region())) - .is_none() - { - panic!( - "{} original region does not exist in store meta", - self.fsm.peer.tag - ); - } - for (_, id) in meta.region_ranges.range(( - Excluded(keys::data_key(region.get_start_key())), - Unbounded::>, - )) { - let exist_region = &meta.regions[id]; - if enc_start_key(exist_region) >= keys::data_end_key(region.get_end_key()) { - break; + let failed = Arc::new(Mutex::new(false)); + let failed_clone = failed.clone(); + let callback = Callback::::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + *failed_clone.lock().unwrap() = true; + error!( + "Unsafe recovery, fail to exit residual joint state"; + "err" => ?resp.response.get_header().get_error(), + ); } - panic!( - "{:?} is overlapped with an existing region {:?}", - region, exist_region - ); - } - if meta - .region_ranges - .insert(enc_end_key(®ion), region.get_id()) - .is_some() - { - panic!( - "key conflicts while inserting region {:?} into store meta", - region - ); - } - } - for peer_id in to_be_removed.clone() { - let mut cc = eraftpb::ConfChangeV2::default(); - let mut ccs = eraftpb::ConfChangeSingle::default(); - 
ccs.set_change_type(eraftpb::ConfChangeType::RemoveNode); - ccs.set_node_id(peer_id); - cc.set_transition(eraftpb::ConfChangeTransition::Auto); - cc.mut_changes().push(ccs); - if let Err(e) = self.fsm.peer.raft_group.apply_conf_change(&cc) { - panic!("fail to apply conf change for unsafe recover {:?}", e); + })); + self.propose_raft_command_internal( + exit_joint_request(self.region(), &self.fsm.peer.peer), + callback, + DiskFullOpt::AllowedOnAlmostFull, + ); + + if !*failed.lock().unwrap() { + self.fsm.peer.unsafe_recovery_state = + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters, + target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), + demote_after_exit: true, + }); } + } else { + self.unsafe_recovery_demote_failed_voters(syncer, failed_voters); } - self.fsm - .peer - .peer_heartbeats - .retain(|&k, _| new_peer_list.contains(&k)); - self.fsm - .peer - .peers_start_pending_time - .retain(|&(k, _)| new_peer_list.contains(&k)); - for peer in to_be_removed { - self.fsm.peer.remove_peer_from_cache(peer); - } - self.fsm.peer.post_split(); - self.fsm.reset_hibernate_state(GroupState::Chaos); - self.register_raft_base_tick(); } - fn finish_unsafe_recovery_wait_apply(&mut self) { - if self - .fsm - .unsafe_recovery_wait_apply_counter - .as_ref() - .unwrap() - .fetch_sub(1, Ordering::Relaxed) - == 1 + fn unsafe_recovery_demote_failed_voters( + &mut self, + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + ) { + if let Some(req) = + demote_failed_voters_request(self.region(), &self.fsm.peer.peer, failed_voters) { - let mut stats = StoreStats::default(); - stats.set_store_id(self.store_id()); - let store_info = StoreInfo { - kv_engine: self.ctx.engines.kv.clone(), - raft_engine: self.ctx.engines.raft.clone(), - capacity: self.ctx.cfg.capacity.0, - }; - let task = PdTask::StoreHeartbeat { - stats, - store_info, - send_detailed_report: true, - dr_autosync_status: self - .ctx - .global_replication_state - .lock() - .unwrap() - .store_dr_autosync_status(), - }; - if let Err(e) = self.ctx.pd_scheduler.schedule(task) { - panic!("fail to send detailed report to pd {:?}", e); + info!( + "Unsafe recovery, demoting failed voters"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), + "req" => ?req); + let failed = Arc::new(Mutex::new(false)); + let failed_clone = failed.clone(); + let callback = Callback::::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + *failed_clone.lock().unwrap() = true; + error!( + "Unsafe recovery, fail to finish demotion"; + "err" => ?resp.response.get_header().get_error(), + ); + } + })); + self.propose_raft_command_internal(req, callback, DiskFullOpt::AllowedOnAlmostFull); + if !*failed.lock().unwrap() { + self.fsm.peer.unsafe_recovery_state = + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters: vec![], // No longer needed since here. 
+ target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), + demote_after_exit: false, + }); } + } else { + warn!( + "Unsafe recovery, no need to demote failed voters"; + "region" => ?self.region(), + ); } - self.fsm.unsafe_recovery_target_commit_index = None; - self.fsm.unsafe_recovery_wait_apply_counter = None; } - fn on_unsafe_recovery_wait_apply(&mut self, counter: Arc) { - self.fsm.unsafe_recovery_target_commit_index = - Some(self.fsm.peer.raft_group.store().commit_index()); - self.fsm.unsafe_recovery_wait_apply_counter = Some(counter); - // If the applied index equals to the commit index, there is nothing to wait for, proceeds - // to the next step immediately. If they are not equal, further checks will be performed in - // on_apply_res(). - if self.fsm.stopped - || self.fsm.peer.raft_group.store().applied_index() - == self.fsm.peer.raft_group.store().commit_index() - { - self.finish_unsafe_recovery_wait_apply(); + fn on_unsafe_recovery_destroy(&mut self, syncer: UnsafeRecoveryExecutePlanSyncer) { + if self.fsm.peer.unsafe_recovery_state.is_some() { + warn!( + "Unsafe recovery, can't destroy, another plan is executing in progress"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + syncer.abort(); + return; } + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Destroy(syncer)); + self.handle_destroy_peer(DestroyPeerJob { + initialized: self.fsm.peer.is_initialized(), + region_id: self.region_id(), + peer: self.fsm.peer.peer.clone(), + }); + } + + fn on_unsafe_recovery_wait_apply(&mut self, syncer: UnsafeRecoveryWaitApplySyncer) { + if self.fsm.peer.unsafe_recovery_state.is_some() { + warn!( + "Unsafe recovery, can't wait apply, another plan is executing in progress"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + syncer.abort(); + return; + } + let target_index = if self.fsm.peer.force_leader.is_some() { + // For regions that lose quorum (or regions have force leader), whatever has been + // proposed will be committed. Based on that fact, we simply use "last index" here to + // avoid implementing another "wait commit" process. 
+ self.fsm.peer.raft_group.raft.raft_log.last_index() + } else { + self.fsm.peer.raft_group.raft.raft_log.committed + }; + + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { + target_index, + syncer, + }); + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ self.fsm.stopped); + } + + fn on_unsafe_recovery_fill_out_report(&mut self, syncer: UnsafeRecoveryFillOutReportSyncer) { + let mut self_report = pdpb::PeerReport::default(); + self_report.set_raft_state(self.fsm.peer.get_store().raft_state().clone()); + let mut region_local_state = RegionLocalState::default(); + region_local_state.set_region(self.region().clone()); + self_report.set_region_state(region_local_state); + self_report.set_is_force_leader(self.fsm.peer.force_leader.is_some()); + syncer.report_for_self(self_report); } fn on_casual_msg(&mut self, msg: CasualMessage) { @@ -1034,6 +992,10 @@ where CasualMessage::SnapshotApplied => { self.fsm.has_ready = true; } + CasualMessage::Campaign => { + let _ = self.fsm.peer.raft_group.campaign(); + self.fsm.has_ready = true; + } } } @@ -1269,6 +1231,383 @@ where SignificantMsg::RaftlogFetched { context, res } => { self.on_raft_log_fetched(context, res); } + SignificantMsg::EnterForceLeaderState { + syncer, + failed_stores, + } => { + self.on_enter_pre_force_leader(syncer, failed_stores); + } + SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(), + SignificantMsg::UnsafeRecoveryDemoteFailedVoters { + syncer, + failed_voters, + } => self.on_unsafe_recovery_pre_demote_failed_voters(syncer, failed_voters), + SignificantMsg::UnsafeRecoveryDestroy(syncer) => { + self.on_unsafe_recovery_destroy(syncer) + } + SignificantMsg::UnsafeRecoveryWaitApply(syncer) => { + self.on_unsafe_recovery_wait_apply(syncer) + } + SignificantMsg::UnsafeRecoveryFillOutReport(syncer) => { + self.on_unsafe_recovery_fill_out_report(syncer) + } + } + } + + fn on_enter_pre_force_leader( + &mut self, + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + ) { + match self.fsm.peer.force_leader { + Some(ForceLeaderState::PreForceLeader { .. }) => { + self.on_force_leader_fail(); + } + Some(ForceLeaderState::ForceLeader { .. }) => { + // already is a force leader, do nothing + return; + } + Some(ForceLeaderState::WaitTicks { .. }) => { + self.fsm.peer.force_leader = None; + } + None => {} + } + + if !self.fsm.peer.is_initialized() { + warn!( + "Unsafe recovery, cannot force leader since this peer is not initialized"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + + let ticks = if self.fsm.peer.is_leader() { + if self.fsm.hibernate_state.group_state() == GroupState::Ordered { + warn!( + "Unsafe recovery, reject pre force leader due to already being leader"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + // wait two rounds of election timeout to trigger check quorum to step down the leader + // note: check quorum is triggered every `election_timeout` instead of `randomized_election_timeout` + Some( + self.fsm.peer.raft_group.raft.election_timeout() * 2 + - self.fsm.peer.raft_group.raft.election_elapsed, + ) + // When election timeout is triggered, leader_id is set to INVALID_ID. + // But learner(not promotable) is a exception here as it wouldn't tick + // election. 
+ } else if self.fsm.peer.raft_group.raft.promotable() + && self.fsm.peer.leader_id() != raft::INVALID_ID + { + if self.fsm.hibernate_state.group_state() == GroupState::Ordered + || self.fsm.hibernate_state.group_state() == GroupState::Chaos + { + warn!( + "Unsafe recovery, reject pre force leader due to leader lease may not expired"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + // wait one round of election timeout to make sure leader_id is invalid + Some( + self.fsm.peer.raft_group.raft.randomized_election_timeout() + - self.fsm.peer.raft_group.raft.election_elapsed, + ) + } else { + None + }; + + if let Some(ticks) = ticks { + info!( + "Unsafe recovery, enter wait ticks"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "ticks" => ticks, + ); + self.fsm.peer.force_leader = Some(ForceLeaderState::WaitTicks { + syncer, + failed_stores, + ticks, + }); + self.reset_raft_tick(if self.fsm.peer.is_leader() { + GroupState::Ordered + } else { + GroupState::Chaos + }); + self.fsm.has_ready = true; + return; + } + + let expected_alive_voter = self.get_force_leader_expected_alive_voter(&failed_stores); + if !expected_alive_voter.is_empty() + && self + .fsm + .peer + .raft_group + .raft + .prs() + .has_quorum(&expected_alive_voter) + { + warn!( + "Unsafe recovery, reject pre force leader due to has quorum"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } + + info!( + "Unsafe recovery, enter pre force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "alive_voter" => ?expected_alive_voter, + ); + + // Do not use prevote as prevote won't set `vote` to itself. + // When PD issues force leader on two different peer, it may cause + // two force leader in same term. + self.fsm.peer.raft_group.raft.pre_vote = false; + // trigger vote request to all voters, will check the vote result in `check_force_leader` + if let Err(e) = self.fsm.peer.raft_group.campaign() { + warn!( + "Unsafe recovery, campaign failed"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => ?e, + ); + } + assert_eq!(self.fsm.peer.get_role(), StateRole::Candidate); + if !self + .fsm + .peer + .raft_group + .raft + .prs() + .votes() + .get(&self.fsm.peer.peer_id()) + .unwrap() + { + warn!( + "Unsafe recovery, pre force leader failed to campaign"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + self.on_force_leader_fail(); + return; + } + + self.fsm.peer.force_leader = Some(ForceLeaderState::PreForceLeader { + syncer, + failed_stores, + }); + self.fsm.has_ready = true; + } + + fn on_force_leader_fail(&mut self) { + self.fsm.peer.raft_group.raft.pre_vote = true; + self.fsm.peer.raft_group.raft.set_check_quorum(true); + self.fsm.peer.force_leader = None; + } + + fn on_enter_force_leader(&mut self) { + info!( + "Unsafe recovery, enter force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + assert_eq!(self.fsm.peer.get_role(), StateRole::Candidate); + + let failed_stores = match self.fsm.peer.force_leader.take() { + Some(ForceLeaderState::PreForceLeader { failed_stores, .. 
}) => failed_stores, + _ => unreachable!(), + }; + + let peer_ids: Vec<_> = self.fsm.peer.voters().iter().collect(); + for peer_id in peer_ids { + let store_id = self + .region() + .get_peers() + .iter() + .find(|p| p.get_id() == peer_id) + .unwrap() + .get_store_id(); + if !failed_stores.contains(&store_id) { + continue; + } + + // make fake vote response + let mut msg = raft::eraftpb::Message::new(); + msg.msg_type = MessageType::MsgRequestVoteResponse; + msg.reject = false; + msg.term = self.fsm.peer.term(); + msg.from = peer_id; + msg.to = self.fsm.peer.peer_id(); + self.fsm.peer.raft_group.step(msg).unwrap(); + } + + // after receiving all votes, should become leader + assert!(self.fsm.peer.is_leader()); + self.fsm.peer.raft_group.raft.set_check_quorum(false); + + // make sure it's not hibernated + self.reset_raft_tick(GroupState::Ordered); + + self.fsm.peer.force_leader = Some(ForceLeaderState::ForceLeader { + time: TiInstant::now_coarse(), + failed_stores, + }); + self.fsm.has_ready = true; + } + + fn on_exit_force_leader(&mut self) { + if self.fsm.peer.force_leader.is_none() { + return; + } + + info!( + "exit force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + self.fsm.peer.force_leader = None; + // make sure it's not hibernated + assert_eq!(self.fsm.hibernate_state.group_state(), GroupState::Ordered); + // leader lease shouldn't be renewed in force leader state. + assert_eq!( + self.fsm.peer.leader_lease().inspect(None), + LeaseState::Expired + ); + self.fsm + .peer + .raft_group + .raft + .become_follower(self.fsm.peer.term(), raft::INVALID_ID); + + self.fsm.peer.raft_group.raft.set_check_quorum(true); + self.fsm.peer.raft_group.raft.pre_vote = true; + if self.fsm.peer.raft_group.raft.promotable() { + // Do not campaign directly here, otherwise on_role_changed() won't called for follower state + let _ = self.ctx.router.send( + self.region_id(), + PeerMsg::CasualMessage(CasualMessage::Campaign), + ); + } + self.fsm.has_ready = true; + } + + #[inline] + fn get_force_leader_expected_alive_voter(&self, failed_stores: &HashSet) -> HashSet { + let region = self.region(); + self.fsm + .peer + .voters() + .iter() + .filter(|peer_id| { + let store_id = region + .get_peers() + .iter() + .find(|p| p.get_id() == *peer_id) + .unwrap() + .get_store_id(); + !failed_stores.contains(&store_id) + }) + .collect() + } + + #[inline] + fn check_force_leader(&mut self) { + if let Some(ForceLeaderState::WaitTicks { + syncer, + failed_stores, + ticks, + }) = &mut self.fsm.peer.force_leader + { + if *ticks == 0 { + let syncer_clone = syncer.clone(); + let s = mem::take(failed_stores); + self.on_enter_pre_force_leader(syncer_clone, s); + } else { + *ticks -= 1; + } + return; + }; + + let failed_stores = match &self.fsm.peer.force_leader { + None => return, + Some(ForceLeaderState::ForceLeader { .. }) => { + if self.fsm.peer.maybe_force_forward_commit_index() { + self.fsm.has_ready = true; + } + return; + } + Some(ForceLeaderState::PreForceLeader { failed_stores, .. }) => failed_stores, + Some(ForceLeaderState::WaitTicks { .. 
}) => unreachable!(), + }; + + if self.fsm.peer.raft_group.raft.election_elapsed + 1 + < self.ctx.cfg.raft_election_timeout_ticks + { + // wait as longer as it can to collect responses of request vote + return; + } + + let expected_alive_voter: HashSet<_> = + self.get_force_leader_expected_alive_voter(failed_stores); + let check = || { + if self.fsm.peer.raft_group.raft.state != StateRole::Candidate { + Err(format!( + "unexpected role {:?}", + self.fsm.peer.raft_group.raft.state + )) + } else { + let mut granted = 0; + for (id, vote) in self.fsm.peer.raft_group.raft.prs().votes() { + if expected_alive_voter.contains(id) { + if *vote { + granted += 1; + } else { + return Err(format!("receive reject response from {}", *id)); + } + } else if *id == self.fsm.peer_id() { + // self may be a learner + continue; + } else { + return Err(format!( + "receive unexpected vote from {} vote {}", + *id, *vote + )); + } + } + Ok(granted) + } + }; + + match check() { + Err(err) => { + warn!( + "Unsafe recovery, pre force leader check failed"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "alive_voter" => ?expected_alive_voter, + "reason" => err, + ); + self.on_force_leader_fail(); + } + Ok(granted) => { + info!( + "Unsafe recovery, expected live voters:"; + "voters" => ?expected_alive_voter, + "granted" => granted + ); + if granted == expected_alive_voter.len() { + self.on_enter_force_leader(); + } + } } } @@ -1382,6 +1721,19 @@ where self.register_raft_gc_log_tick(); self.register_check_leader_lease_tick(); } + + if let Some(ForceLeaderState::ForceLeader { .. }) = self.fsm.peer.force_leader { + if r != StateRole::Leader { + // for some reason, it's not leader anymore + info!( + "step down in force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "state" => ?r, + ); + self.on_force_leader_fail(); + } + } } } @@ -1512,6 +1864,8 @@ where self.fsm.peer.retry_pending_reads(&self.ctx.cfg); + self.check_force_leader(); + let mut res = None; if self.ctx.cfg.hibernate_regions { if self.fsm.hibernate_state.group_state() == GroupState::Idle { @@ -1519,14 +1873,14 @@ where // follower may tick more than an election timeout in chaos state. // Before stopping tick, `missing_tick` should be `raft_election_timeout_ticks` - 2 // - `raft_heartbeat_ticks` (default 10 - 2 - 2 = 6) - // and the follwer's `election_elapsed` in raft-rs is 1. + // and the follower's `election_elapsed` in raft-rs is 1. // After the group state becomes Chaos, the next tick will call `raft_group.tick` // `missing_tick` + 1 times(default 7). // Then the follower's `election_elapsed` will be 1 + `missing_tick` + 1 // (default 1 + 6 + 1 = 8) which is less than the min election timeout. // The reason is that we don't want let all followers become (pre)candidate if one // follower may receive a request, then becomes (pre)candidate and sends (pre)vote msg - // to others. As long as the leader can wake up and broadcast hearbeats in one `raft_heartbeat_ticks` + // to others. As long as the leader can wake up and broadcast heartbeats in one `raft_heartbeat_ticks` // time(default 2s), no more followers will wake up and sends vote msg again. if self.fsm.missing_ticks + 1 /* for the next tick after the peer isn't Idle */ + self.fsm.peer.raft_group.raft.election_elapsed @@ -1594,6 +1948,69 @@ where } } + fn check_unsafe_recovery_state(&mut self) { + match &self.fsm.peer.unsafe_recovery_state { + Some(UnsafeRecoveryState::WaitApply { .. 
}) => self + .fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false), + Some(UnsafeRecoveryState::DemoteFailedVoters { + syncer, + failed_voters, + target_index, + demote_after_exit, + }) => { + if self.fsm.peer.raft_group.raft.raft_log.applied >= *target_index { + if *demote_after_exit { + if !self.fsm.peer.is_force_leader() { + error!( + "Unsafe recovery, lost forced leadership after exiting joint state"; + "region_id" => self.region().get_id(), + ); + return; + } + let syncer_clone = syncer.clone(); + let failed_voters_clone = failed_voters.clone(); + self.unsafe_recovery_demote_failed_voters( + syncer_clone, + failed_voters_clone, + ); + } else { + if self.fsm.peer.in_joint_state() { + info!( + "Unsafe recovery, exiting joint state"; + "region_id" => self.region().get_id() + ); + if self.fsm.peer.is_force_leader() { + self.propose_raft_command_internal( + exit_joint_request(self.region(), &self.fsm.peer.peer), + Callback::::write(Box::new(|resp| { + if resp.response.get_header().has_error() { + error!( + "Unsafe recovery, fail to exit joint state"; + "err" => ?resp.response.get_header().get_error(), + ); + } + })), + DiskFullOpt::AllowedOnAlmostFull, + ); + } else { + error!( + "Unsafe recovery, lost forced leadership while trying to exit joint state"; + "region_id" => self.region().get_id(), + ); + } + } + + self.fsm.peer.unsafe_recovery_state = None; + } + } + } + // Destroy does not need be processed, the state is cleaned up together with peer. + Some(_) | None => {} + } + } + fn on_apply_res(&mut self, res: ApplyTaskRes) { fail_point!("on_apply_res", |_| {}); match res { @@ -1658,11 +2075,8 @@ where } } } - // After a log has been applied, check if we need to trigger the unsafe recovery reporting procedure. - if let Some(target_commit_index) = self.fsm.unsafe_recovery_target_commit_index { - if self.fsm.peer.raft_group.store().applied_index() >= target_commit_index { - self.finish_unsafe_recovery_wait_apply(); - } + if self.fsm.peer.unsafe_recovery_state.is_some() { + self.check_unsafe_recovery_state(); } } @@ -1847,9 +2261,7 @@ where Either::Right(v) => v, }; - if util::is_vote_msg(msg.get_message()) - || msg.get_message().get_msg_type() == MessageType::MsgTimeoutNow - { + if util::is_vote_msg(msg.get_message()) || msg_type == MessageType::MsgTimeoutNow { if self.fsm.hibernate_state.group_state() != GroupState::Chaos { self.fsm.reset_hibernate_state(GroupState::Chaos); self.register_raft_base_tick(); @@ -1861,7 +2273,7 @@ where let from_peer_id = msg.get_from_peer().get_id(); self.fsm.peer.insert_peer_cache(msg.take_from_peer()); - let result = if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { + let result = if msg_type == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(msg.get_message(), peer_disk_usage); Ok(()) } else { @@ -2747,8 +3159,10 @@ where assert!(!self.fsm.peer.is_handling_snapshot()); // No need to wait for the apply anymore. - if self.fsm.unsafe_recovery_target_commit_index.is_some() { - self.finish_unsafe_recovery_wait_apply(); + if self.fsm.peer.unsafe_recovery_state.is_some() { + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ true); } { @@ -3011,7 +3425,7 @@ where // until new leader elected, but we can't revert this operation // because its result is already persisted in apply worker // TODO: should we transfer leader here? 
- let demote_self = is_learner(&self.fsm.peer.peer); + let demote_self = is_learner(&self.fsm.peer.peer) && !self.fsm.peer.is_force_leader(); if remove_self || demote_self { warn!( "Removing or demoting leader"; @@ -4161,6 +4575,16 @@ where let leader_id = self.fsm.peer.leader_id(); let request = msg.get_requests(); + if self.fsm.peer.force_leader.is_some() { + // in force leader state, forbid requests to make the recovery progress less error-prone + if !(msg.has_admin_request() + && (msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeer + || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2)) + { + return Err(Error::RecoveryInProgress(self.region_id())); + } + } + // ReadIndex can be processed on the replicas. let is_read_index_request = request.len() == 1 && request[0].get_cmd_type() == CmdType::ReadIndex; @@ -4839,6 +5263,18 @@ where return; } + if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { + // If the force leader state lasts a long time, it probably means PD recovery process aborts for some reasons. + // So just exit to avoid blocking the read and write requests for this peer. + if time.saturating_elapsed() > self.ctx.cfg.peer_stale_state_check_interval.0 { + warn!( + "Unsafe recovery, step down as force leader due to holding it too long"; + "duration" => ?time.saturating_elapsed(), + ); + self.on_exit_force_leader(); + } + } + if self.ctx.cfg.hibernate_regions { let group_state = self.fsm.hibernate_state.group_state(); if group_state == GroupState::Idle { @@ -5222,6 +5658,57 @@ fn new_compact_log_request( request } +fn demote_failed_voters_request( + region: &metapb::Region, + peer: &metapb::Peer, + failed_voters: Vec, +) -> Option { + let failed_voter_ids = HashSet::from_iter(failed_voters.iter().map(|voter| voter.get_id())); + let mut req = new_admin_request(region.get_id(), peer.clone()); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + let mut change_peer_reqs: Vec = region + .get_peers() + .iter() + .filter_map(|peer| { + if failed_voter_ids.contains(&peer.get_id()) + && peer.get_role() == metapb::PeerRole::Voter + { + let mut peer_clone = peer.clone(); + peer_clone.set_role(metapb::PeerRole::Learner); + let mut cp = pdpb::ChangePeer::default(); + cp.set_change_type(ConfChangeType::AddLearnerNode); + cp.set_peer(peer_clone); + return Some(cp); + } + None + }) + .collect(); + + // Promote self if it is a learner. 
+ if peer.get_role() == metapb::PeerRole::Learner { + let mut cp = pdpb::ChangePeer::default(); + cp.set_change_type(ConfChangeType::AddNode); + let mut promote = peer.clone(); + promote.set_role(metapb::PeerRole::Voter); + cp.set_peer(promote); + change_peer_reqs.push(cp); + } + if change_peer_reqs.is_empty() { + return None; + } + req.set_admin_request(new_change_peer_v2_request(change_peer_reqs)); + Some(req) +} + +fn exit_joint_request(region: &metapb::Region, peer: &metapb::Peer) -> RaftCmdRequest { + let mut req = new_admin_request(region.get_id(), peer.clone()); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.set_admin_request(new_change_peer_v2_request(vec![])); + req +} + impl<'a, EK, ER, T: Transport> PeerFsmDelegate<'a, EK, ER, T> where EK: KvEngine, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3aee7df4259..c3836807647 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -16,7 +16,10 @@ use batch_system::{ HandlerBuilder, PollHandler, Priority, }; use crossbeam::channel::{unbounded, Sender, TryRecvError, TrySendError}; -use engine_traits::{Engines, KvEngine, Mutable, PerfContextKind, WriteBatch}; +use engine_traits::{ + CompactedEvent, DeleteStrategy, Engines, KvEngine, Mutable, PerfContextKind, RaftEngine, + RaftLogBatch, Range, WriteBatch, WriteOptions, +}; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use fail::fail_point; use futures::compat::Future01CompatExt; @@ -24,8 +27,7 @@ use futures::FutureExt; use kvproto::import_sstpb::SstMeta; use kvproto::import_sstpb::SwitchMode; use kvproto::metapb::{self, Region, RegionEpoch}; -use kvproto::pdpb::QueryStats; -use kvproto::pdpb::StoreStats; +use kvproto::pdpb::{self, QueryStats, StoreStats}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest}; use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}; use kvproto::replication_modepb::{ReplicationMode, ReplicationStatus}; @@ -34,8 +36,6 @@ use raft::StateRole; use time::{self, Timespec}; use collections::HashMap; -use engine_traits::CompactedEvent; -use engine_traits::{RaftEngine, RaftLogBatch, WriteOptions}; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use pd_client::{FeatureGate, PdClient}; use sst_importer::SSTImporter; @@ -621,7 +621,11 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> inspector.record_store_wait(send_time.saturating_elapsed()); self.ctx.pending_latency_inspect.push(inspector); } - StoreMsg::CreatePeer(region) => self.on_create_peer(region), + StoreMsg::UnsafeRecoveryReport(report) => self.store_heartbeat_pd(Some(report)), + StoreMsg::UnsafeRecoveryCreatePeer { syncer, create } => { + self.on_unsafe_recovery_create_peer(create); + drop(syncer); + } } } } @@ -2157,7 +2161,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - fn store_heartbeat_pd(&mut self) { + fn store_heartbeat_pd(&mut self, report: Option) { let mut stats = StoreStats::default(); stats.set_store_id(self.ctx.store_id()); @@ -2235,7 +2239,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let task = PdTask::StoreHeartbeat { stats, store_info, - send_detailed_report: false, + report, dr_autosync_status: self .ctx .global_replication_state @@ -2252,7 +2256,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } fn 
on_pd_store_heartbeat_tick(&mut self) { - self.store_heartbeat_pd(); + self.store_heartbeat_pd(None); self.register_pd_store_heartbeat_tick(); } @@ -2639,39 +2643,35 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.router.report_status_update() } - fn on_create_peer(&self, region: Region) { - info!("creating a peer"; "peer" => ?region); - let mut kv_wb = self.ctx.engines.kv.write_batch(); - let region_state_key = keys::region_state_key(region.get_id()); - match self - .ctx - .engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) + fn on_unsafe_recovery_create_peer(&self, region: Region) { + info!("Unsafe recovery, creating a peer"; "peer" => ?region); + let mut meta = self.ctx.store_meta.lock().unwrap(); + if let Some((_, id)) = meta + .region_ranges + .range(( + Excluded(data_key(region.get_start_key())), + Unbounded::>, + )) + .next() { - Ok(Some(region_state)) => { - info!( - "target region already exists, existing region: {:?}, want to create: {:?}", - region_state, region - ); - return; - } - Ok(None) => {} - Err(e) => { - panic!("cannot determine whether {:?} exists, err {:?}", region, e) + let exist_region = &meta.regions[id]; + if enc_start_key(exist_region) < data_end_key(region.get_end_key()) { + if exist_region.get_id() == region.get_id() { + warn!( + "Unsafe recovery, region has already been created."; + "region" => ?region, + "exist_region" => ?exist_region, + ); + return; + } else { + error!( + "Unsafe recovery, region to be created overlaps with an existing region"; + "region" => ?region, + "exist_region" => ?exist_region, + ); + return; + } } - }; - peer_storage::write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None) - .unwrap_or_else(|e| { - panic!( - "fail to add peer state into write batch while creating {:?} err {:?}", - region, e - ) - }); - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - if let Err(e) = kv_wb.write_opt(&write_opts) { - panic!("fail to write while creating {:?} err {:?}", region, e); } let (sender, mut peer) = match PeerFsm::create( self.ctx.store.get_id(), @@ -2683,33 +2683,29 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER ) { Ok((sender, peer)) => (sender, peer), Err(e) => { - panic!( - "fail to create peer fsm while creating {:?} err {:?}", - region, e + error!( + "Unsafe recovery, fail to create peer fsm"; + "region" => ?region, + "err" => ?e, ); + return; } }; let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); peer.peer.init_replication_mode(&mut *replication_state); + drop(replication_state); peer.peer.activate(self.ctx); - let mut meta = self.ctx.store_meta.lock().unwrap(); - for (_, id) in meta.region_ranges.range(( - Excluded(data_key(region.get_start_key())), - Unbounded::>, - )) { - let exist_region = &meta.regions[id]; - if enc_start_key(exist_region) >= data_end_key(region.get_end_key()) { - break; - } - panic!( - "{:?} is overlapped with an existing region {:?}", - region, exist_region - ); - } + + let start_key = keys::enc_start_key(®ion); + let end_key = keys::enc_end_key(®ion); if meta - .region_ranges - .insert(enc_end_key(®ion), region.get_id()) + .regions + .insert(region.get_id(), region.clone()) .is_some() + || meta + .region_ranges + .insert(end_key.clone(), region.get_id()) + .is_some() || meta .readers .insert(region.get_id(), ReadDelegate::from_peer(peer.get_peer())) @@ -2720,10 +2716,38 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .is_some() { 
panic!( - "key conflicts while insert region {:?} into store meta", - region + "Unsafe recovery, key conflicts while inserting region {:?} into store meta", + region, ); } + drop(meta); + + if let Err(e) = self.ctx.engines.kv.delete_all_in_range( + DeleteStrategy::DeleteByKey, + &[Range::new(&start_key, &end_key)], + ) { + panic!( + "Unsafe recovery, fail to clean up stale data while creating the new region {:?}, the error is {:?}", + region, e, + ); + } + let mut kv_wb = self.ctx.engines.kv.write_batch(); + if let Err(e) = peer_storage::write_peer_state(&mut kv_wb, ®ion, PeerState::Normal, None) + { + panic!( + "Unsafe recovery, fail to add peer state for {:?} into write batch, the error is {:?}", + region, e, + ); + } + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + if let Err(e) = kv_wb.write_opt(&write_opts) { + panic!( + "Unsafe recovery, fail to write to disk while creating peer {:?}, the error is {:?}", + region, e, + ); + } + let mailbox = BasicMailbox::new(sender, peer, self.ctx.router.state_cnt().clone()); self.ctx.router.register(region.get_id(), mailbox); self.ctx diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 35e7843a0a4..7efd4e4882c 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -3,14 +3,12 @@ // #[PerformanceCriticalPath] use std::borrow::Cow; use std::fmt; -use std::sync::atomic::AtomicUsize; -use std::sync::Arc; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use kvproto::kvrpcpb::ExtraOp as TxnExtraOp; use kvproto::metapb; use kvproto::metapb::RegionEpoch; -use kvproto::pdpb::CheckPolicy; +use kvproto::pdpb::{self, CheckPolicy}; use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}; use kvproto::raft_serverpb::RaftMessage; use kvproto::replication_modepb::ReplicationStatus; @@ -18,15 +16,19 @@ use kvproto::{import_sstpb::SstMeta, kvrpcpb::DiskFullOpt}; use raft::{GetEntriesContext, SnapshotStatus}; use smallvec::{smallvec, SmallVec}; +use super::{AbstractPeer, RegionSnapshot}; use crate::store::fsm::apply::TaskRes as ApplyTaskRes; use crate::store::fsm::apply::{CatchUpLogs, ChangeObserver}; use crate::store::metrics::RaftEventDurationType; +use crate::store::peer::{ + UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, +}; use crate::store::util::{KeysInfoFormatter, LatencyInspector}; use crate::store::{RaftlogFetchResult, SnapKey}; +use collections::HashSet; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; -use super::{AbstractPeer, RegionSnapshot}; - #[derive(Debug)] pub struct ReadResponse { pub response: RaftCmdResponse, @@ -313,6 +315,18 @@ where context: GetEntriesContext, res: Box, }, + EnterForceLeaderState { + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + }, + ExitForceLeaderState, + UnsafeRecoveryDemoteFailedVoters { + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + }, + UnsafeRecoveryDestroy(UnsafeRecoveryExecutePlanSyncer), + UnsafeRecoveryWaitApply(UnsafeRecoveryWaitApplySyncer), + UnsafeRecoveryFillOutReport(UnsafeRecoveryFillOutReportSyncer), } /// Message that will be sent to a peer. 
@@ -392,6 +406,9 @@ pub enum CasualMessage { // Snapshot is applied SnapshotApplied, + + // Trigger raft to campaign which is used after exiting force leader + Campaign, } impl fmt::Debug for CasualMessage { @@ -450,6 +467,7 @@ impl fmt::Debug for CasualMessage { CasualMessage::RefreshRegionBuckets { .. } => write!(fmt, "RefreshRegionBuckets"), CasualMessage::RenewLease => write!(fmt, "RenewLease"), CasualMessage::SnapshotApplied => write!(fmt, "SnapshotApplied"), + CasualMessage::Campaign => write!(fmt, "Campaign"), } } } @@ -539,8 +557,6 @@ pub enum PeerMsg { /// Asks region to change replication mode. UpdateReplicationMode, Destroy(u64), - UpdateRegionForUnsafeRecover(metapb::Region), - UnsafeRecoveryWaitApply(Arc), } impl fmt::Debug for PeerMsg { @@ -569,10 +585,6 @@ impl fmt::Debug for PeerMsg { PeerMsg::HeartbeatPd => write!(fmt, "HeartbeatPd"), PeerMsg::UpdateReplicationMode => write!(fmt, "UpdateReplicationMode"), PeerMsg::Destroy(peer_id) => write!(fmt, "Destroy {}", peer_id), - PeerMsg::UpdateRegionForUnsafeRecover(region) => { - write!(fmt, "Update Region {} to {:?}", region.get_id(), region) - } - PeerMsg::UnsafeRecoveryWaitApply(counter) => write!(fmt, "WaitApply {:?}", *counter), } } } @@ -617,7 +629,11 @@ where #[cfg(any(test, feature = "testexport"))] Validate(Box), - CreatePeer(metapb::Region), + UnsafeRecoveryReport(pdpb::StoreReport), + UnsafeRecoveryCreatePeer { + syncer: UnsafeRecoveryExecutePlanSyncer, + create: metapb::Region, + }, } impl fmt::Debug for StoreMsg @@ -646,7 +662,10 @@ where StoreMsg::Validate(_) => write!(fmt, "Validate config"), StoreMsg::UpdateReplicationMode(_) => write!(fmt, "UpdateReplicationMode"), StoreMsg::LatencyInspect { .. } => write!(fmt, "LatencyInspect"), - StoreMsg::CreatePeer(_) => write!(fmt, "CreatePeer"), + StoreMsg::UnsafeRecoveryReport(..) => write!(fmt, "UnsafeRecoveryReport"), + StoreMsg::UnsafeRecoveryCreatePeer { .. 
} => { + write!(fmt, "UnsafeRecoveryCreatePeer") + } } } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9c3520ba330..d9c635a97db 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -6,7 +6,7 @@ use std::collections::VecDeque; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; -use std::{cmp, mem, u64, usize}; +use std::{cmp, fmt, mem, u64, usize}; use bitflags::bitflags; use bytes::Bytes; @@ -17,10 +17,11 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; +use getset::Getters; use kvproto::errorpb; use kvproto::kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp, LockInfo}; use kvproto::metapb::{self, PeerRole}; -use kvproto::pdpb::PeerStats; +use kvproto::pdpb::{self, PeerStats}; use kvproto::raft_cmdpb::{ self, AdminCmdType, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, PutRequest, RaftCmdRequest, RaftCmdResponse, Request, TransferLeaderRequest, TransferLeaderResponse, @@ -32,7 +33,7 @@ use kvproto::replication_modepb::{ DrAutoSyncState, RegionReplicationState, RegionReplicationStatus, ReplicationMode, }; use parking_lot::RwLockUpgradableReadGuard; -use protobuf::Message; +use protobuf::{Message, RepeatedField}; use raft::eraftpb::{self, ConfChangeType, Entry, EntryType, MessageType}; use raft::{ self, Changer, GetEntriesContext, LightReady, ProgressState, ProgressTracker, RawNode, Ready, @@ -49,11 +50,11 @@ use crate::errors::RAFTSTORE_IS_BUSY; use crate::store::async_io::write::WriteMsg; use crate::store::async_io::write_router::WriteRouter; use crate::store::fsm::apply::CatchUpLogs; -use crate::store::fsm::store::PollContext; +use crate::store::fsm::store::{PollContext, RaftRouter}; use crate::store::fsm::{apply, Apply, ApplyMetrics, ApplyTask, Proposal}; use crate::store::hibernate_state::GroupState; use crate::store::memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}; -use crate::store::msg::RaftCommand; +use crate::store::msg::{PeerMsg, RaftCommand, SignificantMsg, StoreMsg}; use crate::store::txn_ext::LocksStatus; use crate::store::util::{admin_cmd_epoch_lookup, RegionReadProgress}; use crate::store::worker::{ @@ -454,6 +455,226 @@ pub struct ReadyResult { pub has_write_ready: bool, } +#[derive(Debug)] +/// ForceLeader process would be: +/// 1. If it's hibernated, enter wait ticks state, and wake up the peer +/// 2. Enter pre force leader state, become candidate and send request vote to all peers +/// 3. Wait for the responses of the request vote, no reject should be received. +/// 4. Enter force leader state, become leader without leader lease +/// 5. Execute recovery plan(some remove-peer commands) +/// 6. After the plan steps are all applied, exit force leader state +pub enum ForceLeaderState { + WaitTicks { + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + ticks: usize, + }, + PreForceLeader { + syncer: UnsafeRecoveryForceLeaderSyncer, + failed_stores: HashSet, + }, + ForceLeader { + time: TiInstant, + failed_stores: HashSet, + }, +} + +// Following shared states are used while reporting to PD for unsafe recovery and shared among +// all the regions per their life cycle. +// The work flow is like: +// 1. 
report phase +// start_unsafe_recovery_report +// -> broadcast wait-apply commands +// -> wait for all the peers' apply indices meet their targets +// -> broadcast fill out report commands +// -> wait for all the peers fill out the reports for themselves +// -> send a store report (through store heartbeat) +// 2. force leader phase +// dispatch force leader commands +// -> wait for all the peers that received the command become force leader +// -> start_unsafe_recovery_report +// 3. plan execution phase +// dispatch recovery plans +// -> wait for all the creates, deletes and demotes to finish, for the demotes, +// procedures are: +// -> exit joint state if it is already in joint state +// -> demote failed voters, and promote self to be a voter if it is a learner +// -> exit joint state +// -> start_unsafe_recovery_report + +// Intends to use RAII to sync unsafe recovery procedures between peers, in addition to that, +// it uses a closure to avoid having a raft router as a member variable, which is statically +// dispatched, thus needs to propagate the generics everywhere. +pub struct InvokeClosureOnDrop(Box); + +impl fmt::Debug for InvokeClosureOnDrop { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "InvokeClosureOnDrop") + } +} + +impl Drop for InvokeClosureOnDrop { + fn drop(&mut self) { + self.0(); + } +} + +pub fn start_unsafe_recovery_report( + router: &RaftRouter, + report_id: u64, + exit_force_leader_before_reporting: bool, +) { + if exit_force_leader_before_reporting { + router.broadcast_normal(|| PeerMsg::SignificantMsg(SignificantMsg::ExitForceLeaderState)); + } + let wait_apply = UnsafeRecoveryWaitApplySyncer::new(report_id, router.clone()); + router.broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::UnsafeRecoveryWaitApply(wait_apply.clone())) + }); +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryForceLeaderSyncer(Arc); + +impl UnsafeRecoveryForceLeaderSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let inner = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, force leader finished."); + let router_ptr = thread_safe_router.lock().unwrap(); + start_unsafe_recovery_report(&*router_ptr, report_id, false); + })); + UnsafeRecoveryForceLeaderSyncer(Arc::new(inner)) + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryExecutePlanSyncer { + _closure: Arc, + abort: Arc>, +} + +impl UnsafeRecoveryExecutePlanSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let abort = Arc::new(Mutex::new(false)); + let abort_clone = abort.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, plan execution finished"); + if *abort_clone.lock().unwrap() { + warn!("Unsafe recovery, plan execution aborted"); + return; + } + let router_ptr = thread_safe_router.lock().unwrap(); + start_unsafe_recovery_report(&*router_ptr, report_id, true); + })); + UnsafeRecoveryExecutePlanSyncer { + _closure: Arc::new(closure), + abort, + } + } + + pub fn abort(&self) { + *self.abort.lock().unwrap() = true; + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryWaitApplySyncer { + _closure: Arc, + abort: Arc>, +} + +impl UnsafeRecoveryWaitApplySyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let abort = Arc::new(Mutex::new(false)); + let abort_clone = abort.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || 
{ + info!("Unsafe recovery, wait apply finished"); + if *abort_clone.lock().unwrap() { + warn!("Unsafe recovery, wait apply aborted"); + return; + } + let router_ptr = thread_safe_router.lock().unwrap(); + let fill_out_report = + UnsafeRecoveryFillOutReportSyncer::new(report_id, (*router_ptr).clone()); + (*router_ptr).broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::UnsafeRecoveryFillOutReport( + fill_out_report.clone(), + )) + }); + })); + UnsafeRecoveryWaitApplySyncer { + _closure: Arc::new(closure), + abort, + } + } + + pub fn abort(&self) { + *self.abort.lock().unwrap() = true; + } +} + +#[derive(Clone, Debug)] +pub struct UnsafeRecoveryFillOutReportSyncer { + _closure: Arc, + reports: Arc>>, +} + +impl UnsafeRecoveryFillOutReportSyncer { + pub fn new(report_id: u64, router: RaftRouter) -> Self { + let thread_safe_router = Mutex::new(router); + let reports = Arc::new(Mutex::new(vec![])); + let reports_clone = reports.clone(); + let closure = InvokeClosureOnDrop(Box::new(move || { + info!("Unsafe recovery, peer reports collected"); + let mut store_report = pdpb::StoreReport::default(); + { + let reports_ptr = reports_clone.lock().unwrap(); + store_report.set_peer_reports(RepeatedField::from_vec((*reports_ptr).to_vec())); + } + store_report.set_step(report_id); + let router_ptr = thread_safe_router.lock().unwrap(); + if let Err(e) = (*router_ptr).send_control(StoreMsg::UnsafeRecoveryReport(store_report)) + { + error!("Unsafe recovery, fail to schedule reporting"; "err" => ?e); + } + })); + UnsafeRecoveryFillOutReportSyncer { + _closure: Arc::new(closure), + reports, + } + } + + pub fn report_for_self(&self, report: pdpb::PeerReport) { + let mut reports_ptr = self.reports.lock().unwrap(); + (*reports_ptr).push(report); + } +} + +pub enum UnsafeRecoveryState { + // Stores the state that is necessary for the wait apply stage of the unsafe recovery process. + // This state is set by the peer fsm. Once set, it is checked every time this peer applies a + // new entry or a snapshot; if the target index is met, this state is reset / dropped. The + // syncer holds a reference counted inner object that is shared among all the peers, whose + // destructor triggers the next step of the unsafe recovery report process. + WaitApply { + target_index: u64, + syncer: UnsafeRecoveryWaitApplySyncer, + }, + DemoteFailedVoters { + syncer: UnsafeRecoveryExecutePlanSyncer, + failed_voters: Vec, + target_index: u64, + // Failed regions may be stuck in joint state; if that is the case, we need to ask the + // region to exit joint state before proposing the demotion. + demote_after_exit: bool, + }, + Destroy(UnsafeRecoveryExecutePlanSyncer), +} + +#[derive(Getters)] pub struct Peer where EK: KvEngine, @@ -478,6 +699,7 @@ where proposals: ProposalQueue, leader_missing_time: Option, + #[getset(get = "pub")] leader_lease: Lease, pending_reads: ReadIndexQueue, @@ -491,6 +713,15 @@ where /// 2. all read requests must be rejected. pub pending_remove: bool, + /// Force leader state is only used in online recovery when the majority of + /// peers are missing. In this state, it forces one peer to become leader out + /// of accordance with the Raft election rules, and forbids any read/write proposals. + /// With that, we can further propose a conf-change that removes failed nodes, to make + /// the Raft group form a majority and work normally later on. + /// + /// For details, see the comment of `ForceLeaderState`. + pub force_leader: Option, + /// Record the instants of peers being added into the configuration. 
/// Remove them after they are not pending any more. pub peers_start_pending_time: Vec<(u64, Instant)>, @@ -610,6 +841,7 @@ where pub region_buckets: Option, /// lead_transferee if the peer is in a leadership transferring. pub lead_transferee: u64, + pub unsafe_recovery_state: Option, } impl Peer @@ -682,6 +914,7 @@ where leader_unreachable: false, pending_remove: false, should_wake_up: false, + force_leader: None, pending_merge_state: None, want_rollback_merge_peers: HashSet::default(), pending_request_snapshot_count: Arc::new(AtomicUsize::new(0)), @@ -736,6 +969,7 @@ where apply_snap_ctx: None, region_buckets: None, lead_transferee: raft::INVALID_ID, + unsafe_recovery_state: None, }; // If this region has only one peer and I am the one, campaign directly. @@ -1079,6 +1313,10 @@ where res.reason = "transfer leader"; return res; } + if self.force_leader.is_some() { + res.reason = "force leader"; + return res; + } // Unapplied entries can change the configuration of the group. if self.get_store().applied_index() < last_index { res.reason = "unapplied"; @@ -1605,6 +1843,39 @@ where false } + pub fn maybe_force_forward_commit_index(&mut self) -> bool { + let failed_stores = match &self.force_leader { + Some(ForceLeaderState::ForceLeader { failed_stores, .. }) => failed_stores, + _ => unreachable!(), + }; + + let region = self.region(); + let mut replicated_idx = self.raft_group.raft.raft_log.persisted; + for (peer_id, p) in self.raft_group.raft.prs().iter() { + let store_id = region + .get_peers() + .iter() + .find(|p| p.get_id() == *peer_id) + .unwrap() + .get_store_id(); + if failed_stores.contains(&store_id) { + continue; + } + if replicated_idx > p.matched { + replicated_idx = p.matched; + } + } + + if self.raft_group.store().term(replicated_idx).unwrap_or(0) < self.term() { + // do not commit logs of previous term directly + return false; + } + + self.raft_group.raft.raft_log.committed = + std::cmp::max(self.raft_group.raft.raft_log.committed, replicated_idx); + true + } + pub fn check_stale_state(&mut self, ctx: &mut PollContext) -> StaleState { if self.is_leader() { // Leaders always have valid state. @@ -1950,6 +2221,11 @@ where self.term(), self.raft_group.store().region(), ); + + if self.unsafe_recovery_state.is_some() { + debug!("unsafe recovery finishes applying a snapshot"); + self.unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false); + } } // If `apply_snap_ctx` is none, it means this snapshot does not // come from the ready but comes from the unfinished snapshot task @@ -2495,6 +2771,11 @@ where let persist_index = self.raft_group.raft.raft_log.persisted; self.mut_store().update_cache_persisted(persist_index); + + if let Some(ForceLeaderState::ForceLeader { .. }) = self.force_leader { + // forward commit index, the committed entries will be applied in the next raft base tick round + self.maybe_force_forward_commit_index(); + } } if self.apply_snap_ctx.is_some() && self.unpersisted_readies.is_empty() { @@ -2533,6 +2814,10 @@ where self.report_commit_log_duration(pre_commit_index, &ctx.raft_metrics); let persist_index = self.raft_group.raft.raft_log.persisted; + if let Some(ForceLeaderState::ForceLeader { .. 
}) = self.force_leader { + // forward commit index, the committed entries will be applied in the next raft base tick round + self.maybe_force_forward_commit_index(); + } self.mut_store().update_cache_persisted(persist_index); self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics.ready); @@ -2841,6 +3126,13 @@ where "peer_id" => self.peer.get_id(), ); None + } else if self.force_leader.is_some() { + debug!( + "prevent renewing lease while in force leader state"; + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + ); + None } else { self.leader_lease.renew(ts); let term = self.term(); @@ -3080,7 +3372,7 @@ where let kind = ConfChangeKind::confchange_kind(change_peers.len()); if kind == ConfChangeKind::LeaveJoint { - if self.peer.get_role() == PeerRole::DemotingVoter { + if self.peer.get_role() == PeerRole::DemotingVoter && !self.is_force_leader() { return Err(box_err!( "{} ignore leave joint command that demoting leader", self.tag @@ -3153,7 +3445,7 @@ where let promoted_commit_index = after_progress.maximal_committed_index().0; if current_progress.is_singleton() // It's always safe if there is only one node in the cluster. - || promoted_commit_index >= self.get_store().truncated_index() + || promoted_commit_index >= self.get_store().truncated_index() || self.force_leader.is_some() { return Ok(()); } @@ -3778,6 +4070,16 @@ where poll_ctx: &mut PollContext, mut req: RaftCmdRequest, ) -> Result> { + // Should not propose normal requests in force leader state. + // In `pre_propose_raft_command`, it rejects all the requests except conf-change + // if in force leader state. + if self.force_leader.is_some() { + panic!( + "{} propose normal in force leader state {:?}", + self.tag, self.force_leader + ); + }; + if (self.pending_merge_state.is_some() && req.get_admin_request().get_cmd_type() != AdminCmdType::RollbackMerge) || (self.prepare_merge_fence > 0 @@ -4139,6 +4441,10 @@ where resp } + pub fn voters(&self) -> raft::util::Union<'_> { + self.raft_group.raft.prs().conf().voters().ids() + } + pub fn term(&self) -> u64 { self.raft_group.raft.term } @@ -4370,6 +4676,34 @@ where .propose_check_epoch(cmd, self.term()) .is_none() } + + #[inline] + pub fn is_force_leader(&self) -> bool { + matches!( + self.force_leader, + Some(ForceLeaderState::ForceLeader { .. }) + ) + } + + pub fn unsafe_recovery_maybe_finish_wait_apply(&mut self, force: bool) { + if let Some(UnsafeRecoveryState::WaitApply { target_index, .. 
}) = + &self.unsafe_recovery_state + { + if self.raft_group.raft.raft_log.applied >= *target_index || force { + if self.is_force_leader() { + info!( + "Unsafe recovery, finish wait apply"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "target_index" => target_index, + "applied" => self.raft_group.raft.raft_log.applied, + "force" => force, + ); + } + self.unsafe_recovery_state = None; + } + } + } } #[derive(Default, Debug)] diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index d056e0d5497..d8640fc979b 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1093,6 +1093,11 @@ where self.last_term } + #[inline] + pub fn raft_state(&self) -> &RaftLocalState { + &self.raft_state + } + #[inline] pub fn applied_index(&self) -> u64 { self.apply_state.get_applied_index() diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 9043982901c..2d0f5dd66c0 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -780,10 +780,6 @@ impl< } } -pub fn integration_on_half_fail_quorum_fn(voters: usize) -> usize { - (voters + 1) / 2 + 1 -} - #[derive(PartialEq, Eq, Debug)] pub enum ConfChangeKind { // Only contains one configuration change @@ -1935,16 +1931,6 @@ mod tests { } } - #[test] - fn test_integration_on_half_fail_quorum_fn() { - let voters = vec![1, 2, 3, 4, 5, 6, 7]; - let quorum = vec![2, 2, 3, 3, 4, 4, 5]; - for (voter_count, expected_quorum) in voters.into_iter().zip(quorum) { - let quorum = super::integration_on_half_fail_quorum_fn(voter_count); - assert_eq!(quorum, expected_quorum); - } - } - #[test] fn test_is_region_initialized() { let mut region = metapb::Region::default(); diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 9c46daf201e..1f2d357b80c 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -23,7 +23,8 @@ pub use self::cleanup_sst::{Runner as CleanupSSTRunner, Task as CleanupSSTTask}; pub use self::compact::{Runner as CompactRunner, Task as CompactTask}; pub use self::consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}; pub use self::pd::{ - FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, Task as PdTask, + new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, + Runner as PdRunner, Task as PdTask, }; pub use self::query_stats::QueryStats; pub use self::raftlog_fetch::{Runner as RaftlogFetchRunner, Task as RaftlogFetchTask}; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index bdca2d161f5..4652a35876a 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -3,12 +3,12 @@ use std::cmp::Ordering as CmpOrdering; use std::fmt::{self, Display, Formatter}; use std::sync::mpsc::{self, Receiver, Sender}; -use std::sync::{atomic::AtomicUsize, atomic::Ordering, Arc}; +use std::sync::{atomic::Ordering, Arc}; use std::thread::{Builder, JoinHandle}; use std::time::{Duration, Instant}; use std::{cmp, io}; -use engine_traits::{KvEngine, RaftEngine, CF_RAFT}; +use engine_traits::{KvEngine, RaftEngine}; #[cfg(feature = "failpoints")] use fail::fail_point; use kvproto::kvrpcpb::DiskFullOpt; @@ -16,7 +16,7 @@ use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, 
ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, SplitRequest, }; -use kvproto::raft_serverpb::{PeerState, RaftMessage, RegionLocalState}; +use kvproto::raft_serverpb::RaftMessage; use kvproto::replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}; use kvproto::{metapb, pdpb}; use ordered_float::OrderedFloat; @@ -26,31 +26,31 @@ use yatp::Remote; use crate::store::cmd_resp::new_error; use crate::store::metrics::*; -use crate::store::util::{ - is_epoch_stale, ConfChangeKind, KeysInfoFormatter, LatencyInspector, RaftstoreDuration, -}; +use crate::store::util::{is_epoch_stale, KeysInfoFormatter, LatencyInspector, RaftstoreDuration}; use crate::store::worker::query_stats::QueryStats; use crate::store::worker::split_controller::{SplitInfo, TOP_N}; use crate::store::worker::{AutoSplitController, ReadStats, WriteStats}; use crate::store::{ + peer::{UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryForceLeaderSyncer}, + transport::SignificantRouter, Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, - RegionReadProgressRegistry, SnapManager, StoreInfo, StoreMsg, TxnExt, + RegionReadProgressRegistry, SignificantMsg, SnapManager, StoreInfo, StoreMsg, TxnExt, }; use collections::HashMap; +use collections::HashSet; use concurrency_manager::ConcurrencyManager; use futures::compat::Future01CompatExt; use futures::FutureExt; use pd_client::metrics::*; use pd_client::{Error, PdClient, RegionStat}; -use protobuf::Message; use resource_metering::{Collector, CollectorGuard, CollectorRegHandle, RawRecords}; use tikv_util::metrics::ThreadInfoStatistics; use tikv_util::time::UnixSecs; use tikv_util::timer::GLOBAL_TIMER_HANDLE; use tikv_util::topn::TopN; use tikv_util::worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}; -use tikv_util::{box_err, box_try, debug, error, info, thd_name, warn}; +use tikv_util::{box_err, debug, error, info, thd_name, warn}; type RecordPairVec = Vec; @@ -135,7 +135,7 @@ where StoreHeartbeat { stats: pdpb::StoreStats, store_info: StoreInfo, - send_detailed_report: bool, + report: Option, dr_autosync_status: Option, }, ReportBatchSplit { @@ -1024,7 +1024,7 @@ where &mut self, mut stats: pdpb::StoreStats, store_info: StoreInfo, - send_detailed_report: bool, + store_report: Option, dr_autosync_status: Option, ) { let store_stats = self.engine_store_server_helper.handle_compute_store_stats(); @@ -1117,119 +1117,72 @@ where let slow_score = self.slow_score.get(); stats.set_slow_score(slow_score as u64); - let mut optional_report = None; - if send_detailed_report { - let mut store_report = pdpb::StoreReport::new(); - store_info - .kv_engine - .scan_cf( - CF_RAFT, - keys::REGION_META_MIN_KEY, - keys::REGION_META_MAX_KEY, - false, - |key, value| { - let (_, suffix) = box_try!(keys::decode_region_meta_key(key)); - if suffix != keys::REGION_STATE_SUFFIX { - return Ok(true); - } - - let mut region_local_state = RegionLocalState::default(); - region_local_state.merge_from_bytes(value)?; - if region_local_state.get_state() == PeerState::Tombstone { - return Ok(true); - } - let raft_local_state = match store_info - .raft_engine - .get_raft_state(region_local_state.get_region().get_id()) - .unwrap() - { - None => return Ok(true), - Some(value) => value, - }; - let mut peer_report = pdpb::PeerReport::new(); - peer_report.set_region_state(region_local_state); - peer_report.set_raft_state(raft_local_state); - store_report.mut_peer_reports().push(peer_report); - Ok(true) - }, - ) - .unwrap(); - optional_report = Some(store_report); - } let 
router = self.router.clone(); - let scheduler = self.scheduler.clone(); - let stats_copy = stats.clone(); - let resp = - self.pd_client - .store_heartbeat(stats, optional_report, dr_autosync_status.clone()); + let resp = self + .pd_client + .store_heartbeat(stats, store_report, dr_autosync_status); let f = async move { match resp.await { Ok(mut resp) => { if let Some(status) = resp.replication_status.take() { let _ = router.send_control(StoreMsg::UpdateReplicationMode(status)); } - if resp.get_require_detailed_report() { - // This store needs to report detailed info of hosted regions to PD. - // - // The info has to be up to date, meaning that all committed changes til now have to be applied before the report is sent. - // The entire process may include: - // 1. `broadcast_normal` "wait apply" messsages to all peers. - // 2. `on_unsafe_recovery_wait_apply` examines whether the peer have not-yet-applied entries, if so, memorize the target index. - // 3. `on_apply_res` checks whether entries before the "unsafe recovery report target commit index" have all been applied. - // The one who finally finds out the number of remaining tasks is 0 schedules an unsafe recovery reporting store heartbeat. - info!("required to send detailed report in the next heartbeat"); - // Init the counter with 1 in case the msg processing is faster than the distributing thus cause FSMs race to send a report. - let counter = Arc::new(AtomicUsize::new(1)); - let counter_clone = counter.clone(); - router.broadcast_normal(|| { - let _ = counter_clone.fetch_add(1, Ordering::Relaxed); - PeerMsg::UnsafeRecoveryWaitApply(counter_clone.clone()) - }); - // Reporting needs to be triggered here in case there is no message to be sent or messages processing finished before above function returns. 
- if counter.fetch_sub(1, Ordering::Relaxed) == 1 { - let task = Task::StoreHeartbeat { - stats: stats_copy, - store_info, - send_detailed_report: true, - dr_autosync_status, - }; - if let Err(e) = scheduler.schedule(task) { - error!("notify pd failed"; "err" => ?e); + if let Some(mut plan) = resp.recovery_plan.take() { + info!("Unsafe recovery, received a recovery plan"); + if plan.has_force_leader() { + let mut failed_stores = HashSet::default(); + for failed_store in plan.get_force_leader().get_failed_stores() { + failed_stores.insert(*failed_store); } - } - } else if resp.has_plan() { - info!("asked to execute recovery plan"); - for create in resp.get_plan().get_creates() { - info!("asked to create region"; "region" => ?create); - if let Err(e) = - router.send_control(StoreMsg::CreatePeer(create.clone())) - { - error!("fail to send creat peer message for recovery"; "err" => ?e); + let syncer = UnsafeRecoveryForceLeaderSyncer::new( + plan.get_step(), + router.clone(), + ); + for region in plan.get_force_leader().get_enter_force_leaders() { + if let Err(e) = router.significant_send( + *region, + SignificantMsg::EnterForceLeaderState { + syncer: syncer.clone(), + failed_stores: failed_stores.clone(), + }, + ) { + error!("fail to send force leader message for recovery"; "err" => ?e); + } } - } - for delete in resp.get_plan().get_deletes() { - info!("asked to delete peer"; "peer" => delete); - if let Err(e) = router.force_send(*delete, PeerMsg::Destroy(*delete)) { - error!("fail to send delete peer message for recovery"; "err" => ?e); + } else { + let syncer = UnsafeRecoveryExecutePlanSyncer::new( + plan.get_step(), + router.clone(), + ); + for create in plan.take_creates().into_iter() { + if let Err(e) = + router.send_control(StoreMsg::UnsafeRecoveryCreatePeer { + syncer: syncer.clone(), + create, + }) + { + error!("fail to send create peer message for recovery"; "err" => ?e); + } } - } - for update in resp.get_plan().get_updates() { - info!("asked to update region's range"; "region" => ?update); - if let Err(e) = router.force_send( - update.get_id(), - PeerMsg::UpdateRegionForUnsafeRecover(update.clone()), - ) { - error!("fail to send update range message for recovery"; "err" => ?e); + for delete in plan.take_tombstones().into_iter() { + if let Err(e) = router.significant_send( + delete, + SignificantMsg::UnsafeRecoveryDestroy(syncer.clone()), + ) { + error!("fail to send delete peer message for recovery"; "err" => ?e); + } + } + for mut demote in plan.take_demotes().into_iter() { + if let Err(e) = router.significant_send( + demote.get_region_id(), + SignificantMsg::UnsafeRecoveryDemoteFailedVoters { + syncer: syncer.clone(), + failed_voters: demote.take_failed_voters().into_vec(), + }, + ) { + error!("fail to send update peer list message for recovery"; "err" => ?e); + } } - } - let task = Task::StoreHeartbeat { - stats: stats_copy, - store_info, - send_detailed_report: true, - dr_autosync_status, - }; - if let Err(e) = scheduler.schedule(task) { - error!("notify pd failed"; "err" => ?e); } } } @@ -1358,7 +1311,6 @@ where "try to change peer"; "region_id" => region_id, "changes" => ?change_peer_v2.get_changes(), - "kind" => ?ConfChangeKind::confchange_kind(change_peer_v2.get_changes().len()), ); let req = new_change_peer_v2_request(change_peer_v2.take_changes().into()); send_admin_request(&router, region_id, epoch, peer, req, Callback::None, Default::default()); @@ -1787,14 +1739,9 @@ where Task::StoreHeartbeat { stats, store_info, - send_detailed_report, - dr_autosync_status, - } => 
self.handle_store_heartbeat( - stats, - store_info, - send_detailed_report, + report, dr_autosync_status, - ), + } => self.handle_store_heartbeat(stats, store_info, report, dr_autosync_status), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), Task::ValidatePeer { region, peer } => self.handle_validate_peer(region, peer), Task::ReadStats { read_stats } => self.handle_read_stats(read_stats), @@ -1889,7 +1836,7 @@ fn new_change_peer_request(change_type: ConfChangeType, peer: metapb::Peer) -> A req } -fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { +pub fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::ChangePeerV2); let change_peer_reqs = changes diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index 05d10f03dfb..067983d2805 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -89,6 +89,7 @@ impl Runner { Ok(s.and_then(|s| s.parse().ok()).unwrap_or(0)) }); let deleted = box_try!(self.engines.raft.batch_gc(regions)); + fail::fail_point!("worker_gc_raft_log_finished", |_| { Ok(deleted) }); Ok(deleted) } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 55f5b991e7c..833920be724 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1460,6 +1460,42 @@ impl Cluster { .unwrap(); } + pub fn enter_force_leader(&mut self, region_id: u64, store_id: u64, failed_stores: Vec) { + let mut plan = pdpb::RecoveryPlan::default(); + let mut force_leader = pdpb::ForceLeader::default(); + force_leader.set_enter_force_leaders([region_id].to_vec()); + force_leader.set_failed_stores(failed_stores.to_vec()); + plan.set_force_leader(force_leader); + // Triggers the unsafe recovery plan execution. + self.pd_client.must_set_unsafe_recovery_plan(store_id, plan); + self.must_send_store_heartbeat(store_id); + } + + pub fn must_enter_force_leader( + &mut self, + region_id: u64, + store_id: u64, + failed_stores: Vec, + ) { + self.enter_force_leader(region_id, store_id, failed_stores); + let mut store_report = None; + for _ in 0..20 { + store_report = self.pd_client.must_get_store_report(store_id); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + } + + pub fn exit_force_leader(&mut self, region_id: u64, store_id: u64) { + let router = self.sim.rl().get_router(store_id).unwrap(); + router + .significant_send(region_id, SignificantMsg::ExitForceLeaderState) + .unwrap(); + } + pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { let mut try_cnt = 0; let split_count = self.pd_client.get_split_count(); @@ -1700,57 +1736,6 @@ impl Cluster { StoreRouter::send(&router, StoreMsg::Tick(StoreTick::PdStoreHeartbeat)).unwrap(); } - pub fn must_update_region_for_unsafe_recover(&mut self, node_id: u64, region: &metapb::Region) { - let router = self.sim.rl().get_router(node_id).unwrap(); - let mut try_cnt = 0; - loop { - if try_cnt % 50 == 0 { - // In case the message is ignored, re-send it every 50 tries. 
- router - .force_send( - region.get_id(), - PeerMsg::UpdateRegionForUnsafeRecover(region.clone()), - ) - .unwrap(); - } - if let Ok(Some(current)) = block_on(self.pd_client.get_region_by_id(region.get_id())) { - if current.get_start_key() == region.get_start_key() - && current.get_end_key() == region.get_end_key() - { - return; - } - } - if try_cnt > 500 { - panic!("region {:?} is not updated", region); - } - try_cnt += 1; - sleep_ms(20); - } - } - - pub fn must_recreate_region_for_unsafe_recover( - &mut self, - node_id: u64, - region: &metapb::Region, - ) { - let router = self.sim.rl().get_router(node_id).unwrap(); - let mut try_cnt = 0; - loop { - if try_cnt % 50 == 0 { - // In case the message is ignored, re-send it every 50 tries. - StoreRouter::send(&router, StoreMsg::CreatePeer(region.clone())).unwrap(); - } - if let Ok(Some(_)) = block_on(self.pd_client.get_region_by_id(region.get_id())) { - return; - } - if try_cnt > 250 { - panic!("region {:?} is not created", region); - } - try_cnt += 1; - sleep_ms(20); - } - } - pub fn gc_peer( &mut self, region_id: u64, diff --git a/components/test_raftstore/src/pd.rs b/components/test_raftstore/src/pd.rs index a436dae8d82..ab494178820 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_raftstore/src/pd.rs @@ -325,7 +325,8 @@ struct PdCluster { pub check_merge_target_integrity: bool, unsafe_recovery_require_report: bool, - unsafe_recovery_store_reported: HashMap, + unsafe_recovery_store_reports: HashMap, + unsafe_recovery_plan: HashMap, } impl PdCluster { @@ -360,7 +361,8 @@ impl PdCluster { region_replication_status: HashMap::default(), check_merge_target_integrity: true, unsafe_recovery_require_report: false, - unsafe_recovery_store_reported: HashMap::default(), + unsafe_recovery_store_reports: HashMap::default(), + unsafe_recovery_plan: HashMap::default(), } } @@ -517,11 +519,11 @@ impl PdCluster { region.get_region_epoch().clone(), ); assert!(end_key > start_key); - let created_by_unsafe_recover = (!start_key.is_empty() || !end_key.is_empty()) - && incoming_epoch.get_version() == 1 - && incoming_epoch.get_conf_ver() == 1; + let created_by_unsafe_recovery = (!start_key.is_empty() || !end_key.is_empty()) + && incoming_epoch.get_version() == 0 + && incoming_epoch.get_conf_ver() == 0; let overlaps = self.get_overlap(start_key, end_key); - if created_by_unsafe_recover { + if created_by_unsafe_recovery { // Allow recreated region by unsafe recover to overwrite other regions with a "older" // epoch. 
return Ok(overlaps); @@ -727,10 +729,14 @@ impl PdCluster { self.min_resolved_ts } - fn handle_store_heartbeat(&mut self) -> Result { + fn handle_store_heartbeat(&mut self, store_id: u64) -> Result { let mut resp = pdpb::StoreHeartbeatResponse::default(); resp.set_require_detailed_report(self.unsafe_recovery_require_report); self.unsafe_recovery_require_report = false; + if let Some((_, plan)) = self.unsafe_recovery_plan.remove_entry(&store_id) { + debug!("Unsafe recovery, sending recovery plan"; "store_id" => store_id, "plan" => ?plan); + resp.set_recovery_plan(plan); + } Ok(resp) } @@ -739,19 +745,16 @@ impl PdCluster { self.unsafe_recovery_require_report = require_report; } - fn get_store_reported(&self, store_id: &u64) -> i32 { - *self - .unsafe_recovery_store_reported - .get(store_id) - .unwrap_or(&0) + fn set_unsafe_recovery_plan(&mut self, store_id: u64, recovery_plan: pdpb::RecoveryPlan) { + self.unsafe_recovery_plan.insert(store_id, recovery_plan); } - fn store_reported_inc(&mut self, store_id: u64) { - let reported = self - .unsafe_recovery_store_reported - .entry(store_id) - .or_insert(0); - *reported += 1; + fn get_store_report(&mut self, store_id: u64) -> Option { + self.unsafe_recovery_store_reports.remove(&store_id) + } + + fn set_store_report(&mut self, store_id: u64, report: pdpb::StoreReport) { + let _ = self.unsafe_recovery_store_reports.insert(store_id, report); } } @@ -1308,8 +1311,12 @@ impl TestPdClient { self.cluster.wl().set_require_report(require_report); } - pub fn must_get_store_reported(&self, store_id: &u64) -> i32 { - self.cluster.rl().get_store_reported(store_id) + pub fn must_get_store_report(&self, store_id: u64) -> Option { + self.cluster.wl().get_store_report(store_id) + } + + pub fn must_set_unsafe_recovery_plan(&self, store_id: u64, plan: pdpb::RecoveryPlan) { + self.cluster.wl().set_unsafe_recovery_plan(store_id, plan) } } @@ -1462,11 +1469,6 @@ impl PdClient for TestPdClient { { let cluster1 = Arc::clone(&self.cluster); let timer = self.timer.clone(); - { - if let Err(e) = self.cluster.try_write() { - println!("try write {:?}", e); - } - } let mut cluster = self.cluster.wl(); let store = cluster .stores @@ -1600,11 +1602,11 @@ impl PdClient for TestPdClient { cluster.store_stats.insert(store_id, stats); - if report.is_some() { - cluster.store_reported_inc(store_id); + if let Some(store_report) = report { + cluster.set_store_report(store_id, store_report); } - let mut resp = cluster.handle_store_heartbeat().unwrap(); + let mut resp = cluster.handle_store_heartbeat(store_id).unwrap(); if let Some(ref status) = cluster.replication_status { resp.set_replication_status(status.clone()); diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index 293e3620f5c..d3a9d2a6d43 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -1,16 +1,18 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::iter::FromIterator; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::Arc; +use std::time::Duration; use futures::executor::block_on; +use kvproto::{metapb, pdpb}; use pd_client::PdClient; use raftstore::store::util::find_peer; use test_raftstore::*; +use tikv_util::{config::ReadableDuration, mpsc}; -#[allow(clippy::mutex_atomic)] #[test] -fn test_unsafe_recover_send_report() { +fn test_unsafe_recovery_send_report() { let mut cluster = new_server_cluster(0, 3); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); assert_eq!(nodes.len(), 3); @@ -27,68 +29,438 @@ fn test_unsafe_recover_send_report() { cluster.put(b"random_key1", b"random_val1").unwrap(); // Blocks the raft apply process on store 1 entirely . - let apply_triggered_pair = Arc::new((Mutex::new(false), Condvar::new())); - let apply_triggered_pair2 = Arc::clone(&apply_triggered_pair); - let apply_released_pair = Arc::new((Mutex::new(false), Condvar::new())); - let apply_released_pair2 = Arc::clone(&apply_released_pair); + let (apply_triggered_tx, apply_triggered_rx) = mpsc::bounded::<()>(1); + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); fail::cfg_callback("on_handle_apply_store_1", move || { - { - let (lock, cvar) = &*apply_triggered_pair2; - let mut triggered = lock.lock().unwrap(); - *triggered = true; - cvar.notify_one(); - } - { - let (lock2, cvar2) = &*apply_released_pair2; - let mut released = lock2.lock().unwrap(); - while !*released { - released = cvar2.wait(released).unwrap(); - } - } + let _ = apply_triggered_tx.send(()); + let _ = apply_released_rx.recv(); }) .unwrap(); // Manually makes an update, and waits for the apply to be triggered, to simulate the "some entries are committed but not applied" scenario. cluster.put(b"random_key2", b"random_val2").unwrap(); - { - let (lock, cvar) = &*apply_triggered_pair; - let mut triggered = lock.lock().unwrap(); - while !*triggered { - triggered = cvar.wait(triggered).unwrap(); + apply_triggered_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + + // Triggers the unsafe recovery store reporting process. + let plan = pdpb::RecoveryPlan::default(); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..20 { + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); + sleep_ms(100); + } + + // Unblocks the apply process. + drop(apply_released_tx); + + // Store reports are sent once the entries are applied. + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; } + sleep_ms(100); } + assert_ne!(store_report, None); + fail::remove("on_handle_apply_store_1"); +} + +#[test] +fn test_unsafe_recovery_execution_result_report() { + let mut cluster = new_server_cluster(0, 3); + // Prolong force leader time. 
+ cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::minutes(5); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::minutes(10); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(2); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.put(b"random_key1", b"random_val1").unwrap(); + + // Split the region into 2, and remove one of them, so that we can test both region peer list + // update and region creation. + pd_client.must_split_region( + region, + pdpb::CheckPolicy::Usekey, + vec![b"random_key1".to_vec()], + ); + let region1 = pd_client.get_region(b"random_key".as_ref()).unwrap(); + let region2 = pd_client.get_region(b"random_key1".as_ref()).unwrap(); + let region1_store0_peer = find_peer(®ion1, nodes[0]).unwrap().to_owned(); + pd_client.must_remove_peer(region1.get_id(), region1_store0_peer); + cluster.must_remove_region(nodes[0], region1.get_id()); // Makes the group lose its quorum. cluster.stop_node(nodes[1]); cluster.stop_node(nodes[2]); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region2.get_id(), + region2.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + } - // Triggers the unsafe recovery store reporting process. - pd_client.must_set_require_report(true); + cluster.must_enter_force_leader(region2.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let to_be_removed: Vec = region2 + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region2.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + + let mut create = metapb::Region::default(); + create.set_id(101); + create.set_end_key(b"random_key1".to_vec()); + let mut peer = metapb::Peer::default(); + peer.set_id(102); + peer.set_store_id(nodes[0]); + create.mut_peers().push(peer); + plan.mut_creates().push(create); + + // Blocks the raft apply process on store 1 entirely . + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("on_handle_apply_store_1", move || { + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + // Triggers the unsafe recovery plan execution. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); cluster.must_send_store_heartbeat(nodes[0]); // No store report is sent, since there are peers have unapplied entries. for _ in 0..20 { - assert_eq!(pd_client.must_get_store_reported(&nodes[0]), 0); + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); + sleep_ms(100); + } + + // Unblocks the apply process. + drop(apply_released_tx); + + // Store reports are sent once the entries are applied. 
+ let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + for peer_report in store_report.unwrap().get_peer_reports() { + let region = peer_report.get_region_state().get_region(); + if region.get_id() == 101 { + assert_eq!(region.get_end_key(), b"random_key1".to_vec()); + } else { + assert_eq!(region.get_id(), region2.get_id()); + for peer in region.get_peers() { + if peer.get_store_id() != nodes[0] { + assert_eq!(peer.get_role(), metapb::PeerRole::Learner); + } + } + } + } + fail::remove("on_handle_apply_store_1"); +} + +#[test] +fn test_unsafe_recover_wait_for_snapshot_apply() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = 8; + cluster.cfg.raft_store.merge_max_log_gap = 3; + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.stop_node(nodes[1]); + let (raft_gc_triggered_tx, raft_gc_triggered_rx) = mpsc::bounded::<()>(1); + let (raft_gc_finished_tx, raft_gc_finished_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("worker_gc_raft_log", move || { + let _ = raft_gc_triggered_rx.recv(); + }) + .unwrap(); + fail::cfg_callback("worker_gc_raft_log_finished", move || { + let _ = raft_gc_finished_tx.send(()); + }) + .unwrap(); + // Add at least 4m data + (0..10).for_each(|_| cluster.must_put(b"random_k", b"random_v")); + // Unblock raft log GC. + drop(raft_gc_triggered_tx); + // Wait until logs are GCed. + raft_gc_finished_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); + // Makes the group lose its quorum. + cluster.stop_node(nodes[2]); + + // Blocks the raft snap apply process. + let (apply_triggered_tx, apply_triggered_rx) = mpsc::bounded::<()>(1); + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("region_apply_snap", move || { + let _ = apply_triggered_tx.send(()); + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + cluster.run_node(nodes[1]).unwrap(); + + apply_triggered_rx + .recv_timeout(Duration::from_secs(1)) + .unwrap(); + + // Triggers the unsafe recovery store reporting process. + let plan = pdpb::RecoveryPlan::default(); + pd_client.must_set_unsafe_recovery_plan(nodes[1], plan); + cluster.must_send_store_heartbeat(nodes[1]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..20 { + assert_eq!(pd_client.must_get_store_report(nodes[1]), None); + sleep_ms(100); + } + + // Unblocks the snap apply process. + drop(apply_released_tx); + + // Store reports are sent once the entries are applied. 
+ let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[1]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + fail::remove("worker_gc_raft_log"); + fail::remove("worker_gc_raft_log_finished"); + fail::remove("raft_before_apply_snap_callback"); +} + +#[test] +fn test_unsafe_recovery_demotion_reentrancy() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. + let store2_peer = find_peer(®ion, nodes[2]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); + } + + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + + // Blocks the raft apply process on store 1 entirely . + let (apply_released_tx, apply_released_rx) = mpsc::bounded::<()>(1); + fail::cfg_callback("on_handle_apply_store_1", move || { + let _ = apply_released_rx.recv(); + }) + .unwrap(); + + // Triggers the unsafe recovery plan execution. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + + // No store report is sent, since there are peers have unapplied entries. + for _ in 0..10 { + assert_eq!(pd_client.must_get_store_report(nodes[0]), None); sleep_ms(100); } + // Send the plan again. + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + // Unblocks the apply process. + drop(apply_released_tx); + + let mut demoted = false; + for _ in 0..10 { + let region_in_pd = block_on(pd_client.get_region_by_id(region.get_id())) + .unwrap() + .unwrap(); + assert_eq!(region_in_pd.get_peers().len(), 3); + demoted = region_in_pd + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + sleep_ms(100); + } + assert_eq!(demoted, true); + fail::remove("on_handle_apply_store_1"); +} + +#[test] +fn test_unsafe_recovery_create_destroy_reentrancy() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + // Makes the leadership definite. 
+ let store2_peer = find_peer(®ion, nodes[1]).unwrap().to_owned(); + cluster.must_transfer_leader(region.get_id(), store2_peer); + cluster.put(b"random_key1", b"random_val1").unwrap(); + + // Split the region into 2, and remove one of them, so that we can test both region peer list + // update and region creation. + pd_client.must_split_region( + region, + pdpb::CheckPolicy::Usekey, + vec![b"random_key1".to_vec()], + ); + let region1 = pd_client.get_region(b"random_key".as_ref()).unwrap(); + let region2 = pd_client.get_region(b"random_key1".as_ref()).unwrap(); + let region1_store0_peer = find_peer(®ion1, nodes[0]).unwrap().to_owned(); + pd_client.must_remove_peer(region1.get_id(), region1_store0_peer); + cluster.must_remove_region(nodes[0], region1.get_id()); + + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); { - let (lock2, cvar2) = &*apply_released_pair; - let mut released = lock2.lock().unwrap(); - *released = true; - cvar2.notify_all(); + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region2.get_id(), + region2.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); } + cluster.must_enter_force_leader(region2.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let mut create = metapb::Region::default(); + create.set_id(101); + create.set_end_key(b"random_key1".to_vec()); + let mut peer = metapb::Peer::default(); + peer.set_id(102); + peer.set_store_id(nodes[0]); + create.mut_peers().push(peer); + plan.mut_creates().push(create); + + plan.mut_tombstones().push(region2.get_id()); + + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + sleep_ms(100); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + // Store reports are sent once the entries are applied. - let mut reported = false; + let mut store_report = None; for _ in 0..20 { - if pd_client.must_get_store_reported(&nodes[0]) > 0 { - reported = true; + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { break; } sleep_ms(100); } - assert_eq!(reported, true); + assert_ne!(store_report, None); + let report = store_report.unwrap(); + let peer_reports = report.get_peer_reports(); + assert_eq!(peer_reports.len(), 1); + let reported_region = peer_reports[0].get_region_state().get_region(); + assert_eq!(reported_region.get_id(), 101); + assert_eq!(reported_region.get_peers().len(), 1); + assert_eq!(reported_region.get_peers()[0].get_id(), 102); fail::remove("on_handle_apply_store_1"); } diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index decc5ea1ab7..efa118fb8f1 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -28,5 +28,5 @@ mod test_status_command; mod test_tombstone; mod test_transfer_leader; mod test_transport; -mod test_unsafe_recover; +mod test_unsafe_recovery; mod test_update_region_size; diff --git a/tests/integrations/raftstore/test_unsafe_recover.rs b/tests/integrations/raftstore/test_unsafe_recover.rs deleted file mode 100644 index 5e7b774a6d1..00000000000 --- a/tests/integrations/raftstore/test_unsafe_recover.rs +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2021 TiKV Project Authors. 
Licensed under Apache-2.0. - -use std::iter::FromIterator; -use std::sync::Arc; - -use futures::executor::block_on; -use kvproto::metapb; -use pd_client::PdClient; -use test_raftstore::*; - -#[test] -fn test_unsafe_recover_update_region() { - let mut cluster = new_server_cluster(0, 3); - cluster.run(); - let nodes = Vec::from_iter(cluster.get_node_ids()); - assert_eq!(nodes.len(), 3); - - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer number check. - pd_client.disable_default_operator(); - - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - - configure_for_lease_read(&mut cluster, None, None); - cluster.stop_node(nodes[1]); - cluster.stop_node(nodes[2]); - cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); - - let mut update = metapb::Region::default(); - update.set_id(1); - update.set_end_key(b"anykey2".to_vec()); - for p in region.get_peers() { - if p.get_store_id() == nodes[0] { - update.mut_peers().push(p.clone()); - } - } - update.mut_region_epoch().set_version(1); - update.mut_region_epoch().set_conf_ver(1); - // Removes the boostrap region, since it overlaps with any regions we create. - cluster.must_update_region_for_unsafe_recover(nodes[0], &update); - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - assert_eq!(region.get_end_key(), b"anykey2"); -} - -#[test] -fn test_unsafe_recover_create_region() { - let mut cluster = new_server_cluster(0, 3); - cluster.run(); - let nodes = Vec::from_iter(cluster.get_node_ids()); - assert_eq!(nodes.len(), 3); - - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer number check. - pd_client.disable_default_operator(); - - let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - - configure_for_lease_read(&mut cluster, None, None); - cluster.stop_node(nodes[1]); - cluster.stop_node(nodes[2]); - cluster.must_wait_for_leader_expire(nodes[0], region.get_id()); - - let mut update = metapb::Region::default(); - update.set_id(1); - update.set_end_key(b"anykey".to_vec()); - for p in region.get_peers() { - if p.get_store_id() == nodes[0] { - update.mut_peers().push(p.clone()); - } - } - update.mut_region_epoch().set_version(1); - update.mut_region_epoch().set_conf_ver(1); - // Removes the boostrap region, since it overlaps with any regions we create. - cluster.must_update_region_for_unsafe_recover(nodes[0], &update); - block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); - let mut create = metapb::Region::default(); - create.set_id(101); - create.set_start_key(b"anykey".to_vec()); - let mut peer = metapb::Peer::default(); - peer.set_id(102); - peer.set_store_id(nodes[0]); - create.mut_peers().push(peer); - cluster.must_recreate_region_for_unsafe_recover(nodes[0], &create); - let region = pd_client.get_region(b"anykey1").unwrap(); - assert_eq!(create.get_id(), region.get_id()); -} diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs new file mode 100644 index 00000000000..487389e407f --- /dev/null +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -0,0 +1,1070 @@ +// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::iter::FromIterator; +use std::sync::Arc; +use std::time::Duration; + +use futures::executor::block_on; +use kvproto::{metapb, pdpb}; +use pd_client::PdClient; +use raft::eraftpb::{ConfChangeType, MessageType}; +use raftstore::store::util::find_peer; +use test_raftstore::*; +use tikv_util::config::ReadableDuration; +use tikv_util::HandyRwLock; + +fn confirm_quorum_is_lost(cluster: &mut Cluster, region: &metapb::Region) { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + // marjority is lost, can't propose command successfully. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .is_err() + ); +} + +#[test] +fn test_unsafe_recovery_demote_failed_voters() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + + confirm_quorum_is_lost(&mut cluster, ®ion); + + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let to_be_removed: Vec = region + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + demote.set_failed_voters(to_be_removed.into()); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut demoted = true; + for _ in 0..10 { + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + demoted = true; + for peer in region.get_peers() { + if peer.get_id() != nodes[0] && peer.get_role() == metapb::PeerRole::Voter { + demoted = false; + } + } + if demoted { + break; + } + sleep_ms(200); + } + assert_eq!(demoted, true); +} + +// Demote non-exist voters will not work, but TiKV should still report to PD. +#[test] +fn test_unsafe_recovery_demote_non_exist_voters() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. 
+ pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + + confirm_quorum_is_lost(&mut cluster, ®ion); + cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + + let mut plan = pdpb::RecoveryPlan::default(); + let mut demote = pdpb::DemoteFailedVoters::default(); + demote.set_region_id(region.get_id()); + let mut peer = metapb::Peer::default(); + peer.set_id(12345); + peer.set_store_id(region.get_id()); + peer.set_role(metapb::PeerRole::Voter); + demote.mut_failed_voters().push(peer); + plan.mut_demotes().push(demote); + pd_client.must_set_unsafe_recovery_plan(nodes[0], plan); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + let report = store_report.unwrap(); + let peer_reports = report.get_peer_reports(); + assert_eq!(peer_reports.len(), 1); + let reported_region = peer_reports[0].get_region_state().get_region(); + assert_eq!(reported_region.get_id(), region.get_id()); + assert_eq!(reported_region.get_peers().len(), 3); + let demoted = reported_region + .get_peers() + .iter() + .any(|peer| peer.get_role() != metapb::PeerRole::Voter); + assert_eq!(demoted, false); + + let region_in_pd = block_on(pd_client.get_region_by_id(region.get_id())) + .unwrap() + .unwrap(); + assert_eq!(region_in_pd.get_peers().len(), 3); + let demoted = region_in_pd + .get_peers() + .iter() + .any(|peer| peer.get_role() != metapb::PeerRole::Voter); + assert_eq!(demoted, false); +} + +#[test] +fn test_unsafe_recovery_auto_promote_learner() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer number check. + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + configure_for_lease_read(&mut cluster, None, None); + + let peer_on_store0 = find_peer(®ion, nodes[0]).unwrap(); + let peer_on_store2 = find_peer(®ion, nodes[2]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + // replace one peer with learner + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store0.clone()); + cluster.pd_client.must_add_peer( + region.get_id(), + new_learner_peer(nodes[0], peer_on_store0.get_id()), + ); + // Sleep 100 ms to wait for the new learner to be initialized. 
+    sleep_ms(100);
+    cluster.stop_node(nodes[1]);
+    cluster.stop_node(nodes[2]);
+
+    confirm_quorum_is_lost(&mut cluster, &region);
+    cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]);
+
+    let to_be_removed: Vec<metapb::Peer> = region
+        .get_peers()
+        .iter()
+        .filter(|&peer| peer.get_store_id() != nodes[0])
+        .cloned()
+        .collect();
+    let mut plan = pdpb::RecoveryPlan::default();
+    let mut demote = pdpb::DemoteFailedVoters::default();
+    demote.set_region_id(region.get_id());
+    demote.set_failed_voters(to_be_removed.into());
+    plan.mut_demotes().push(demote);
+    pd_client.must_set_unsafe_recovery_plan(nodes[0], plan);
+    cluster.must_send_store_heartbeat(nodes[0]);
+
+    let mut demoted = true;
+    let mut promoted = false;
+    for _ in 0..10 {
+        let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap();
+
+        promoted = region
+            .get_peers()
+            .iter()
+            .find(|peer| peer.get_store_id() == nodes[0])
+            .unwrap()
+            .get_role()
+            == metapb::PeerRole::Voter;
+
+        demoted = region
+            .get_peers()
+            .iter()
+            .filter(|peer| peer.get_store_id() != nodes[0])
+            .all(|peer| peer.get_role() == metapb::PeerRole::Learner);
+        if demoted && promoted {
+            break;
+        }
+        sleep_ms(100);
+    }
+    assert_eq!(demoted, true);
+    assert_eq!(promoted, true);
+}
+
+#[test]
+fn test_unsafe_recovery_already_in_joint_state() {
+    let mut cluster = new_server_cluster(0, 3);
+    cluster.run();
+    let nodes = Vec::from_iter(cluster.get_node_ids());
+    assert_eq!(nodes.len(), 3);
+
+    let pd_client = Arc::clone(&cluster.pd_client);
+    // Disable default max peer number check.
+    pd_client.disable_default_operator();
+
+    let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap();
+    configure_for_lease_read(&mut cluster, None, None);
+
+    let peer_on_store0 = find_peer(&region, nodes[0]).unwrap();
+    let peer_on_store2 = find_peer(&region, nodes[2]).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), peer_on_store2.clone());
+    cluster.pd_client.must_add_peer(
+        region.get_id(),
+        new_learner_peer(nodes[2], peer_on_store2.get_id()),
+    );
+    // Wait for the new learner to be initialized.
+    sleep_ms(100);
+    pd_client.must_joint_confchange(
+        region.get_id(),
+        vec![
+            (
+                ConfChangeType::AddLearnerNode,
+                new_learner_peer(nodes[0], peer_on_store0.get_id()),
+            ),
+            (
+                ConfChangeType::AddNode,
+                new_peer(nodes[2], peer_on_store2.get_id()),
+            ),
+        ],
+    );
+    cluster.stop_node(nodes[1]);
+    cluster.stop_node(nodes[2]);
+    cluster.must_wait_for_leader_expire(nodes[0], region.get_id());
+
+    confirm_quorum_is_lost(&mut cluster, &region);
+    cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]);
+
+    let to_be_removed: Vec<metapb::Peer> = region
+        .get_peers()
+        .iter()
+        .filter(|&peer| peer.get_store_id() != nodes[0])
+        .cloned()
+        .collect();
+    let mut plan = pdpb::RecoveryPlan::default();
+    let mut demote = pdpb::DemoteFailedVoters::default();
+    demote.set_region_id(region.get_id());
+    demote.set_failed_voters(to_be_removed.into());
+    plan.mut_demotes().push(demote);
+    pd_client.must_set_unsafe_recovery_plan(nodes[0], plan);
+    cluster.must_send_store_heartbeat(nodes[0]);
+
+    let mut demoted = true;
+    let mut promoted = false;
+    for _ in 0..10 {
+        let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap();
+
+        promoted = region
+            .get_peers()
+            .iter()
+            .find(|peer| peer.get_store_id() == nodes[0])
+            .unwrap()
+            .get_role()
+            == metapb::PeerRole::Voter;
+
+        demoted = region
+            .get_peers()
+            .iter()
+            .filter(|peer| peer.get_store_id() != nodes[0])
+            .all(|peer| peer.get_role() == metapb::PeerRole::Learner);
+        if demoted && promoted {
+            break;
+        }
+        sleep_ms(100);
+    }
+    assert_eq!(demoted, true);
+    assert_eq!(promoted, true);
+}
+
+#[test]
+fn test_unsafe_recovery_create_region() {
+    let mut cluster = new_server_cluster(0, 3);
+    cluster.run();
+    let nodes = Vec::from_iter(cluster.get_node_ids());
+    assert_eq!(nodes.len(), 3);
+
+    let pd_client = Arc::clone(&cluster.pd_client);
+    // Disable default max peer number check.
+    pd_client.disable_default_operator();
+
+    let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap();
+    let store0_peer = find_peer(&region, nodes[0]).unwrap().to_owned();
+
+    // Remove the bootstrap region, since it overlaps with the region we are going to create.
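+    // First ask PD to drop the peer, then destroy the region data on the store.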
+    pd_client.must_remove_peer(region.get_id(), store0_peer);
+    cluster.must_remove_region(nodes[0], region.get_id());
+
+    configure_for_lease_read(&mut cluster, None, None);
+    cluster.stop_node(nodes[1]);
+    cluster.stop_node(nodes[2]);
+    cluster.must_wait_for_leader_expire(nodes[0], region.get_id());
+
+    let mut create = metapb::Region::default();
+    create.set_id(101);
+    create.set_start_key(b"anykey".to_vec());
+    let mut peer = metapb::Peer::default();
+    peer.set_id(102);
+    peer.set_store_id(nodes[0]);
+    create.mut_peers().push(peer);
+    let mut plan = pdpb::RecoveryPlan::default();
+    plan.mut_creates().push(create);
+    pd_client.must_set_unsafe_recovery_plan(nodes[0], plan);
+    cluster.must_send_store_heartbeat(nodes[0]);
+    let mut created = false;
+    for _ in 1..11 {
+        let region = pd_client.get_region(b"anykey1").unwrap();
+        if region.get_id() == 101 {
+            created = true;
+        }
+        sleep_ms(200);
+    }
+    assert_eq!(created, true);
+}
+
+fn must_get_error_recovery_in_progress<T: Simulator>(
+    cluster: &mut Cluster<T>,
+    region: &metapb::Region,
+    cmd: kvproto::raft_cmdpb::Request,
+) {
+    let req = new_request(
+        region.get_id(),
+        region.get_region_epoch().clone(),
+        vec![cmd],
+        true,
+    );
+    let resp = cluster
+        .call_command_on_leader(req, Duration::from_millis(100))
+        .unwrap();
+    assert_eq!(
+        resp.get_header().get_error().get_recovery_in_progress(),
+        &kvproto::errorpb::RecoveryInProgress {
+            region_id: region.get_id(),
+            ..Default::default()
+        }
+    );
+}
+
+// Test the case that two of three nodes fail and force leader on the remaining node.
+#[test]
+fn test_force_leader_three_nodes() {
+    let mut cluster = new_node_cluster(0, 3);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store3 = find_peer(&region, 3).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store3.clone());
+
+    cluster.stop_node(2);
+    cluster.stop_node(3);
+
+    // quorum is lost, can't propose command successfully.
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![2, 3]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 2).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    // forbid writes in force leader state
+    let put = new_put_cmd(b"k3", b"v3");
+    must_get_error_recovery_in_progress(&mut cluster, &region, put);
+    // forbid reads in force leader state
+    let get = new_get_cmd(b"k1");
+    must_get_error_recovery_in_progress(&mut cluster, &region, get);
+    // forbid read index in force leader state
+    let read_index = new_read_index_cmd();
+    must_get_error_recovery_in_progress(&mut cluster, &region, read_index);
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on one of the
+// remaining nodes.
+#[test]
+fn test_force_leader_five_nodes() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    // quorum is lost, can't propose command successfully.
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    // forbid writes in force leader state
+    let put = new_put_cmd(b"k3", b"v3");
+    must_get_error_recovery_in_progress(&mut cluster, &region, put);
+    // forbid reads in force leader state
+    let get = new_get_cmd(b"k1");
+    must_get_error_recovery_in_progress(&mut cluster, &region, get);
+    // forbid read index in force leader state
+    let read_index = new_read_index_cmd();
+    must_get_error_recovery_in_progress(&mut cluster, &region, read_index);
+
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on the remaining
+// node whose peer is a learner.
+#[test]
+fn test_force_leader_for_learner() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 5;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    let peer_on_store1 = find_peer(&region, 1).unwrap();
+    // replace one peer with a learner
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), peer_on_store1.clone());
+    cluster.pd_client.must_add_peer(
+        region.get_id(),
+        new_learner_peer(peer_on_store1.get_store_id(), peer_on_store1.get_id()),
+    );
+    // Sleep 100 ms to wait for the new learner to be initialized.
+    sleep_ms(100);
+
+    must_get_equal(&cluster.get_engine(1), b"k1", b"v1");
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    // wait election timeout
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 2,
+    ));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // promote the learner first and remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_add_peer(region.get_id(), find_peer(&region, 1).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+    cluster.must_transfer_leader(region.get_id(), find_peer(&region, 1).unwrap().clone());
+}
+
+// Test the case that three of five nodes fail and force leader on a hibernated
+// previous leader.
+#[test]
+fn test_force_leader_on_hibernated_leader() {
+    let mut cluster = new_node_cluster(0, 5);
+    configure_for_hibernate(&mut cluster);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store1 = find_peer(&region, 1).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone());
+
+    // wait a while to hibernate
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 3,
+    ));
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on a hibernated
+// previous follower.
+#[test]
+fn test_force_leader_on_hibernated_follower() {
+    test_util::init_log_for_test();
+    let mut cluster = new_node_cluster(0, 5);
+    configure_for_hibernate(&mut cluster);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    // wait a while to hibernate
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 3,
+    ));
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on the remaining
+// node, which triggers a snapshot for the isolated peer.
+#[test]
+fn test_force_leader_trigger_snapshot() {
+    let mut cluster = new_node_cluster(0, 5);
+    configure_for_snapshot(&mut cluster);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 10;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(90);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store1 = find_peer(&region, 1).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone());
+
+    // Isolate node 2
+    cluster.add_send_filter(IsolationFilterFactory::new(2));
+
+    // Compact logs to force requesting a snapshot after clearing the send filters.
+    let state = cluster.truncated_state(region.get_id(), 1);
+    // Write some data to trigger snapshot.
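+    // The writes below push the raft log well past the recorded truncated index,
+    // so the isolated peer 2 can only catch up via a snapshot later.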
+    for i in 100..150 {
+        let key = format!("k{}", i);
+        let value = format!("v{}", i);
+        cluster.must_put(key.as_bytes(), value.as_bytes());
+    }
+    cluster.wait_log_truncated(region.get_id(), 1, state.get_index() + 40);
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    // Recover the isolation of node 2, but still don't permit snapshot messages.
+    let recv_filter = Box::new(
+        RegionPacketFilter::new(region.get_id(), 2)
+            .direction(Direction::Recv)
+            .msg_type(MessageType::MsgSnapshot),
+    );
+    cluster.sim.wl().add_recv_filter(2, recv_filter);
+    cluster.clear_send_filters();
+
+    // wait election timeout
+    sleep_ms(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 5,
+    );
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+
+    sleep_ms(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 3,
+    );
+    let cmd = new_change_peer_request(
+        ConfChangeType::RemoveNode,
+        find_peer(&region, 3).unwrap().clone(),
+    );
+    let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd);
+    // Though it is the force leader now, the command can't be committed because the
+    // log is not replicated to all the alive peers.
+    assert!(
+        cluster
+            .call_command_on_leader(req, Duration::from_millis(1000))
+            .unwrap()
+            .get_header()
+            .has_error() // the error "there is a pending conf change" indicates no log has been committed since becoming leader
+    );
+
+    // Permit snapshot messages; the snapshot should be applied and the commit index should advance now.
+    cluster.sim.wl().clear_recv_filters(2);
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), None);
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+    cluster.must_transfer_leader(region.get_id(), find_peer(&region, 1).unwrap().clone());
+}
+
+// Test the case that three of five nodes fail and force leader on the remaining
+// node with an uncommitted conf change.
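+// The pending conf change is expected to be committed once the peer enters force
+// leader state.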
+#[test]
+fn test_force_leader_with_uncommitted_conf_change() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 10;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(90);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store1 = find_peer(&region, 1).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    // an uncommitted conf change
+    let cmd = new_change_peer_request(
+        ConfChangeType::RemoveNode,
+        find_peer(&region, 2).unwrap().clone(),
+    );
+    let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd);
+    assert!(
+        cluster
+            .call_command_on_leader(req, Duration::from_millis(10))
+            .is_err()
+    );
+
+    // wait election timeout
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 2,
+    ));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // the uncommitted conf change is committed successfully after becoming force leader
+    cluster
+        .pd_client
+        .must_none_peer(region.get_id(), find_peer(&region, 2).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k2"), Some(b"v2".to_vec()));
+    assert_eq!(cluster.must_get(b"k3"), None);
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that none of the five nodes fails and force leader is requested
+// on one of the nodes.
+// Note: it still can't defend against extreme misuse. For example, take a group of
+// a, b and c, where c is isolated from a and a is the leader. If c has increased its
+// term by 2 somehow (for example, two successful false prevotes) and force leader is
+// sent to b, breaking the lease constraint, then b will reject a's heartbeat but can
+// still vote for c. So c becomes leader and there are two leaders in the group.
+#[test]
+fn test_force_leader_on_healthy_region() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(30);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 5;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    // try to enter force leader; it can't succeed because quorum isn't lost
+    cluster.enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // make sure it leaves the pre force leader state.
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 3,
+    ));
+    // put and get can propose successfully.
+    assert_eq!(cluster.must_get(b"k1"), Some(b"v1".to_vec()));
+    cluster.must_put(b"k2", b"v2");
+
+    // try to exit force leader; it will be ignored silently as the peer is not in force leader state
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader on the one that
+// doesn't have the latest log.
+#[test]
+fn test_force_leader_on_wrong_leader() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    // peer on node 2 doesn't have the latest committed log
+    cluster.stop_node(2);
+    cluster.must_put(b"k2", b"v2");
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+    cluster.run_node(2).unwrap();
+
+    // restart to clean lease
+    cluster.stop_node(1);
+    cluster.run_node(1).unwrap();
+
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    // try to force leader on the peer of node 2, which is stale
+    cluster.must_enter_force_leader(region.get_id(), 2, vec![3, 4, 5]);
+    // can't propose conf change as it's not in force leader state
+    let cmd = new_change_peer_request(
+        ConfChangeType::RemoveNode,
+        find_peer(&region, 3).unwrap().clone(),
+    );
+    let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd);
+    assert!(
+        cluster
+            .call_command_on_leader(req, Duration::from_millis(10))
+            .is_err()
+    );
+    cluster.exit_force_leader(region.get_id(), 2);
+
+    // peer on node 2 still doesn't have the latest committed log.
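+    // k2 was written while node 2 was down, so its engine must not contain it.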
+    must_get_none(&cluster.get_engine(2), b"k2");
+}
+
+// Test the case that three of five nodes fail and force leader is entered twice,
+// on peers on different nodes.
+#[test]
+fn test_force_leader_twice_on_different_peers() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    // restart to clean lease
+    cluster.stop_node(1);
+    cluster.run_node(1).unwrap();
+    cluster.stop_node(2);
+    cluster.run_node(2).unwrap();
+    confirm_quorum_is_lost(&mut cluster, &region);
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // enter force leader on a different peer
+    cluster.must_enter_force_leader(region.get_id(), 2, vec![3, 4, 5]);
+    // leader is the peer of store 2
+    assert_eq!(
+        cluster.leader_of_region(region.get_id()).unwrap(),
+        *find_peer(&region, 2).unwrap()
+    );
+
+    // the peer of store 1 should exit force leader state, so proposing a conf change on it should fail
+    let conf_change = new_change_peer_request(ConfChangeType::RemoveNode, new_peer(3, 3));
+    let mut req = new_admin_request(region.get_id(), region.get_region_epoch(), conf_change);
+    req.mut_header()
+        .set_peer(find_peer(&region, 1).unwrap().clone());
+    let resp = cluster
+        .call_command(req, Duration::from_millis(10))
+        .unwrap();
+    let mut not_leader = kvproto::errorpb::NotLeader {
+        region_id: region.get_id(),
+        ..Default::default()
+    };
+    not_leader.set_leader(find_peer(&region, 2).unwrap().clone());
+    assert_eq!(resp.get_header().get_error().get_not_leader(), &not_leader);
+
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 2);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader is entered twice
+// on the peer on the same node.
+#[test]
+fn test_force_leader_twice_on_same_peer() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    // restart to clean lease
+    cluster.stop_node(1);
+    cluster.run_node(1).unwrap();
+    cluster.stop_node(2);
+    cluster.run_node(2).unwrap();
+
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}
+
+// Test the case that three of five nodes fail and force leader doesn't finish
+// within one election round due to a network partition.
+#[test]
+fn test_force_leader_multiple_election_rounds() {
+    let mut cluster = new_node_cluster(0, 5);
+    cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(30);
+    cluster.cfg.raft_store.raft_election_timeout_ticks = 5;
+    cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40);
+    cluster.pd_client.disable_default_operator();
+
+    cluster.run();
+    cluster.must_put(b"k1", b"v1");
+
+    let region = cluster.get_region(b"k1");
+    cluster.must_split(&region, b"k9");
+    let region = cluster.get_region(b"k2");
+    let peer_on_store5 = find_peer(&region, 5).unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_on_store5.clone());
+
+    cluster.stop_node(3);
+    cluster.stop_node(4);
+    cluster.stop_node(5);
+
+    cluster.add_send_filter(IsolationFilterFactory::new(1));
+    cluster.add_send_filter(IsolationFilterFactory::new(2));
+
+    // wait election timeout
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 2,
+    ));
+    cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]);
+    // wait multiple election rounds
+    std::thread::sleep(Duration::from_millis(
+        cluster.cfg.raft_store.raft_election_timeout_ticks as u64
+            * cluster.cfg.raft_store.raft_base_tick_interval.as_millis()
+            * 6,
+    ));
+
+    cluster.clear_send_filters();
+    // remove the peers on failed nodes
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 3).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 4).unwrap().clone());
+    cluster
+        .pd_client
+        .must_remove_peer(region.get_id(), find_peer(&region, 5).unwrap().clone());
+    cluster.exit_force_leader(region.get_id(), 1);
+
+    // quorum is formed, can propose command successfully now
+    cluster.must_put(b"k4", b"v4");
+    assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec()));
+}