Skip to content

Commit 6619277

Browse files
authored
network/strategy: Backoff and ban overloaded peers to avoid submitting the same request multiple times (#5029)
This PR avoids submitting the same block or state request multiple times to the same slow peer. Previously, we submitted the same request to the same slow peer, which resulted in reputation bans on the slow peer side. Furthermore, the strategy selected the same slow peer multiple times to submit queries to, although a better candidate may exist. Instead, in this PR we: - introduce a `DisconnectedPeers` via LRU with 512 peer capacity to only track the state of disconnected peers with a request in flight - when the `DisconnectedPeers` detects a peer disconnected with a request in flight, the peer is backed off - on the first disconnection: 60 seconds - on second disconnection: 120 seconds - on the third disconnection the peer is banned, and the peer remains banned until the peerstore decays its reputation This PR lifts the pressure from overloaded nodes that cannot process requests in due time. And if a peer is detected to be slow after backoffs, the peer is banned. Theoretically, submitting the same request multiple times can still happen when: - (a) we backoff and ban the peer - (b) the network does not discover other peers -- this may also be a test net - (c) the peer gets reconnected after the reputation decay and is still slow to respond Aims to improve: - #4924 - #531 Next Steps: - Investigate the network after this is deployed, possibly bumping the keep-alive timeout or seeing if there's something else misbehaving This PR builds on top of: - #4987 ### Testing Done - Added a couple of unit tests where test harness were set in place - Local testnet ```bash 13:13:25.102 DEBUG tokio-runtime-worker sync::persistent_peer_state: Added first time peer 12D3KooWHdiAxVd8uMQR1hGWXccidmfCwLqcMpGwR6QcTP6QRMuD 13:14:39.102 DEBUG tokio-runtime-worker sync::persistent_peer_state: Remove known peer 12D3KooWHdiAxVd8uMQR1hGWXccidmfCwLqcMpGwR6QcTP6QRMuD state: DisconnectedPeerState { num_disconnects: 2, last_disconnect: Instant { tv_sec: 93355, tv_nsec: 942016062 } }, should ban: 
false 13:16:49.107 DEBUG tokio-runtime-worker sync::persistent_peer_state: Remove known peer 12D3KooWHdiAxVd8uMQR1hGWXccidmfCwLqcMpGwR6QcTP6QRMuD state: DisconnectedPeerState { num_disconnects: 3, last_disconnect: Instant { tv_sec: 93485, tv_nsec: 947551051 } }, should ban: true 13:16:49.108 WARN tokio-runtime-worker peerset: Report 12D3KooWHdiAxVd8uMQR1hGWXccidmfCwLqcMpGwR6QcTP6QRMuD: -2147483648 to -2147483648. Reason: Slow peer after backoffs. Banned, disconnecting. ``` cc @paritytech/networking --------- Signed-off-by: Alexandru Vasile <[email protected]>
1 parent ad1e556 commit 6619277

File tree

7 files changed

+377
-9
lines changed

7 files changed

+377
-9
lines changed

prdoc/pr_5029.prdoc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0
2+
# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json
3+
4+
title: Backoff slow peers to avoid duplicate requests
5+
6+
doc:
7+
- audience: Node Dev
8+
description: |
9+
This PR introduces a backoff strategy mechanism. Whenever a peer disconnects with an inflight
10+
block (or state) request, the peer is backed off for a period of time before receiving requests.
11+
After several attempts, the peer is disconnected and banned. The strategy aims to offload
12+
the pressure from peers that are slow to respond or overloaded.
13+
14+
crates:
15+
- name: sc-network-sync
16+
bump: minor

substrate/client/network/sync/src/engine.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,10 @@ where
536536
},
537537
BlockAnnounceValidationResult::Failure { peer_id, disconnect } => {
538538
if disconnect {
539+
log::debug!(
540+
target: LOG_TARGET,
541+
"Disconnecting peer {peer_id} due to block announce validation failure",
542+
);
539543
self.network_service
540544
.disconnect_peer(peer_id, self.block_announce_protocol_name.clone());
541545
}

substrate/client/network/sync/src/strategy.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
//! and specific syncing algorithms.
2121
2222
pub mod chain_sync;
23+
mod disconnected_peers;
2324
mod state;
2425
pub mod state_sync;
2526
pub mod warp;

substrate/client/network/sync/src/strategy/chain_sync.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ use crate::{
3333
justification_requests::ExtraRequests,
3434
schema::v1::StateResponse,
3535
strategy::{
36+
disconnected_peers::DisconnectedPeers,
3637
state_sync::{ImportResult, StateSync, StateSyncProvider},
3738
warp::{WarpSyncPhase, WarpSyncProgress},
3839
},
@@ -250,6 +251,7 @@ pub struct ChainSync<B: BlockT, Client> {
250251
client: Arc<Client>,
251252
/// The active peers that we are using to sync and their PeerSync status
252253
peers: HashMap<PeerId, PeerSync<B>>,
254+
disconnected_peers: DisconnectedPeers,
253255
/// A `BlockCollection` of blocks that are being downloaded from peers
254256
blocks: BlockCollection<B>,
255257
/// The best block number in our queue of blocks to import
@@ -378,6 +380,7 @@ where
378380
let mut sync = Self {
379381
client,
380382
peers: HashMap::new(),
383+
disconnected_peers: DisconnectedPeers::new(),
381384
blocks: BlockCollection::new(),
382385
best_queued_hash: Default::default(),
383386
best_queued_number: Zero::zero(),
@@ -1141,7 +1144,17 @@ where
11411144
if let Some(gap_sync) = &mut self.gap_sync {
11421145
gap_sync.blocks.clear_peer_download(peer_id)
11431146
}
1144-
self.peers.remove(peer_id);
1147+
1148+
if let Some(state) = self.peers.remove(peer_id) {
1149+
if !state.state.is_available() {
1150+
if let Some(bad_peer) =
1151+
self.disconnected_peers.on_disconnect_during_request(*peer_id)
1152+
{
1153+
self.actions.push(ChainSyncAction::DropPeer(bad_peer));
1154+
}
1155+
}
1156+
}
1157+
11451158
self.extra_justifications.peer_disconnected(peer_id);
11461159
self.allowed_requests.set_all();
11471160
self.fork_targets.retain(|_, target| {
@@ -1541,10 +1554,14 @@ where
15411554
let max_parallel = if is_major_syncing { 1 } else { self.max_parallel_downloads };
15421555
let max_blocks_per_request = self.max_blocks_per_request;
15431556
let gap_sync = &mut self.gap_sync;
1557+
let disconnected_peers = &mut self.disconnected_peers;
15441558
self.peers
15451559
.iter_mut()
15461560
.filter_map(move |(&id, peer)| {
1547-
if !peer.state.is_available() || !allowed_requests.contains(&id) {
1561+
if !peer.state.is_available() ||
1562+
!allowed_requests.contains(&id) ||
1563+
!disconnected_peers.is_peer_available(&id)
1564+
{
15481565
return None
15491566
}
15501567

@@ -1656,7 +1673,10 @@ where
16561673
}
16571674

16581675
for (id, peer) in self.peers.iter_mut() {
1659-
if peer.state.is_available() && peer.common_number >= sync.target_number() {
1676+
if peer.state.is_available() &&
1677+
peer.common_number >= sync.target_number() &&
1678+
self.disconnected_peers.is_peer_available(&id)
1679+
{
16601680
peer.state = PeerSyncState::DownloadingState;
16611681
let request = sync.next_request();
16621682
trace!(target: LOG_TARGET, "New StateRequest for {}: {:?}", id, request);
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
// This file is part of Substrate.
2+
3+
// Copyright (C) Parity Technologies (UK) Ltd.
4+
// SPDX-License-Identifier: GPL-3.0-or-later WITH Classpath-exception-2.0
5+
6+
// This program is free software: you can redistribute it and/or modify
7+
// it under the terms of the GNU General Public License as published by
8+
// the Free Software Foundation, either version 3 of the License, or
9+
// (at your option) any later version.
10+
11+
// This program is distributed in the hope that it will be useful,
12+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
// GNU General Public License for more details.
15+
16+
// You should have received a copy of the GNU General Public License
17+
// along with this program. If not, see <https://www.gnu.org/licenses/>.
18+
19+
use crate::types::BadPeer;
20+
use sc_network::ReputationChange as Rep;
21+
use sc_network_types::PeerId;
22+
use schnellru::{ByLength, LruMap};
23+
24+
const LOG_TARGET: &str = "sync::disconnected_peers";
25+
26+
/// The maximum number of disconnected peers to keep track of.
27+
///
28+
/// When a peer disconnects, we must keep track if it was in the middle of a request.
29+
/// The peer may disconnect because it cannot keep up with the number of requests
30+
/// (i.e. not having enough resources available to handle the requests), or because it is malicious.
31+
const MAX_DISCONNECTED_PEERS_STATE: u32 = 512;
32+
33+
/// The time we are going to backoff a peer that has disconnected with an inflight request.
34+
///
35+
/// The backoff time is calculated as `num_disconnects * DISCONNECTED_PEER_BACKOFF_SECONDS`.
36+
/// This is to prevent submitting a request to a peer that has disconnected because it could not
37+
/// keep up with the number of requests.
38+
///
39+
/// The peer may disconnect due to the keep-alive timeout, however disconnections without
40+
/// an inflight request are not tracked.
41+
const DISCONNECTED_PEER_BACKOFF_SECONDS: u64 = 60;
42+
43+
/// Maximum number of disconnects with a request in flight before a peer is banned.
44+
const MAX_NUM_DISCONNECTS: u64 = 3;
45+
46+
/// Peer disconnected with a request in flight after backoffs.
47+
///
48+
/// The peer may be slow to respond to the request after backoffs, or it refuses to respond.
49+
/// Report the peer and let the reputation system handle disconnecting the peer.
50+
pub const REPUTATION_REPORT: Rep = Rep::new_fatal("Peer disconnected with inflight after backoffs");
51+
52+
/// Bookkeeping for one peer that dropped its connection while a request was in flight.
#[derive(Debug)]
struct DisconnectedState {
    /// How many disconnects have been recorded for the peer so far.
    num_disconnects: u64,
    /// Timestamp taken when the most recent disconnect was recorded.
    last_disconnect: std::time::Instant,
}

impl DisconnectedState {
    /// Build the state for a peer whose first disconnect is being recorded.
    ///
    /// The counter starts at one and the timestamp is taken at construction time.
    pub fn new() -> Self {
        let last_disconnect = std::time::Instant::now();
        Self { num_disconnects: 1, last_disconnect }
    }

    /// Record one more disconnect and refresh the timestamp.
    ///
    /// The counter saturates at `u64::MAX` instead of wrapping around.
    pub fn increment(&mut self) {
        let bumped = self.num_disconnects.saturating_add(1);
        *self = Self { num_disconnects: bumped, last_disconnect: std::time::Instant::now() };
    }

    /// Number of disconnects recorded so far.
    pub fn num_disconnects(&self) -> u64 {
        let Self { num_disconnects, .. } = self;
        *num_disconnects
    }

    /// Instant at which the latest disconnect was recorded.
    pub fn last_disconnect(&self) -> std::time::Instant {
        let Self { last_disconnect, .. } = self;
        *last_disconnect
    }
}
83+
84+
/// Tracks the state of disconnected peers with a request in flight.
85+
///
86+
/// This helps to prevent submitting requests to peers that have disconnected
87+
/// before responding to the request to offload the peer.
88+
pub struct DisconnectedPeers {
89+
/// The state of disconnected peers.
90+
disconnected_peers: LruMap<PeerId, DisconnectedState>,
91+
/// Backoff duration in seconds.
92+
backoff_seconds: u64,
93+
}
94+
95+
impl DisconnectedPeers {
96+
/// Create a new `DisconnectedPeers`.
97+
pub fn new() -> Self {
98+
Self {
99+
disconnected_peers: LruMap::new(ByLength::new(MAX_DISCONNECTED_PEERS_STATE)),
100+
backoff_seconds: DISCONNECTED_PEER_BACKOFF_SECONDS,
101+
}
102+
}
103+
104+
/// Insert a new peer to the persistent state if not seen before, or update the state if seen.
105+
///
106+
/// Returns true if the peer should be disconnected.
107+
pub fn on_disconnect_during_request(&mut self, peer: PeerId) -> Option<BadPeer> {
108+
if let Some(state) = self.disconnected_peers.get(&peer) {
109+
state.increment();
110+
111+
let should_ban = state.num_disconnects() >= MAX_NUM_DISCONNECTS;
112+
log::debug!(
113+
target: LOG_TARGET,
114+
"Disconnected known peer {peer} state: {state:?}, should ban: {should_ban}",
115+
);
116+
117+
should_ban.then(|| {
118+
// We can lose track of the peer state and let the banning mechanism handle
119+
// the peer backoff.
120+
//
121+
// After the peer banning expires, if the peer continues to misbehave, it will be
122+
// backed off again.
123+
self.disconnected_peers.remove(&peer);
124+
BadPeer(peer, REPUTATION_REPORT)
125+
})
126+
} else {
127+
log::debug!(
128+
target: LOG_TARGET,
129+
"Added peer {peer} for the first time"
130+
);
131+
// First time we see this peer.
132+
self.disconnected_peers.insert(peer, DisconnectedState::new());
133+
None
134+
}
135+
}
136+
137+
/// Check if a peer is available for queries.
138+
pub fn is_peer_available(&mut self, peer_id: &PeerId) -> bool {
139+
let Some(state) = self.disconnected_peers.get(peer_id) else {
140+
return true;
141+
};
142+
143+
let elapsed = state.last_disconnect().elapsed();
144+
if elapsed.as_secs() >= self.backoff_seconds * state.num_disconnects {
145+
log::debug!(target: LOG_TARGET, "Peer {peer_id} is available for queries");
146+
self.disconnected_peers.remove(peer_id);
147+
true
148+
} else {
149+
log::debug!(target: LOG_TARGET,"Peer {peer_id} is backedoff");
150+
false
151+
}
152+
}
153+
}
154+
155+
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Each disconnect below the threshold backs the peer off; the one that
    /// reaches the threshold reports it.
    #[test]
    fn test_disconnected_peer_state() {
        let mut tracker = DisconnectedPeers::new();
        let peer = PeerId::random();

        // A peer that was never recorded is available.
        assert!(tracker.is_peer_available(&peer));

        // Disconnects below the threshold yield no report and leave the peer backed off.
        for _ in 1..MAX_NUM_DISCONNECTS {
            let report = tracker.on_disconnect_during_request(peer);
            assert!(report.is_none());
            assert!(!tracker.is_peer_available(&peer));
        }

        // The disconnect that reaches the threshold yields a report...
        let report = tracker.on_disconnect_during_request(peer);
        assert!(report.is_some());
        // ...and the tracker no longer holds an entry for the peer.
        assert!(tracker.disconnected_peers.get(&peer).is_none());
    }

    /// A backed-off peer becomes available again once the backoff has elapsed.
    #[test]
    fn ensure_backoff_time() {
        const TEST_BACKOFF_SECONDS: u64 = 2;
        let mut tracker = DisconnectedPeers {
            disconnected_peers: LruMap::new(ByLength::new(1)),
            backoff_seconds: TEST_BACKOFF_SECONDS,
        };
        let peer = PeerId::random();

        let report = tracker.on_disconnect_during_request(peer);
        assert!(report.is_none());
        assert!(!tracker.is_peer_available(&peer));

        // Sleep past the configured backoff before probing again.
        let wait = Duration::from_secs(TEST_BACKOFF_SECONDS + 1);
        std::thread::sleep(wait);

        assert!(tracker.is_peer_available(&peer));
    }
}

0 commit comments

Comments
 (0)