From 455e334eeaeb1a748dc9834697bd56ef3f12e7e3 Mon Sep 17 00:00:00 2001 From: Iulian Barbu Date: Thu, 29 Aug 2024 12:31:35 +0300 Subject: [PATCH 1/8] cumulus/client: added external rpc connection retry logic Signed-off-by: Iulian Barbu --- .../src/reconnecting_ws_client.rs | 39 +++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs index 48d35dd3a55e..f030fc7dee4f 100644 --- a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs +++ b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs @@ -34,7 +34,7 @@ use jsonrpsee::{ use sc_rpc_api::chain::ChainApiClient; use schnellru::{ByLength, LruMap}; use sp_runtime::generic::SignedBlock; -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use tokio::sync::mpsc::{ channel as tokio_channel, Receiver as TokioReceiver, Sender as TokioSender, }; @@ -43,6 +43,9 @@ use url::Url; use crate::rpc_client::{distribute_header, RpcDispatcherMessage}; const LOG_TARGET: &str = "reconnecting-websocket-client"; +const DEFAULT_EXTERNAL_RPC_CONN_RETRIES: usize = 5; +const DEFAULT_SLEEP_TIME_MS_BETWEEN_RETRIES: u64 = 1000; +const DEFAULT_SLEEP_EXP_BACKOFF_BETWEEN_RETRIES: i32 = 2; /// Worker that should be used in combination with [`RelayChainRpcClient`]. /// @@ -93,16 +96,45 @@ struct RelayChainSubscriptions { best_subscription: Subscription, } -/// Try to find a new RPC server to connect to. +/// Try to find a new RPC server to connect to. Uses a naive retry +/// logic that does an exponential backoff in between iterations +/// through all URLs from the list. It uses a constant to tell how +/// many iterations of connection attempts to all URLs we allow. We +/// return early when a connection is made. async fn connect_next_available_rpc_server( urls: &Vec, starting_position: usize, ) -> Result<(usize, Arc), ()> { tracing::debug!(target: LOG_TARGET, starting_position, "Connecting to RPC server."); - for (counter, url) in urls.iter().cycle().skip(starting_position).take(urls.len()).enumerate() { + + let mut prev_iteration: u32 = 0; + for (counter, url) in urls + .iter() + .cycle() + .skip(starting_position) + .take(urls.len() * DEFAULT_EXTERNAL_RPC_CONN_RETRIES) + .enumerate() + { + // If we reached the end of the urls list, backoff before retrying + // connections to the entire list once more. + let Ok(current_iteration) = (counter / urls.len()).try_into() else { + tracing::error!(target: LOG_TARGET, "Too many connection attempts to the RPC servers, aborting..."); + break; + }; + if current_iteration > prev_iteration { + // Safe conversion given we convert positive i32s which are lower than u64::MAX. + tokio::time::sleep(Duration::from_millis( + DEFAULT_SLEEP_TIME_MS_BETWEEN_RETRIES * + DEFAULT_SLEEP_EXP_BACKOFF_BETWEEN_RETRIES.pow(prev_iteration) as u64, + )) + .await; + prev_iteration = current_iteration; + } + let index = (starting_position + counter) % urls.len(); tracing::info!( target: LOG_TARGET, + current_iteration, index, url, "Trying to connect to next external relaychain node.", @@ -112,6 +144,7 @@ async fn connect_next_available_rpc_server( Err(err) => tracing::debug!(target: LOG_TARGET, url, ?err, "Unable to connect."), }; } + Err(()) } From 9024ac1e02f9b030122dda554787aab6710888e0 Mon Sep 17 00:00:00 2001 From: Iulian Barbu Date: Thu, 29 Aug 2024 16:55:43 +0300 Subject: [PATCH 2/8] added prdoc Signed-off-by: Iulian Barbu --- prdoc/pr_5515.prdoc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 prdoc/pr_5515.prdoc diff --git a/prdoc/pr_5515.prdoc b/prdoc/pr_5515.prdoc new file mode 100644 index 000000000000..60f43b922c7f --- /dev/null +++ b/prdoc/pr_5515.prdoc @@ -0,0 +1,15 @@ +# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0 +# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json + +title: Add retry logic in relay chain rpc interface + +doc: + - audience: [ Node Dev, Node Operator ] + description: | + Added a basic retry logic for collators connecting to external RPC servers. The collator + will try for 5 times to connect to each RPC server from the provided list. In between + each iteration will wait a duration which will increase exponentailly by a factor of two. + The maximum time a collator can spend in the retry logic is 1 + 2 + 4 + 8 + 16 = 31 seconds. +crates: + - name: cumulus-relay-chain-rpc-interface + bump: minor From 8e18eded24861138d81a437e253790e086b65499 Mon Sep 17 00:00:00 2001 From: Iulian Barbu Date: Thu, 29 Aug 2024 22:36:04 +0300 Subject: [PATCH 3/8] added retry logic test Signed-off-by: Iulian Barbu --- Cargo.lock | 1 + .../relay-chain-rpc-interface/Cargo.toml | 3 ++ .../src/reconnecting_ws_client.rs | 49 ++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index bb0f01542d31..dfacad89a3c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4429,6 +4429,7 @@ dependencies = [ "parity-scale-codec", "pin-project", "polkadot-overseer", + "portpicker", "rand", "sc-client-api", "sc-rpc-api", diff --git a/cumulus/client/relay-chain-rpc-interface/Cargo.toml b/cumulus/client/relay-chain-rpc-interface/Cargo.toml index 6c0730a56a26..c2deddc5341d 100644 --- a/cumulus/client/relay-chain-rpc-interface/Cargo.toml +++ b/cumulus/client/relay-chain-rpc-interface/Cargo.toml @@ -9,6 +9,9 @@ license = "GPL-3.0-or-later WITH Classpath-exception-2.0" [lints] workspace = true +[dev-dependencies] +portpicker = "0.1.1" + [dependencies] polkadot-overseer = { workspace = true, default-features = true } diff --git a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs index f030fc7dee4f..b36ce682b245 100644 --- a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs +++ b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs @@ -464,7 +464,10 @@ impl ReconnectingWebsocketWorker { #[cfg(test)] mod test { - use super::url_to_string_with_port; + use std::time::Duration; + + use super::{url_to_string_with_port, ClientManager}; + use jsonrpsee::Methods; use url::Url; #[test] @@ -493,4 +496,48 @@ mod test { url_to_string_with_port(url) ); } + + #[tokio::test] + // Testing the retry logic at full means increasing CI with half a minute according + // to the current logic, so lets test it best effort. + async fn client_manager_retry_logic() { + let port = portpicker::pick_unused_port().unwrap(); + let server = jsonrpsee::server::Server::builder() + .build(format!("0.0.0.0:{}", port)) + .await + .unwrap(); + + // Wait three seconds while attempting connection. + let conn_res = tokio::spawn(async move { + tokio::time::timeout( + Duration::from_secs(3), + ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]), + ) + .await + }); + + // Start the server too. + let server = tokio::spawn(async { + tokio::time::sleep(Duration::from_secs(10)).await; + server.start(Methods::default()) + }); + + // By this time the client can not make a connection because the server is not up. + assert!(conn_res.await.unwrap().is_err()); + + // Trying to connect again to the RPC with a client that stays around for sufficient + // time to catche the RPC server online and connect to it. + let conn_res = tokio::spawn(async move { + tokio::time::timeout( + Duration::from_secs(8), + ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]), + ) + .await + }); + let res = conn_res.await.unwrap(); + assert!(res.is_ok()); + assert!(res.unwrap().is_ok()); + + server.await.unwrap(); + } } From d47a96553e2206d2ff51170fb09920df90c004b4 Mon Sep 17 00:00:00 2001 From: Iulian Barbu Date: Fri, 30 Aug 2024 17:50:42 +0300 Subject: [PATCH 4/8] emit log for returning w/o any connection Signed-off-by: Iulian Barbu --- .../relay-chain-rpc-interface/src/reconnecting_ws_client.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs index b36ce682b245..fb276d447b42 100644 --- a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs +++ b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs @@ -145,6 +145,7 @@ async fn connect_next_available_rpc_server( }; } + tracing::error!(target: LOG_TARGET, "Retrying to connect to any external relaychain node failed."); Err(()) } From 71e44b47cc4514f659a3650b8125ebc7143f58c8 Mon Sep 17 00:00:00 2001 From: Iulian Barbu Date: Fri, 30 Aug 2024 18:03:54 +0300 Subject: [PATCH 5/8] documentet possibiliy for flakyness Signed-off-by: Iulian Barbu --- .../src/reconnecting_ws_client.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs index fb276d447b42..d8fd68a6915e 100644 --- a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs +++ b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs @@ -471,6 +471,8 @@ mod test { use jsonrpsee::Methods; use url::Url; + const RETRY_LOGIC_SERVER_STARTUP_MARGIN_SECONDS: u64 = 1; + #[test] fn url_to_string_works() { let url = Url::parse("wss://something/path").unwrap(); @@ -500,7 +502,10 @@ mod test { #[tokio::test] // Testing the retry logic at full means increasing CI with half a minute according - // to the current logic, so lets test it best effort. + // to the current logic, so lets test it best effort. The test can get flaky if ran + // on an overloaded machine because it depends on correct timing coordination and the + // margin is set to 1 second (the time after we check if a side effect was produced). + // Increase async fn client_manager_retry_logic() { let port = portpicker::pick_unused_port().unwrap(); let server = jsonrpsee::server::Server::builder() @@ -530,7 +535,7 @@ mod test { // time to catche the RPC server online and connect to it. let conn_res = tokio::spawn(async move { tokio::time::timeout( - Duration::from_secs(8), + Duration::from_secs(7 + RETRY_LOGIC_SERVER_STARTUP_MARGIN_SECONDS), ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]), ) .await From ee68d040653b801897af8e646c19508a8015511d Mon Sep 17 00:00:00 2001 From: Iulian Barbu Date: Fri, 30 Aug 2024 19:05:17 +0300 Subject: [PATCH 6/8] simplified test Signed-off-by: Iulian Barbu --- .../src/reconnecting_ws_client.rs | 38 ++++--------------- 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs index d8fd68a6915e..507479e3f073 100644 --- a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs +++ b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs @@ -471,7 +471,7 @@ mod test { use jsonrpsee::Methods; use url::Url; - const RETRY_LOGIC_SERVER_STARTUP_MARGIN_SECONDS: u64 = 1; + const SERVER_STARTUP_DELAY_SECONDS: u64 = 10; #[test] fn url_to_string_works() { @@ -502,10 +502,7 @@ mod test { #[tokio::test] // Testing the retry logic at full means increasing CI with half a minute according - // to the current logic, so lets test it best effort. The test can get flaky if ran - // on an overloaded machine because it depends on correct timing coordination and the - // margin is set to 1 second (the time after we check if a side effect was produced). - // Increase + // to the current logic, so lets test it best effort. async fn client_manager_retry_logic() { let port = portpicker::pick_unused_port().unwrap(); let server = jsonrpsee::server::Server::builder() @@ -513,36 +510,17 @@ mod test { .await .unwrap(); - // Wait three seconds while attempting connection. - let conn_res = tokio::spawn(async move { - tokio::time::timeout( - Duration::from_secs(3), - ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]), - ) - .await - }); - - // Start the server too. + // Start the server. let server = tokio::spawn(async { - tokio::time::sleep(Duration::from_secs(10)).await; + tokio::time::sleep(Duration::from_secs(SERVER_STARTUP_DELAY_SECONDS)).await; server.start(Methods::default()) }); - // By this time the client can not make a connection because the server is not up. - assert!(conn_res.await.unwrap().is_err()); - - // Trying to connect again to the RPC with a client that stays around for sufficient - // time to catche the RPC server online and connect to it. - let conn_res = tokio::spawn(async move { - tokio::time::timeout( - Duration::from_secs(7 + RETRY_LOGIC_SERVER_STARTUP_MARGIN_SECONDS), - ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]), - ) - .await - }); - let res = conn_res.await.unwrap(); + // Start the client. Not exitting right away with an error means it + // is handling gracefully connections refused to the server while it + // is started. + let res = ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]).await; assert!(res.is_ok()); - assert!(res.unwrap().is_ok()); server.await.unwrap(); } From 9694b5c195be79e0e41c11343e6ef773486c9cfc Mon Sep 17 00:00:00 2001 From: Iulian Barbu Date: Fri, 30 Aug 2024 19:09:50 +0300 Subject: [PATCH 7/8] reformulate a comment Signed-off-by: Iulian Barbu --- .../relay-chain-rpc-interface/src/reconnecting_ws_client.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs index 507479e3f073..f565fa2ff369 100644 --- a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs +++ b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs @@ -517,8 +517,8 @@ mod test { }); // Start the client. Not exitting right away with an error means it - // is handling gracefully connections refused to the server while it - // is started. + // is handling gracefully received connections refused while the server + // is starting. let res = ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]).await; assert!(res.is_ok()); From 4b0d5219500f1c6864298169c97460fb8e061c5c Mon Sep 17 00:00:00 2001 From: Iulian Barbu Date: Sat, 31 Aug 2024 23:36:00 +0300 Subject: [PATCH 8/8] rename current_iteration field to attempt Signed-off-by: Iulian Barbu --- .../relay-chain-rpc-interface/src/reconnecting_ws_client.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs index f565fa2ff369..dc0e9d697b46 100644 --- a/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs +++ b/cumulus/client/relay-chain-rpc-interface/src/reconnecting_ws_client.rs @@ -134,7 +134,7 @@ async fn connect_next_available_rpc_server( let index = (starting_position + counter) % urls.len(); tracing::info!( target: LOG_TARGET, - current_iteration, + attempt = current_iteration, index, url, "Trying to connect to next external relaychain node.",