cumulus/client: added external rpc connection retry logic #5515

Merged

Changes from 4 commits
1 change: 1 addition & 0 deletions Cargo.lock

3 changes: 3 additions & 0 deletions cumulus/client/relay-chain-rpc-interface/Cargo.toml
@@ -9,6 +9,9 @@ license = "GPL-3.0-or-later WITH Classpath-exception-2.0"
[lints]
workspace = true

[dev-dependencies]
portpicker = "0.1.1"

[dependencies]
polkadot-overseer = { workspace = true, default-features = true }

@@ -34,7 +34,7 @@ use jsonrpsee::{
use sc_rpc_api::chain::ChainApiClient;
use schnellru::{ByLength, LruMap};
use sp_runtime::generic::SignedBlock;
use std::sync::Arc;
use std::{sync::Arc, time::Duration};
use tokio::sync::mpsc::{
channel as tokio_channel, Receiver as TokioReceiver, Sender as TokioSender,
};
@@ -43,6 +43,9 @@ use url::Url;
use crate::rpc_client::{distribute_header, RpcDispatcherMessage};

const LOG_TARGET: &str = "reconnecting-websocket-client";
const DEFAULT_EXTERNAL_RPC_CONN_RETRIES: usize = 5;
const DEFAULT_SLEEP_TIME_MS_BETWEEN_RETRIES: u64 = 1000;
const DEFAULT_SLEEP_EXP_BACKOFF_BETWEEN_RETRIES: i32 = 2;

/// Worker that should be used in combination with [`RelayChainRpcClient`].
///
@@ -93,16 +96,45 @@ struct RelayChainSubscriptions {
best_subscription: Subscription<RelayHeader>,
}

/// Try to find a new RPC server to connect to.
/// Try to find a new RPC server to connect to. Uses a naive retry
/// logic that applies an exponential backoff between iterations
/// through all URLs in the list. A constant caps how many iterations
/// of connection attempts over all URLs we allow. We return early
/// as soon as a connection is made.
async fn connect_next_available_rpc_server(
urls: &Vec<String>,
starting_position: usize,
) -> Result<(usize, Arc<JsonRpcClient>), ()> {
tracing::debug!(target: LOG_TARGET, starting_position, "Connecting to RPC server.");
for (counter, url) in urls.iter().cycle().skip(starting_position).take(urls.len()).enumerate() {

let mut prev_iteration: u32 = 0;
for (counter, url) in urls
.iter()
.cycle()
.skip(starting_position)
.take(urls.len() * DEFAULT_EXTERNAL_RPC_CONN_RETRIES)
.enumerate()
{
// If we reached the end of the urls list, back off before retrying
// connections to the entire list once more.
let Ok(current_iteration) = (counter / urls.len()).try_into() else {
tracing::error!(target: LOG_TARGET, "Too many connection attempts to the RPC servers, aborting...");
michalkucharczyk (Contributor) commented on Aug 30, 2024:

Suggested change:
- tracing::error!(target: LOG_TARGET, "Too many connection attempts to the RPC servers, aborting...");
+ tracing::error!(target: LOG_TARGET, "Too many failed connection attempts to the RPC servers, aborting...");

Contributor:

hm, will this error ever be printed?

Contributor:

I think we need an extra check to print the error if the loop concluded without an actual connection.

Contributor Author:

> hm, will this error ever be printed?

In practice it shouldn't ever be printed, but if we do see the log then something is weird (either we're iterating too many times in the retry logic or memory gets corrupted at runtime).

> I think we need an extra check to print the error if the loop concluded without an actual connection.

You mean outside of the loop, if it doesn't conclude with a connection? I added a log here: d47a965. Let me know if that's what you mean.
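
A minimal sketch of what such a post-loop log could look like (hypothetical shape; the actual change landed in d47a965):

// Hypothetical sketch: the loop above fell through without establishing a
// connection, so log once before handing the error back to the caller.
tracing::error!(
    target: LOG_TARGET,
    "Failed to connect to any external relay chain RPC server after {} attempts.",
    urls.len() * DEFAULT_EXTERNAL_RPC_CONN_RETRIES
);
Err(())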

Contributor:

So when talking about too many attempts we are actually talking about ~2^64 too many? Does it make sense to impose such a limit? I.e., how much time would have to pass to actually hit it?

Contributor:

We should never hit this branch, since the loop runs for at most urls.len() * DEFAULT_EXTERNAL_RPC_CONN_RETRIES iterations.
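
For reference, a standalone sketch of the bound discussed above (not part of the PR; urls_len stands in for urls.len()):

// Sketch: the pass index (counter / urls_len) tops out at the retry constant
// minus one, so the conversion to u32 inside the loop can never fail.
fn main() {
    const DEFAULT_EXTERNAL_RPC_CONN_RETRIES: usize = 5;
    let urls_len: usize = 3; // example URL list length
    let max_counter = urls_len * DEFAULT_EXTERNAL_RPC_CONN_RETRIES - 1;
    let max_pass: u32 = (max_counter / urls_len).try_into().unwrap();
    assert_eq!(max_pass, (DEFAULT_EXTERNAL_RPC_CONN_RETRIES - 1) as u32);
}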

break;
};
if current_iteration > prev_iteration {
// Safe conversion given we convert positive i32s which are lower than u64::MAX.
tokio::time::sleep(Duration::from_millis(
DEFAULT_SLEEP_TIME_MS_BETWEEN_RETRIES *
DEFAULT_SLEEP_EXP_BACKOFF_BETWEEN_RETRIES.pow(prev_iteration) as u64,
))
.await;
prev_iteration = current_iteration;
}

let index = (starting_position + counter) % urls.len();
tracing::info!(
target: LOG_TARGET,
current_iteration,
Contributor:

Nit: From the user's perspective we should probably just print attempt here and summarize index and iteration.
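
One possible shape for the condensed log line suggested in this nit (a sketch, not what the PR does; attempt is a hypothetical field derived from the existing counter):

tracing::info!(
    target: LOG_TARGET,
    attempt = counter + 1,
    url,
    "Trying to connect to next external relaychain node.",
);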

index,
url,
"Trying to connect to next external relaychain node.",
@@ -112,6 +144,7 @@ async fn connect_next_available_rpc_server(
Err(err) => tracing::debug!(target: LOG_TARGET, url, ?err, "Unable to connect."),
};
}

Err(())
}

@@ -431,7 +464,10 @@ impl ReconnectingWebsocketWorker {

#[cfg(test)]
mod test {
use super::url_to_string_with_port;
use std::time::Duration;

use super::{url_to_string_with_port, ClientManager};
use jsonrpsee::Methods;
use url::Url;

#[test]
Expand Down Expand Up @@ -460,4 +496,48 @@ mod test {
url_to_string_with_port(url)
);
}

#[tokio::test]
// Testing the retry logic in full would add about half a minute to CI according
// to the current logic, so let's test it best effort.
async fn client_manager_retry_logic() {
let port = portpicker::pick_unused_port().unwrap();
let server = jsonrpsee::server::Server::builder()
.build(format!("0.0.0.0:{}", port))
.await
.unwrap();

// Wait three seconds while attempting connection.
let conn_res = tokio::spawn(async move {
tokio::time::timeout(
Duration::from_secs(3),
Contributor:

Using sleep in tests has a high potential to make them flaky. I don't understand why we wait here for 3 seconds, only to expect the obvious: there is no server up to connect to. What can happen is that the server might actually start if CI is overloaded, and this will fail.

For testing the logic here I think there are other ways to do it. I mean you just want to see it trying multiple servers in the list and increasing the backoff. This stuff is not actually tested here at all.

Contributor Author:

Haven't seen this. It came a bit earlier than this: 71e44b4. I agree with your points.

> I don't understand why we wait here for 3 seconds, only to expect the obvious: there is no server up to connect to.

The idea of the wait is that the ClientManager::new call doesn't exit immediately as it should have (because of the connection-refused error), but exercises the retry logic hoping the server would come up. The error we're asserting against is the Elapsed error, triggered because the ClientManager::new call timed out, which means the retry logic worked behind the scenes.

I think, though, we can simplify the test by starting the server in a tokio task with a 10-second delay (10 because it is not too big/small for our use case), and then creating a client on the "main" task which exercises the retry logic until the server comes up and returns an Ok result.

> I mean you just want to see it trying multiple servers in the list and increasing the backoff.

I think testing the backoff involves some other timers with timeouts, given the logic doesn't produce testable side effects, which can add more sources of flakiness. I am open to suggestions if you see it differently. As a matter of fact I wasn't interested in testing the backoff, but the fact that the collators' relay chain interface tries a few more times to connect to the external RPCs at collator startup. I heard there weren't complaints about this in practice, meaning the collators usually find available RPC nodes to connect to. This is more relevant to local testing on slow/overloaded machines, so testing anything other than a slight reliability improvement brings diminishing returns.
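
For reference, a minimal sketch of how backoff sleeps could be exercised deterministically with Tokio's paused clock (assumes the tokio test-util feature; not something this PR does):

#[tokio::test(start_paused = true)]
async fn backoff_schedule_with_paused_time() {
    // With the clock paused, `sleep` resolves as soon as the runtime is idle,
    // so an exponential backoff schedule can be asserted on without real delays.
    let start = tokio::time::Instant::now();
    for pass in 0..4u32 {
        tokio::time::sleep(std::time::Duration::from_millis(1_000u64 * 2u64.pow(pass))).await;
    }
    // 1s + 2s + 4s + 8s of virtual time elapsed, in milliseconds of real time.
    assert!(start.elapsed() >= std::time::Duration::from_secs(15));
}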

Contributor Author:

Pushed a simpler version of the test here: ee68d04. Let me know if this changes anything!

ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]),
)
.await
});

// Start the server too.
let server = tokio::spawn(async {
tokio::time::sleep(Duration::from_secs(10)).await;
server.start(Methods::default())
});

// By this time the client can not make a connection because the server is not up.
assert!(conn_res.await.unwrap().is_err());

// Trying to connect again to the RPC with a client that stays around for long
// enough to catch the RPC server coming online and connect to it.
iulianbarbu marked this conversation as resolved.
let conn_res = tokio::spawn(async move {
tokio::time::timeout(
Duration::from_secs(8),
michalkucharczyk (Contributor) commented on Aug 30, 2024:

CI tests are run on quite overloaded machines. 1s margin may not be enough, but let's see how it goes.

Contributor Author:

Pushed a simpler version of the test here: ee68d04.

ClientManager::new(vec![format!("ws://127.0.0.1:{}", port)]),
)
.await
});
let res = conn_res.await.unwrap();
assert!(res.is_ok());
assert!(res.unwrap().is_ok());

server.await.unwrap();
}
}
15 changes: 15 additions & 0 deletions prdoc/pr_5515.prdoc
@@ -0,0 +1,15 @@
# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0
# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json

title: Add retry logic in relay chain rpc interface

doc:
- audience: [ Node Dev, Node Operator ]
description: |
Added basic retry logic for collators connecting to external RPC servers. The collator
will try 5 times to connect to each RPC server from the provided list. In between
iterations it will wait for a duration which increases exponentially by a factor of two.
The maximum time a collator can spend in the retry logic is 1 + 2 + 4 + 8 + 16 = 31 seconds.
crates:
- name: cumulus-relay-chain-rpc-interface
bump: minor