Skip to content

Commit c783ab8

Browse files
committed
fix: block production graceful shutdown simplified
1 parent bb54d6d commit c783ab8

File tree

1 file changed

+64
-197
lines changed
  • madara/crates/client/block_production/src

1 file changed

+64
-197
lines changed

madara/crates/client/block_production/src/lib.rs

Lines changed: 64 additions & 197 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,38 @@
6565
//! interval defined by the `pending tick` as set in the chain config.
6666
//! (TODO(mohit 13/10/2025): update this when 0.14.0 merges)
6767
//!
68+
//! ## Graceful Shutdown and Error Handling
69+
//!
70+
//! The [`BlockProductionTask::run`] method implements graceful shutdown and error handling for
71+
//! batcher and executor tasks. The main loop tracks completion of both tasks, which only complete
72+
//! during shutdown scenarios (cancellation, error, or panic).
73+
//!
74+
//! ### Graceful Shutdown
75+
//!
76+
//! When a cancellation signal is received:
77+
//! 1. The batcher detects cancellation and exits gracefully, closing the `send_batch` channel
78+
//! 2. The executor detects the channel closure and finalizes any open block
79+
//! 3. The executor sends an `EndBlock` message and then completes
80+
//! 4. The main loop processes the `EndBlock`, closes the block, and exits when both tasks complete
81+
//!
82+
//! ### Batcher Panic/Error
83+
//!
84+
//! If the batcher encounters an error or panics:
85+
//! - **With preconfirmed block**: The error is saved and graceful shutdown is attempted. The batcher
86+
//! closes the channel, executor closes the block, and shutdown completes with the saved error.
87+
//! - **Without preconfirmed block**: The error is returned immediately (no need to wait for executor).
88+
//!
89+
//! ### Executor Panic
90+
//!
91+
//! If the executor thread panics:
92+
//! - The panic is caught and propagated via the `stop` channel
93+
//! - The main loop resumes the panic, causing the block to remain preconfirmed
94+
//! - The preconfirmed block will be handled on restart
95+
//!
96+
//! The loop exits when:
97+
//! - Both batcher and executor have completed → returns `Ok(())` or the saved batcher error
98+
//! - Batcher errored without a preconfirmed block → returns the error immediately
99+
//!
68100
//! [mempool]: mc_mempool
69101
//! [`StartNewBlock`]: ExecutorMessage::StartNewBlock
70102
//! [`BatchExecuted`]: ExecutorMessage::BatchExecuted
@@ -742,7 +774,6 @@ impl BlockProductionTask {
742774
let batch_sender = executor.send_batch.take().context("Channel sender already taken")?;
743775
let bypass_tx_input = self.bypass_tx_input.take().context("Bypass tx channel already taken")?;
744776
// Clone ctx to check for cancellation in the main loop
745-
let mut ctx_for_cancellation = ctx.clone();
746777
let mut batcher_task = AbortOnDrop::spawn(
747778
Batcher::new(
748779
self.backend.clone(),
@@ -755,227 +786,63 @@ impl BlockProductionTask {
755786
.run(),
756787
);
757788

758-
// Graceful shutdown flow:
759-
//
760-
// ┌─────────────────────────────────────────────────────────────────────────────┐
761-
// │ Graceful Shutdown Flow │
762-
// └─────────────────────────────────────────────────────────────────────────────┘
763-
//
764-
// 1. Cancellation Signal (ctx.cancel_global())
765-
// │
766-
// ├─> Main loop detects cancellation → marks shutting_down = true
767-
// │
768-
// └─> Batcher detects cancellation → exits → closes send_batch channel
769-
//
770-
// 2. Batcher Task Completion
771-
// │
772-
// ├─> Normal completion (Ok):
773-
// │ └─> batcher_completed = true → continue loop (wait for EndBlock)
774-
// │
775-
// └─> Error completion (Err):
776-
// │
777-
// ├─> If preconfirmed block exists:
778-
// │ ├─> shutting_down = true
779-
// │ ├─> batcher_completed = true
780-
// │ └─> Continue loop (attempt graceful shutdown)
781-
// │
782-
// └─> If no preconfirmed block:
783-
// └─> Return error immediately
784-
//
785-
// 3. Executor detects send_batch channel closure (WaitTxBatchOutcome::Exit)
786-
// │
787-
// ├─> Check: Is there an executing block?
788-
// │ │
789-
// │ ├─> YES: Finalize block → send EndBlock message → exit
790-
// │ │ └─> Uses executor's existing state (no re-execution needed)
791-
// │ │
792-
// │ └─> NO: Just exit (nothing to close)
793-
//
794-
// 4. Main loop receives EndBlock message
795-
// │
796-
// ├─> process_reply(EndBlock) → closes block (DB responsibility)
797-
// │
798-
// └─> Check shutdown conditions:
799-
// │
800-
// ├─> shutting_down = true ✓
801-
// ├─> is_end_block = true ✓
802-
// └─> batcher_completed = true ✓
803-
// │
804-
// └─> If all true → shutdown complete ✅
805-
// └─> If batcher_completed = false → continue loop (wait for batcher)
806-
//
807-
// 5. Race Condition Handling
808-
// │
809-
// └─> If EndBlock received before batcher completes:
810-
// ├─> Block closes (normal operation or block time deadline)
811-
// ├─> EndBlock processed but batcher_completed = false
812-
// └─> Loop continues until batcher_completed = true
813-
//
814-
// 6. Executor Panic Handling
815-
// │
816-
// └─> If executor panics, the panic propagates naturally
817-
// Block remains preconfirmed and will be handled on restart
818-
//
819-
// Key Benefits:
820-
// - Simpler flow: Executor handles block closing automatically
821-
// - Error handling: Batcher errors handled gracefully if block exists
822-
// - Race condition: EndBlock before batcher completion handled correctly
823-
// - No re-execution: Uses executor's existing state (unless executor panics)
824-
// - No timeout: Madara's graceful shutdown timeout is sufficient
825-
// - Panic handling: Attempts to close block if executor panics
826-
//
827-
// Note: CloseBlock command is still supported for explicit shutdown requests,
828-
// but the Exit path (batcher shutdown) automatically closes blocks.
829-
830-
// Track if we're shutting down to detect when EndBlock completes shutdown
831-
let mut shutting_down = false;
789+
// Track shutdown state: both batcher and executor must complete before shutdown finishes.
790+
// Both tasks only complete during shutdown scenarios (cancellation, error, or panic).
832791
let mut batcher_completed = false;
792+
let mut executor_completed = false;
793+
let mut batcher_error: Option<anyhow::Error> = None; // Store batcher error to return after graceful shutdown
833794

834795
// Main loop: handles normal operation and graceful shutdown
835796
loop {
836797
tokio::select! {
837-
// ============================================================
838-
// Path 1: Cancellation detected
839-
// ============================================================
840-
// When cancellation is requested, we mark shutting down and wait for
841-
// the executor to automatically close any open block when it detects
842-
// the send_batch channel closure.
843-
_ = ctx_for_cancellation.cancelled(), if !shutting_down => {
844-
shutting_down = true;
845-
tracing::debug!("Cancellation detected, waiting for executor to close block");
846-
// Continue loop to wait for EndBlock or batcher completion
847-
}
848-
849-
// ============================================================
850-
// Path 2: Batcher task completed
851-
// ============================================================
852-
// The batcher task can complete in two ways:
853-
//
854-
// 1. Normal completion (Ok):
855-
// - Cancellation detected → batcher exits gracefully
856-
// - Closes send_batch channel → signals executor to shut down
857-
// - Sets batcher_completed = true
858-
// - If shutting_down, continue loop to wait for EndBlock
859-
//
860-
// 2. Error completion (Err):
861-
// - Batcher encounters error (e.g., database error, network error)
862-
// - Sets batcher_completed = true
863-
// - If preconfirmed block exists:
864-
// * Mark shutting_down = true
865-
// * Attempt graceful shutdown (batcher dropping closes send_batch channel)
866-
// * Continue loop to wait for EndBlock
867-
// - If no preconfirmed block:
868-
// * Propagate error immediately (no graceful shutdown needed)
869-
//
870-
// The executor will automatically close any open block when it detects
871-
// the send_batch channel closure (if a block exists).
798+
// Path 1: Batcher task completed (cancellation, error, or channel closure)
872799
res = &mut batcher_task, if !batcher_completed => {
873800
batcher_completed = true;
874-
875-
// Handle batcher result
876801
match res {
877-
Ok(()) => {
878-
// Normal completion (cancellation or natural completion)
879-
if shutting_down {
880-
// We're shutting down, continue waiting for EndBlock from executor
881-
continue;
882-
}
883-
// Normal batcher completion (not during shutdown)
884-
// This shouldn't happen normally, but handle gracefully
885-
}
802+
Ok(()) => tracing::debug!("Batcher task completed normally"),
886803
Err(e) => {
887-
// Batcher errored - attempt graceful shutdown if we have a preconfirmed block
888-
tracing::warn!("Batcher task errored: {e:?}");
889-
890-
// Check if we have a preconfirmed block that needs closing
804+
let error = e.context("In batcher task");
805+
tracing::warn!("Batcher task errored: {error:?}");
891806
if self.backend.has_preconfirmed_block() {
892-
shutting_down = true;
807+
batcher_error = Some(error);
893808
tracing::warn!("Batcher errored with preconfirmed block, attempting graceful shutdown");
894-
// Batcher task dropping will close send_batch channel automatically
895-
// Executor will detect channel closure and send EndBlock
896-
// Continue loop to wait for EndBlock
897-
continue;
898809
} else {
899-
// No block to close, propagate error immediately
900-
return Err(e).context("In batcher task");
810+
batcher_error = Some(error);
901811
}
902812
}
903813
}
904814
}
905815

906-
// ============================================================
907-
// Path 3: Executor replies (EndBlock message)
908-
// ============================================================
909-
// The executor thread sends messages back via the replies channel. When we
910-
// receive an EndBlock message, it means the executor has finalized the block
911-
// and we can close it using the execution summary.
912-
//
913-
// EndBlock can be received in two scenarios:
914-
//
915-
// 1. During graceful shutdown:
916-
// - Executor detects send_batch channel closure
917-
// - Executor finalizes block and sends EndBlock
918-
// - Main loop processes EndBlock and checks shutdown conditions
919-
//
920-
// 2. Normal block closing (not shutdown):
921-
// - Block closes due to block time deadline, block full, or explicit CloseBlock
922-
// - Executor sends EndBlock as part of normal operation
923-
// - Main loop processes EndBlock normally (shutting_down = false)
924-
//
925-
// Race Condition Handling:
926-
// - If EndBlock received during shutdown but batcher_completed = false:
927-
// * This can happen if block closes due to block time deadline before batcher
928-
// detects cancellation
929-
// * Continue loop to wait for batcher completion
930-
// * Shutdown only completes when both EndBlock processed AND batcher_completed = true
816+
// Path 2: Executor replies (EndBlock message during normal operation or shutdown)
931817
Some(reply) = executor.replies.recv() => {
932-
let is_end_block = matches!(reply, ExecutorMessage::EndBlock(_));
933-
934-
// Process the reply (this will close the block if it's EndBlock)
935818
self.process_reply(reply).await.context("Processing reply from executor thread")?;
936-
937-
// If we're shutting down and just processed EndBlock, check if shutdown is complete
938-
if shutting_down && is_end_block {
939-
// Ensure batcher has completed before finishing shutdown
940-
// This handles the race condition where EndBlock is received before batcher
941-
// detects cancellation (e.g., when block closes due to block time deadline)
942-
if !batcher_completed {
943-
tracing::debug!("EndBlock received during shutdown, waiting for batcher to complete");
944-
continue;
945-
}
946-
tracing::debug!("EndBlock processed during graceful shutdown, shutdown complete");
947-
return Ok(());
948-
}
949819
}
950820

951-
// ============================================================
952-
// Path 4: Executor thread stopped
953-
// ============================================================
954-
// The executor thread signals completion via the stop channel. This happens
955-
// when the executor detects the send_batch channel closure and exits.
956-
//
957-
// With the new implementation, the executor should send EndBlock before exiting
958-
// (if a block exists). If we reach here during shutdown, it means either:
959-
// - No block existed (executor exited without sending EndBlock)
960-
// - Executor panicked before sending EndBlock
961-
//
962-
// If executor panicked, the panic propagates naturally (handled by StopErrorReceiver).
963-
// Block remains preconfirmed and will be handled on restart.
821+
// Path 3: Executor thread stopped (normal completion or panic)
964822
res = executor.stop.recv() => {
965-
if shutting_down {
966-
// Executor exited during shutdown
967-
// If there was a block, executor should have sent EndBlock before exiting
968-
// If we're here, either no block existed or executor panicked
969-
// In case of panic, it will propagate naturally
970-
tracing::debug!("Executor shut down during graceful shutdown");
971-
return Ok(());
972-
}
973-
// Normal executor completion (not during shutdown)
974-
// If executor panicked, recv() will resume the panic (handled by StopErrorReceiver)
823+
executor_completed = true;
975824
res.context("In executor thread")?;
976-
return Ok(());
825+
tracing::debug!("Executor thread stopped");
977826
}
978827
}
828+
829+
// Exit conditions:
830+
// 1. Both completed → shutdown done (return error if any, otherwise success)
831+
// 2. Batcher errored without preconfirmed block → exit immediately with error
832+
if batcher_completed && executor_completed {
833+
tracing::debug!("Shutdown complete: batcher completed, executor completed");
834+
return batcher_error
835+
.map(|e| {
836+
tracing::warn!("Shutdown completed but batcher had error: {e:?}");
837+
Err(e)
838+
})
839+
.unwrap_or(Ok(()));
840+
}
841+
842+
if batcher_error.is_some() && !self.backend.has_preconfirmed_block() {
843+
tracing::debug!("Batcher errored without preconfirmed block, exiting");
844+
return Err(batcher_error.take().unwrap());
845+
}
979846
}
980847
}
981848
}

0 commit comments

Comments
 (0)