6565//! interval defined by the `pending tick` as set in the chain config.
6666//! (TODO(mohit 13/10/2025): update this when 0.14.0 merges)
6767//!
68+ //! ## Graceful Shutdown and Error Handling
69+ //!
70+ //! The [`BlockProductionTask::run`] method implements graceful shutdown and error handling for
71+ //! batcher and executor tasks. The main loop tracks completion of both tasks, which only complete
72+ //! during shutdown scenarios (cancellation, error, or panic).
73+ //!
74+ //! ### Graceful Shutdown
75+ //!
76+ //! When a cancellation signal is received:
77+ //! 1. The batcher detects cancellation and exits gracefully, closing the `send_batch` channel
78+ //! 2. The executor detects the channel closure and finalizes any open block
79+ //! 3. The executor sends an `EndBlock` message and then completes
80+ //! 4. The main loop processes the `EndBlock`, closes the block, and exits when both tasks complete
81+ //!
82+ //! ### Batcher Panic/Error
83+ //!
84+ //! If the batcher encounters an error or panics:
85+ //! - **With preconfirmed block**: The error is saved and graceful shutdown is attempted. The batcher
86+ //! closes the channel, executor closes the block, and shutdown completes with the saved error.
87+ //! - **Without preconfirmed block**: The error is returned immediately (no need to wait for executor).
88+ //!
89+ //! ### Executor Panic
90+ //!
91+ //! If the executor thread panics:
92+ //! - The panic is caught and propagated via the `stop` channel
93+ //! - The main loop resumes the panic, causing the block to remain preconfirmed
94+ //! - The preconfirmed block will be handled on restart
95+ //!
96+ //! The loop exits when:
97+ //! - Both batcher and executor have completed → returns `Ok(())` or the saved batcher error
98+ //! - Batcher errored without a preconfirmed block → returns the error immediately
99+ //!
68100//! [mempool]: mc_mempool
69101//! [`StartNewBlock`]: ExecutorMessage::StartNewBlock
70102//! [`BatchExecuted`]: ExecutorMessage::BatchExecuted
@@ -742,7 +774,6 @@ impl BlockProductionTask {
742774 let batch_sender = executor. send_batch . take ( ) . context ( "Channel sender already taken" ) ?;
743775 let bypass_tx_input = self . bypass_tx_input . take ( ) . context ( "Bypass tx channel already taken" ) ?;
744776 // Clone ctx to check for cancellation in the main loop
745- let mut ctx_for_cancellation = ctx. clone ( ) ;
746777 let mut batcher_task = AbortOnDrop :: spawn (
747778 Batcher :: new (
748779 self . backend . clone ( ) ,
@@ -755,227 +786,63 @@ impl BlockProductionTask {
755786 . run ( ) ,
756787 ) ;
757788
758- // Graceful shutdown flow:
759- //
760- // ┌─────────────────────────────────────────────────────────────────────────────┐
761- // │ Graceful Shutdown Flow │
762- // └─────────────────────────────────────────────────────────────────────────────┘
763- //
764- // 1. Cancellation Signal (ctx.cancel_global())
765- // │
766- // ├─> Main loop detects cancellation → marks shutting_down = true
767- // │
768- // └─> Batcher detects cancellation → exits → closes send_batch channel
769- //
770- // 2. Batcher Task Completion
771- // │
772- // ├─> Normal completion (Ok):
773- // │ └─> batcher_completed = true → continue loop (wait for EndBlock)
774- // │
775- // └─> Error completion (Err):
776- // │
777- // ├─> If preconfirmed block exists:
778- // │ ├─> shutting_down = true
779- // │ ├─> batcher_completed = true
780- // │ └─> Continue loop (attempt graceful shutdown)
781- // │
782- // └─> If no preconfirmed block:
783- // └─> Return error immediately
784- //
785- // 3. Executor detects send_batch channel closure (WaitTxBatchOutcome::Exit)
786- // │
787- // ├─> Check: Is there an executing block?
788- // │ │
789- // │ ├─> YES: Finalize block → send EndBlock message → exit
790- // │ │ └─> Uses executor's existing state (no re-execution needed)
791- // │ │
792- // │ └─> NO: Just exit (nothing to close)
793- //
794- // 4. Main loop receives EndBlock message
795- // │
796- // ├─> process_reply(EndBlock) → closes block (DB responsibility)
797- // │
798- // └─> Check shutdown conditions:
799- // │
800- // ├─> shutting_down = true ✓
801- // ├─> is_end_block = true ✓
802- // └─> batcher_completed = true ✓
803- // │
804- // └─> If all true → shutdown complete ✅
805- // └─> If batcher_completed = false → continue loop (wait for batcher)
806- //
807- // 5. Race Condition Handling
808- // │
809- // └─> If EndBlock received before batcher completes:
810- // ├─> Block closes (normal operation or block time deadline)
811- // ├─> EndBlock processed but batcher_completed = false
812- // └─> Loop continues until batcher_completed = true
813- //
814- // 6. Executor Panic Handling
815- // │
816- // └─> If executor panics, the panic propagates naturally
817- // Block remains preconfirmed and will be handled on restart
818- //
819- // Key Benefits:
820- // - Simpler flow: Executor handles block closing automatically
821- // - Error handling: Batcher errors handled gracefully if block exists
822- // - Race condition: EndBlock before batcher completion handled correctly
823- // - No re-execution: Uses executor's existing state (unless executor panics)
824- // - No timeout: Madara's graceful shutdown timeout is sufficient
825- // - Panic handling: Attempts to close block if executor panics
826- //
827- // Note: CloseBlock command is still supported for explicit shutdown requests,
828- // but the Exit path (batcher shutdown) automatically closes blocks.
829-
830- // Track if we're shutting down to detect when EndBlock completes shutdown
831- let mut shutting_down = false ;
789+ // Track shutdown state: both batcher and executor must complete before shutdown finishes.
790+ // Both tasks only complete during shutdown scenarios (cancellation, error, or panic).
832791 let mut batcher_completed = false ;
792+ let mut executor_completed = false ;
793+ let mut batcher_error: Option < anyhow:: Error > = None ; // Store batcher error to return after graceful shutdown
833794
834795 // Main loop: handles normal operation and graceful shutdown
835796 loop {
836797 tokio:: select! {
837- // ============================================================
838- // Path 1: Cancellation detected
839- // ============================================================
840- // When cancellation is requested, we mark shutting down and wait for
841- // the executor to automatically close any open block when it detects
842- // the send_batch channel closure.
843- _ = ctx_for_cancellation. cancelled( ) , if !shutting_down => {
844- shutting_down = true ;
845- tracing:: debug!( "Cancellation detected, waiting for executor to close block" ) ;
846- // Continue loop to wait for EndBlock or batcher completion
847- }
848-
849- // ============================================================
850- // Path 2: Batcher task completed
851- // ============================================================
852- // The batcher task can complete in two ways:
853- //
854- // 1. Normal completion (Ok):
855- // - Cancellation detected → batcher exits gracefully
856- // - Closes send_batch channel → signals executor to shut down
857- // - Sets batcher_completed = true
858- // - If shutting_down, continue loop to wait for EndBlock
859- //
860- // 2. Error completion (Err):
861- // - Batcher encounters error (e.g., database error, network error)
862- // - Sets batcher_completed = true
863- // - If preconfirmed block exists:
864- // * Mark shutting_down = true
865- // * Attempt graceful shutdown (batcher dropping closes send_batch channel)
866- // * Continue loop to wait for EndBlock
867- // - If no preconfirmed block:
868- // * Propagate error immediately (no graceful shutdown needed)
869- //
870- // The executor will automatically close any open block when it detects
871- // the send_batch channel closure (if a block exists).
798+ // Path 1: Batcher task completed (cancellation, error, or channel closure)
872799 res = & mut batcher_task, if !batcher_completed => {
873800 batcher_completed = true ;
874-
875- // Handle batcher result
876801 match res {
877- Ok ( ( ) ) => {
878- // Normal completion (cancellation or natural completion)
879- if shutting_down {
880- // We're shutting down, continue waiting for EndBlock from executor
881- continue ;
882- }
883- // Normal batcher completion (not during shutdown)
884- // This shouldn't happen normally, but handle gracefully
885- }
802+ Ok ( ( ) ) => tracing:: debug!( "Batcher task completed normally" ) ,
886803 Err ( e) => {
887- // Batcher errored - attempt graceful shutdown if we have a preconfirmed block
888- tracing:: warn!( "Batcher task errored: {e:?}" ) ;
889-
890- // Check if we have a preconfirmed block that needs closing
804+ let error = e. context( "In batcher task" ) ;
805+ tracing:: warn!( "Batcher task errored: {error:?}" ) ;
891806 if self . backend. has_preconfirmed_block( ) {
892- shutting_down = true ;
807+ batcher_error = Some ( error ) ;
893808 tracing:: warn!( "Batcher errored with preconfirmed block, attempting graceful shutdown" ) ;
894- // Batcher task dropping will close send_batch channel automatically
895- // Executor will detect channel closure and send EndBlock
896- // Continue loop to wait for EndBlock
897- continue ;
898809 } else {
899- // No block to close, propagate error immediately
900- return Err ( e) . context( "In batcher task" ) ;
810+ batcher_error = Some ( error) ;
901811 }
902812 }
903813 }
904814 }
905815
906- // ============================================================
907- // Path 3: Executor replies (EndBlock message)
908- // ============================================================
909- // The executor thread sends messages back via the replies channel. When we
910- // receive an EndBlock message, it means the executor has finalized the block
911- // and we can close it using the execution summary.
912- //
913- // EndBlock can be received in two scenarios:
914- //
915- // 1. During graceful shutdown:
916- // - Executor detects send_batch channel closure
917- // - Executor finalizes block and sends EndBlock
918- // - Main loop processes EndBlock and checks shutdown conditions
919- //
920- // 2. Normal block closing (not shutdown):
921- // - Block closes due to block time deadline, block full, or explicit CloseBlock
922- // - Executor sends EndBlock as part of normal operation
923- // - Main loop processes EndBlock normally (shutting_down = false)
924- //
925- // Race Condition Handling:
926- // - If EndBlock received during shutdown but batcher_completed = false:
927- // * This can happen if block closes due to block time deadline before batcher
928- // detects cancellation
929- // * Continue loop to wait for batcher completion
930- // * Shutdown only completes when both EndBlock processed AND batcher_completed = true
816+ // Path 2: Executor replies (EndBlock message during normal operation or shutdown)
931817 Some ( reply) = executor. replies. recv( ) => {
932- let is_end_block = matches!( reply, ExecutorMessage :: EndBlock ( _) ) ;
933-
934- // Process the reply (this will close the block if it's EndBlock)
935818 self . process_reply( reply) . await . context( "Processing reply from executor thread" ) ?;
936-
937- // If we're shutting down and just processed EndBlock, check if shutdown is complete
938- if shutting_down && is_end_block {
939- // Ensure batcher has completed before finishing shutdown
940- // This handles the race condition where EndBlock is received before batcher
941- // detects cancellation (e.g., when block closes due to block time deadline)
942- if !batcher_completed {
943- tracing:: debug!( "EndBlock received during shutdown, waiting for batcher to complete" ) ;
944- continue ;
945- }
946- tracing:: debug!( "EndBlock processed during graceful shutdown, shutdown complete" ) ;
947- return Ok ( ( ) ) ;
948- }
949819 }
950820
951- // ============================================================
952- // Path 4: Executor thread stopped
953- // ============================================================
954- // The executor thread signals completion via the stop channel. This happens
955- // when the executor detects the send_batch channel closure and exits.
956- //
957- // With the new implementation, the executor should send EndBlock before exiting
958- // (if a block exists). If we reach here during shutdown, it means either:
959- // - No block existed (executor exited without sending EndBlock)
960- // - Executor panicked before sending EndBlock
961- //
962- // If executor panicked, the panic propagates naturally (handled by StopErrorReceiver).
963- // Block remains preconfirmed and will be handled on restart.
821+ // Path 3: Executor thread stopped (normal completion or panic)
964822 res = executor. stop. recv( ) => {
965- if shutting_down {
966- // Executor exited during shutdown
967- // If there was a block, executor should have sent EndBlock before exiting
968- // If we're here, either no block existed or executor panicked
969- // In case of panic, it will propagate naturally
970- tracing:: debug!( "Executor shut down during graceful shutdown" ) ;
971- return Ok ( ( ) ) ;
972- }
973- // Normal executor completion (not during shutdown)
974- // If executor panicked, recv() will resume the panic (handled by StopErrorReceiver)
823+ executor_completed = true ;
975824 res. context( "In executor thread" ) ?;
976- return Ok ( ( ) ) ;
825+ tracing :: debug! ( "Executor thread stopped" ) ;
977826 }
978827 }
828+
829+ // Exit conditions:
830+ // 1. Both completed → shutdown done (return error if any, otherwise success)
831+ // 2. Batcher errored without preconfirmed block → exit immediately with error
832+ if batcher_completed && executor_completed {
833+ tracing:: debug!( "Shutdown complete: batcher completed, executor completed" ) ;
834+ return batcher_error
835+ . map ( |e| {
836+ tracing:: warn!( "Shutdown completed but batcher had error: {e:?}" ) ;
837+ Err ( e)
838+ } )
839+ . unwrap_or ( Ok ( ( ) ) ) ;
840+ }
841+
842+ if batcher_error. is_some ( ) && !self . backend . has_preconfirmed_block ( ) {
843+ tracing:: debug!( "Batcher errored without preconfirmed block, exiting" ) ;
844+ return Err ( batcher_error. take ( ) . unwrap ( ) ) ;
845+ }
979846 }
980847 }
981848}
0 commit comments