rails · darinwilson · Feb 18, 2025 · Feb 20, 2025 · Apr 29, 2025
diff --git a/README.md b/README.md
@@ -336,6 +336,8 @@ If processes have no chance of cleaning up before exiting (e.g. if someone pulls
 
 In a similar way, if a worker is terminated in any other way not initiated by the above signals (e.g. a worker is sent a `KILL` signal), jobs in progress will be marked as failed so that they can be inspected, with a `SolidQueue::Processes::Process::ProcessExitError`. Sometimes a job in particular is responsible for this, for example, if it has a memory leak and you have a mechanism to kill processes over a certain memory threshold, so this will help identifying this kind of situation.
 
+In the unlikely event that the supervisor itself fails (e.g. because the database goes offline), Solid Queue can attempt to restart it. Restart attempts use an exponential backoff delay that is capped at 60 seconds, and you can configure how many attempts are made. Automatic restarts are disabled by default; see `max_restart_attempts` below.
+
 
 ### Database configuration
 
@@ -362,6 +364,7 @@ There are several settings that control how Solid Queue works that you can set a
 - `process_heartbeat_interval`: the heartbeat interval that all processes will follow—defaults to 60 seconds.
 - `process_alive_threshold`: how long to wait until a process is considered dead after its last heartbeat—defaults to 5 minutes.
 - `shutdown_timeout`: time the supervisor will wait since it sent the `TERM` signal to its supervised processes before sending a `QUIT` version to them requesting immediate termination—defaults to 5 seconds.
+- `max_restart_attempts`: the number of times Solid Queue will attempt to restart the supervisor if it fails. Set it to a non-negative integer, or to `nil` to keep retrying indefinitely—defaults to 0, which means Solid Queue won't try to recover.
 - `silence_polling`: whether to silence Active Record logs emitted when polling for both workers and dispatchers—defaults to `true`.
 - `supervisor_pidfile`: path to a pidfile that the supervisor will create when booting to prevent running more than one supervisor in the same host, or in case you want to use it for a health check. It's `nil` by default.
 - `preserve_finished_jobs`: whether to keep finished jobs in the `solid_queue_jobs` table—defaults to `true`.

diff --git a/lib/solid_queue.rb b/lib/solid_queue.rb
@@ -36,6 +36,7 @@ module SolidQueue
 
   mattr_accessor :supervisor_pidfile
   mattr_accessor :supervisor, default: false
+  # How many times to restart the supervisor after it fails: 0 disables restarts, nil retries forever.
+  mattr_accessor :max_restart_attempts, default: 0
 
   mattr_accessor :preserve_finished_jobs, default: true
   mattr_accessor :clear_finished_jobs_after, default: 1.day

diff --git a/lib/solid_queue/log_subscriber.rb b/lib/solid_queue/log_subscriber.rb
@@ -144,6 +144,14 @@ def unhandled_signal_error(event)
     error formatted_event(event, action: "Received unhandled signal", **event.payload.slice(:signal))
   end
 
+  def supervisor_restart(event)
+    info formatted_event(event, action: "Supervisor terminated unexpectedly: attempting restart in #{event.payload[:delay]}s", **event.payload.slice(:attempt))
+  end
+
+  def supervisor_restart_failure(event)
+    error formatted_event(event, action: "Supervisor restart attempts failed - exiting", error: formatted_error(event.payload[:error]))
+  end
+
   def replace_fork(event)
     supervisor_pid = event.payload[:supervisor_pid]
     status = event.payload[:status]

diff --git a/lib/solid_queue/supervisor.rb b/lib/solid_queue/supervisor.rb
@@ -13,7 +13,7 @@ def start(**options)
         configuration = Configuration.new(**options)
 
         if configuration.valid?
-          new(configuration).tap(&:start)
+          Launcher.new(configuration).tap(&:start)
         else
           abort configuration.errors.full_messages.join("\n") + "\nExiting..."
         end

diff --git a/lib/solid_queue/supervisor/launcher.rb b/lib/solid_queue/supervisor/launcher.rb
@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+module SolidQueue
+  class Supervisor::Launcher
+    MAX_RESTART_DELAY = 60
+
+    def initialize(configuration)
+      @configuration = configuration
+      @current_restart_attempt = 0
+    end
+
+    def start
+      SolidQueue.on_start { @current_restart_attempt = 0 } # reset after successful start
+
+      begin
+        SolidQueue::Supervisor.new(@configuration).tap(&:start)
+      rescue StandardError => error
+        if should_attempt_restart?
+          @current_restart_attempt += 1
+          delay = [ 2 ** @current_restart_attempt, MAX_RESTART_DELAY ].min
+
+          SolidQueue.instrument(:supervisor_restart, delay: delay, attempt: @current_restart_attempt)
+          sleep delay
+          retry
+        else
+          SolidQueue.instrument(:supervisor_restart_failure, error: error)
+          raise
+        end
+      end
+    end
+
+    private
+
+      def should_attempt_restart?
+        SolidQueue.max_restart_attempts.nil? || @current_restart_attempt < SolidQueue.max_restart_attempts
+      end
+  end
+end
diff --git a/test/unit/supervisor_test.rb b/test/unit/supervisor_test.rb
@@ -155,6 +155,22 @@ class SupervisorTest < ActiveSupport::TestCase
     end
   end
 
+  test "attempt to restart supervisor if it fails unexpectedly" do
+    SolidQueue.stubs(:max_restart_attempts).returns(2)
+    SolidQueue::Supervisor.any_instance.expects(:start).raises(StandardError).times(SolidQueue.max_restart_attempts + 1)
+    assert_raises StandardError do
+      SolidQueue::Supervisor.start
+    end
+  end
+
+  test "skip restart attempt if configured not to" do
+    SolidQueue.stubs(:max_restart_attempts).returns(0)
+    SolidQueue::Supervisor.any_instance.expects(:start).raises(StandardError).times(1)
+    assert_raises StandardError do
+      SolidQueue::Supervisor.start
+    end
+  end
+
   private
     def assert_registered_workers(supervisor_pid: nil, count: 1)
       assert_registered_processes(kind: "Worker", count: count, supervisor_pid: supervisor_pid)