Commit 1322497

adamruzicka authored and ezr-ondrej committed

Orchestrator crash recovery (#340)
* Run only one sidekiq thread for the orchestrator

  The orchestrator jobs only take messages from redis and put them into the orchestrator's mailbox. By reducing the number of threads to 1, we rule out the possibility of shuffling the order of incoming messages.

* Make the orchestrator discard items submitted by another orchestrator

  When the orchestrator boots up, it puts a DrainMarker job onto the default queue, puts itself into recovery mode and starts processing messages. While in recovery mode, it discards any incoming WorkerDone jobs, so anything that was in flight while there was no orchestrator gets processed only by the workers. Even in recovery, the orchestrator can still acquire completely new jobs; recovery only affects already running ones.

  When a worker receives the DrainMarker job, it replies to the orchestrator with a StartupComplete job. Upon receiving the StartupComplete job, the orchestrator performs world invalidation and tries to resume all execution plans which were being managed by the previous orchestrator. Once world invalidation is done, the orchestrator leaves recovery. This round trip is done to ensure we get into as consistent a state as possible: after the DrainMarker-StartupComplete exchange, there will be at most n inconsistent execution plans, where n is the number of worker threads. This is a heuristic rather than a complete solution, and it starts to break down when more queues are added.

  Upon receiving a WorkerDone from a worker with a world_id different from the current orchestrator's, the orchestrator tries to resume the execution plan, unless it already holds its execution lock.
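The boot-and-drain protocol described above can be sketched as a small state machine. This is a minimal illustration with made-up stand-in classes (`Orchestrator`, `WorkerDone`, `StartupComplete` here are simplified models, not the actual Dynflow API):

```ruby
# Simplified stand-ins for the messages exchanged during recovery.
WorkerDone = Struct.new(:sender_orchestrator_id, :payload)
StartupComplete = Struct.new(:world_id)

# Hypothetical model of the orchestrator's recovery-mode message handling:
# boot into recovery, discard WorkerDone messages until the DrainMarker
# round trip completes, then process normally.
class Orchestrator
  attr_reader :processed, :discarded

  def initialize(world_id)
    @world_id = world_id
    @recovery = true # boot directly into recovery mode
    @processed = []
    @discarded = []
  end

  def handle(message)
    case message
    when StartupComplete
      # The DrainMarker round trip has completed; leave recovery
      @recovery = false if message.world_id == @world_id
    when WorkerDone
      if @recovery
        @discarded << message # stale work from before the restart
      else
        @processed << message
      end
    end
  end
end
```

In this toy model, a `WorkerDone` arriving before `StartupComplete` is dropped, while the same message arriving afterwards is processed normally, mirroring the queue-draining heuristic described above.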
1 parent 6cdd2e7 commit 1322497

File tree

9 files changed: +83 -16 lines changed

docker-compose.yml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ services:
   orchestrator:
     <<: *common
     command: |
-      sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator
+      sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
   worker:
     <<: *common
     command: |

examples/remote_executor.rb

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ def initialize_sidekiq_orchestrator
     config.connector = connector
     config.executor = ::Dynflow::Executors::Sidekiq::Core
     config.process_role = :orchestrator
+    config.auto_validity_check = false
   end
 end

lib/dynflow/director.rb

Lines changed: 16 additions & 12 deletions
@@ -21,11 +21,12 @@ class Director
   UnprocessableEvent = Class.new(Dynflow::Error)

   class WorkItem < Serializable
-    attr_reader :execution_plan_id, :queue
+    attr_reader :execution_plan_id, :queue, :sender_orchestrator_id

-    def initialize(execution_plan_id, queue)
+    def initialize(execution_plan_id, queue, sender_orchestrator_id)
       @execution_plan_id = execution_plan_id
       @queue = queue
+      @sender_orchestrator_id = sender_orchestrator_id
     end

     def world
@@ -46,7 +47,8 @@ def execute
     def to_hash
       { class: self.class.name,
         execution_plan_id: execution_plan_id,
-        queue: queue }
+        queue: queue,
+        sender_orchestrator_id: sender_orchestrator_id }
     end

     def self.new_from_hash(hash, *_args)
@@ -57,8 +59,8 @@ def self.new_from_hash(hash, *_args)
   class StepWorkItem < WorkItem
     attr_reader :step

-    def initialize(execution_plan_id, step, queue)
-      super(execution_plan_id, queue)
+    def initialize(execution_plan_id, step, queue, sender_orchestrator_id)
+      super(execution_plan_id, queue, sender_orchestrator_id)
       @step = step
     end

@@ -73,15 +75,16 @@ def to_hash
     def self.new_from_hash(hash, *_args)
       self.new(hash[:execution_plan_id],
                Serializable.from_hash(hash[:step], hash[:execution_plan_id], Dynflow.process_world),
-               hash[:queue])
+               hash[:queue],
+               hash[:sender_orchestrator_id])
     end
   end

   class EventWorkItem < StepWorkItem
     attr_reader :event, :request_id

-    def initialize(request_id, execution_plan_id, step, event, queue)
-      super(execution_plan_id, step, queue)
+    def initialize(request_id, execution_plan_id, step, event, queue, sender_orchestrator_id)
+      super(execution_plan_id, step, queue, sender_orchestrator_id)
       @event = event
       @request_id = request_id
     end
@@ -99,16 +102,17 @@ def self.new_from_hash(hash, *_args)
              hash[:execution_plan_id],
              Serializable.from_hash(hash[:step], hash[:execution_plan_id], Dynflow.process_world),
              Dynflow.serializer.load(hash[:event]),
-             hash[:queue])
+             hash[:queue],
+             hash[:sender_orchestrator_id])
     end
   end

   class FinalizeWorkItem < WorkItem
     attr_reader :finalize_steps_data

     # @param finalize_steps_data - used to pass the result steps from the worker back to orchestrator
-    def initialize(execution_plan_id, queue, finalize_steps_data = nil)
-      super(execution_plan_id, queue)
+    def initialize(execution_plan_id, queue, sender_orchestrator_id, finalize_steps_data = nil)
+      super(execution_plan_id, queue, sender_orchestrator_id)
       @finalize_steps_data = finalize_steps_data
     end

@@ -124,7 +128,7 @@ def to_hash
     end

     def self.new_from_hash(hash, *_args)
-      self.new(hash[:execution_plan_id], hash[:queue], hash[:finalize_steps_data])
+      self.new(*hash.values_at(:execution_plan_id, :queue, :sender_orchestrator_id, :finalize_steps_data))
     end
   end
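The key point of these changes is that the sender's world id travels with every work item through redis and back. A stripped-down sketch of the `to_hash` / `new_from_hash` round trip (a hypothetical `SimpleWorkItem`, omitting steps, events and Dynflow's `Serializable` machinery):

```ruby
# Hypothetical, simplified work item showing how sender_orchestrator_id
# survives serialization to a hash and back, so the receiving orchestrator
# can tell whether it was the one that dispatched the work.
class SimpleWorkItem
  attr_reader :execution_plan_id, :queue, :sender_orchestrator_id

  def initialize(execution_plan_id, queue, sender_orchestrator_id)
    @execution_plan_id = execution_plan_id
    @queue = queue
    @sender_orchestrator_id = sender_orchestrator_id
  end

  # Serialize to a plain hash, as done before pushing through redis
  def to_hash
    { class: self.class.name,
      execution_plan_id: execution_plan_id,
      queue: queue,
      sender_orchestrator_id: sender_orchestrator_id }
  end

  # Rebuild the work item on the other side of the queue
  def self.new_from_hash(hash)
    new(*hash.values_at(:execution_plan_id, :queue, :sender_orchestrator_id))
  end
end
```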

lib/dynflow/director/execution_plan_manager.rb

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ def restart
   end

   def prepare_next_step(step)
-    StepWorkItem.new(execution_plan.id, step, step.queue).tap do |work|
+    StepWorkItem.new(execution_plan.id, step, step.queue, @world.id).tap do |work|
       @running_steps_manager.add(step, work)
     end
   end
@@ -112,7 +112,7 @@ def start_finalize
     return if execution_plan.finalize_flow.empty?
     raise 'finalize phase already started' if @finalize_manager
     @finalize_manager = SequentialManager.new(@world, execution_plan)
-    [FinalizeWorkItem.new(execution_plan.id, execution_plan.finalize_steps.first.queue)]
+    [FinalizeWorkItem.new(execution_plan.id, execution_plan.finalize_steps.first.queue, @world.id)]
   end

   def finish

lib/dynflow/director/running_steps_manager.rb

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ def event(event)
   def create_next_event_work_item(step)
     event = @events.shift(step.id)
     return unless event
-    work = EventWorkItem.new(event.request_id, event.execution_plan_id, step, event.event, step.queue)
+    work = EventWorkItem.new(event.request_id, event.execution_plan_id, step, event.event, step.queue, @world.id)
     @work_items.push(step.id, work)
     work
   end

lib/dynflow/executors/sidekiq/core.rb

Lines changed: 38 additions & 0 deletions
@@ -21,6 +21,7 @@ def initialize(world, *_args)
   wait_for_orchestrator_lock
   super
   schedule_update_telemetry
+  begin_startup!
 end

 def heartbeat
@@ -56,6 +57,29 @@ def update_telemetry
   schedule_update_telemetry
 end

+def work_finished(work)
+  # If the work item was sent in reply to a request from the current orchestrator, proceed
+  if work.sender_orchestrator_id == @world.id
+    super
+  else
+    # If we're in recovery, we can drop the work: the execution plan will be resumed
+    # during the validity checks performed when leaving recovery.
+    # If we're not in recovery and receive an event from another orchestrator,
+    # it means the event survived the queue draining.
+    handle_unknown_work_item(work) unless @recovery
+  end
+end
+
+def begin_startup!
+  WorkerJobs::DrainMarker.perform_async(@world.id)
+  @recovery = true
+end
+
+def startup_complete
+  logger.info('Performing validity checks')
+  @world.perform_validity_checks
+  logger.info('Finished performing validity checks')
+  @recovery = false
+end
+
 private

 def fallback_queue
@@ -69,6 +93,20 @@ def schedule_update_telemetry
 def telemetry_options(queue)
   { queue: queue.to_s, world: @world.id }
 end
+
+# Check whether an execution lock is already held by an orchestrator (it should be
+# the current one). If no lock is held, try to resume the execution plan if possible.
+def handle_unknown_work_item(work)
+  # We are past recovery now; if we receive an event here, the execution plan is
+  # most likely paused. We can either try to rescue it or leave it stopped.
+  execution_lock = @world.coordinator.find_locks(class: Coordinator::ExecutionLock.name,
+                                                 id: "execution-plan:#{work.execution_plan_id}").first
+  if execution_lock.nil?
+    plan = @world.persistence.load_execution_plan(work.execution_plan_id)
+    should_resume = !plan.error? || plan.prepare_for_rescue == :running
+    @world.execute(plan.id) if should_resume
+  end
+end
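The resume decision in `handle_unknown_work_item` can be condensed into a small predicate. This is a hypothetical restatement (the keyword arguments here are made up for illustration): a plan is resumed only when no orchestrator holds its execution lock, and the plan either has no error or can be rescued back into a running state.

```ruby
# Hypothetical condensation of the handle_unknown_work_item logic above.
# lock_held:     true if an orchestrator already holds the execution lock
# plan_in_error: result of plan.error?
# rescue_result: result of plan.prepare_for_rescue (only consulted on error)
def should_resume?(lock_held:, plan_in_error:, rescue_result: nil)
  return false if lock_held
  !plan_in_error || rescue_result == :running
end
```

For example, a plan in error whose rescue attempt yields `:paused` is left alone, while an error-free unlocked plan is resumed.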

lib/dynflow/executors/sidekiq/orchestrator_jobs.rb

Lines changed: 14 additions & 0 deletions
@@ -35,6 +35,20 @@ def perform(error, work_item)
     Dynflow.process_world.executor.core.tell([:handle_persistence_error, error, work_item])
   end
 end
+
+class StartupComplete < InternalJobBase
+  sidekiq_options queue: :dynflow_orchestrator
+
+  # @param world_id - id of the world whose startup has completed
+  def perform(world_id)
+    if Dynflow.process_world.id == world_id
+      Dynflow.process_world.executor.core.tell([:startup_complete])
+    else
+      logger.warn("Received startup complete for a different world #{world_id}, discarding.")
+    end
+  end
+end

lib/dynflow/executors/sidekiq/worker_jobs.rb

Lines changed: 6 additions & 0 deletions
@@ -30,6 +30,12 @@ def with_telemetry(work_item)
     end
   end
 end
+
+class DrainMarker < InternalJobBase
+  def perform(world_id)
+    OrchestratorJobs::StartupComplete.perform_async(world_id)
+  end
+end

test/executor_test.rb

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,10 @@ module ExecutorTest
   ::Dynflow.instance_variable_set('@process_world', nil)
 end

+before do
+  executor.any_instance.stubs(:begin_startup!)
+end
+
 let(:world) do
   WorldFactory.create_world do |c|
     c.executor = executor
