Commit 8cdadad

Regular kill reschedules the cancelation of the scheduler job
PR aiidateam#6793 introduced the cancelation of earlier kill actions. This caused a problem when two kill commands are sent in sequence: the second kill action cancels the first one, which had triggered the cancelation of the scheduler job within the EBM, but the second kill does not retrigger that cancelation. The bug appeared because the killing logic lives in two places; PR aiidateam#6868 fixes this properly by refactoring the kill action, while this PR only serves as a fast temporary fix with workarounds. Before this PR, once the kill command had failed through the EBM, the scheduler job could no longer be cancelled by a kill. Since we now have a force-kill option to bypass the EBM, a regular kill can reschedule the cancelation of the scheduler job and still terminate the process gracefully.
1 parent 8ad7953 commit 8cdadad
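
To make the race concrete, below is a minimal, self-contained asyncio sketch of the rescheduling pattern this commit adopts. All names are hypothetical stand-ins, not AiiDA code: each new kill request cancels the in-flight cancelation task and schedules a fresh one, so a second kill retriggers the scheduler-job cancelation instead of silently dropping it.

import asyncio


class FakeProcess:
    """Hypothetical stand-in for a process holding a cancelation task."""

    def __init__(self):
        self._cancelling_scheduler_job: asyncio.Task | None = None

    async def _cancel_scheduler_job(self):
        # Stand-in for the EBM-wrapped scheduler-job cancelation; may hang
        # indefinitely when the transport is unreachable.
        await asyncio.sleep(3600)

    def kill(self):
        if self._cancelling_scheduler_job is not None:
            # Cancel the previous attempt ...
            self._cancelling_scheduler_job.cancel()
        # ... and immediately reschedule it: the behaviour this commit restores.
        self._cancelling_scheduler_job = asyncio.create_task(self._cancel_scheduler_job())


async def main():
    process = FakeProcess()
    process.kill()  # first kill schedules the cancelation
    process.kill()  # second kill cancels it but also reschedules it
    await asyncio.sleep(0)  # let the rescheduled task start
    assert not process._cancelling_scheduler_job.cancelled()
    process._cancelling_scheduler_job.cancel()  # clean up before the loop closes


asyncio.run(main())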

File tree

src/aiida/engine/processes/calcjobs/tasks.py
src/aiida/engine/processes/process.py
src/aiida/engine/utils.py
tests/cmdline/commands/test_process.py

4 files changed: +130 −8 lines


src/aiida/engine/processes/calcjobs/tasks.py

Lines changed: 0 additions & 1 deletion

@@ -582,7 +582,6 @@ async def execute(self) -> plumpy.process_states.State:  # type: ignore[override]
         except TransportTaskException as exception:
             raise plumpy.process_states.PauseInterruption(f'Pausing after failed transport task: {exception}')
         except plumpy.process_states.KillInterruption as exception:
-            await self._kill_job(node, transport_queue)
             node.set_process_status(str(exception))
             return self.retrieve(monitor_result=self._monitor_result)
         except (plumpy.futures.CancelledError, asyncio.CancelledError):

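With this one-line deletion, the `KillInterruption` handler no longer cancels the scheduler job from inside the transport task itself; that responsibility moves into `Process.kill` (next file), which is what lets a repeated kill reschedule the cancelation.
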
src/aiida/engine/processes/process.py

Lines changed: 55 additions & 1 deletion

@@ -51,7 +51,9 @@
 from aiida.common.lang import classproperty, override
 from aiida.common.links import LinkType
 from aiida.common.log import LOG_LEVEL_REPORT
+from aiida.engine.utils import InterruptableFuture
 from aiida.orm.implementation.utils import clean_value
+from aiida.orm.nodes.process.calculation.calcjob import CalcJobNode
 from aiida.orm.utils import serialize

 from .builder import ProcessBuilder
@@ -72,6 +74,7 @@ class Process(PlumpyProcess):
     have full provenance saved in the database.
     """

+    _cancelling_scheduler_job: asyncio.Task | None = None
     _node_class = orm.ProcessNode
     _spec_class = ProcessSpec

@@ -336,10 +339,43 @@ def kill(self, msg_text: str | None = None, force_kill: bool = False) -> Union[bool, plumpy.futures.Future]:
         """
         self.node.logger.info(f'Request to kill Process<{self.node.pk}>')

-        had_been_terminated = self.has_terminated()
+        # PR_COMMENT Because we need to overwrite the logic of the cancelation of the self._killing task of the
+        #            scheduler job, we have to copy this logic of the parent class in plumpy and adapt the
+        #            cancelation of the last sent kill action to also resend the kill/cancelation of the
+        #            scheduler job, since we stop that cancelation by canceling the last kill action.
+        if self.killed():
+            # Already killed
+            return True
+
+        if self.has_terminated():
+            # Can't kill
+            return False
+
+        # Cancel scheduler job
+        if not force_kill and isinstance(self.node, CalcJobNode):
+            if self._killing:
+                self._killing.cancel()
+
+            # PR_COMMENT: We cannot reuse _killing because of type issues, it is a CancellableAction.
+            #             We can wrap a task around a CancellableAction, but the CancellableAction silently catches
+            #             any error, while here we need to know if the cancelation of the scheduler job failed.
+            if self._cancelling_scheduler_job:
+                self._cancelling_scheduler_job.cancel()
+
+            from .calcjobs.tasks import task_kill_job
+
+            coro = self._launch_task(task_kill_job, self.node, self.runner.transport)
+            self._cancelling_scheduler_job = asyncio.create_task(coro)
+            try:
+                self.loop.run_until_complete(self._cancelling_scheduler_job)
+            except Exception as exc:
+                self.node.logger.error(f'While cancelling job error was raised: {exc!s}')
+                return False

         result = super().kill(msg_text, force_kill)

+        had_been_terminated = self.has_terminated()
+
         # Only kill children if we could be killed ourselves
         if result is not False and not had_been_terminated:
             killing = []
@@ -374,6 +410,24 @@ def done(done_future: plumpy.futures.Future):

         return result

+    # PR_COMMENT This is a copy of the function in engine/processes/calcjobs/tasks.py
+    #            and will be merged into one place in PR #6868
+    async def _launch_task(self, coro, *args, **kwargs):
+        """Launch a coroutine as a task, making sure to make it interruptable."""
+        import functools
+
+        from aiida.engine.utils import interruptable_task
+
+        self._task: Union[InterruptableFuture, None]
+
+        task_fn = functools.partial(coro, *args, **kwargs)
+        try:
+            self._task = interruptable_task(task_fn)
+            result = await self._task
+            return result
+        finally:
+            self._task = None
+
     @override
     def out(self, output_port: str, value: Any = None) -> None:
         """Attach output to output port.

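The PR_COMMENT above notes that a `CancellableAction` silently catches errors, while the kill logic needs to know whether the scheduler-job cancelation failed. The sketch below illustrates that distinction with hypothetical stand-ins (this is not plumpy's actual `CancellableAction`): an error-swallowing wrapper hides the failure, whereas awaiting a bare `asyncio.Task` re-raises it so `kill` can return `False`.

import asyncio


class SwallowingAction:
    """Hypothetical wrapper that, like a CancellableAction, catches errors silently."""

    async def run(self, coro):
        try:
            return await coro
        except Exception:
            return None  # the failure is invisible to the caller


async def cancel_scheduler_job():
    raise RuntimeError('transport failed')


async def main():
    # With the swallowing wrapper the caller cannot react to the failure:
    assert await SwallowingAction().run(cancel_scheduler_job()) is None

    # With a bare task the failure propagates, so the caller can return False:
    task = asyncio.create_task(cancel_scheduler_job())
    try:
        await task
    except RuntimeError as exc:
        print(f'While cancelling job error was raised: {exc}')


asyncio.run(main())
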
src/aiida/engine/utils.py

Lines changed: 1 addition & 0 deletions

@@ -193,6 +193,7 @@ async def exponential_backoff_retry(
     :param ignore_exceptions: exceptions to ignore, i.e. when caught do nothing and simply re-raise
     :return: result if the ``coro`` call completes within ``max_attempts`` retries without raising
     """
+
     if logger is None:
         logger = LOGGER

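For context, the EBM referenced throughout this commit is the exponential backoff mechanism implemented by `exponential_backoff_retry` in this file. The sketch below shows the retry semantics in rough form; the function name, defaults, and log messages are illustrative assumptions, not the actual implementation.

import asyncio
import logging


async def backoff_retry(fn, initial_interval=0.1, max_attempts=5, logger=None):
    """Await ``fn`` until it succeeds, doubling the wait after each failure."""
    logger = logger or logging.getLogger(__name__)
    interval = initial_interval
    for attempt in range(max_attempts):
        try:
            return await fn()
        except Exception:
            if attempt == max_attempts - 1:
                logger.error('%s failed %d times consecutively', fn.__name__, max_attempts)
                raise
            logger.warning('attempt %d failed, retrying in %.1f seconds', attempt + 1, interval)
            await asyncio.sleep(interval)
            interval *= 2


async def flaky():
    flaky.calls = getattr(flaky, 'calls', 0) + 1
    if flaky.calls < 3:
        raise IOError('transport down')
    return 'ok'


print(asyncio.run(backoff_retry(flaky)))  # succeeds on the third attempt
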
tests/cmdline/commands/test_process.py

Lines changed: 74 additions & 6 deletions

@@ -25,6 +25,7 @@
 from aiida.common.log import LOG_LEVEL_REPORT
 from aiida.engine import Process, ProcessState
 from aiida.engine.processes import control as process_control
+from aiida.engine.utils import exponential_backoff_retry
 from aiida.orm import CalcJobNode, Group, WorkChainNode, WorkflowNode, WorkFunctionNode
 from tests.utils.processes import WaitProcess

@@ -53,6 +54,7 @@ def start_daemon_worker_in_foreground_and_redirect_streams(
     try:
         pid = os.getpid()
+        # For easier debugging you can change these to stdout
         sys.stdout = open(log_dir / f'worker-{pid}.out', 'w')
         sys.stderr = open(log_dir / f'worker-{pid}.err', 'w')
         start_daemon_worker(False, aiida_profile_name)
@@ -72,10 +74,22 @@ def mock_open(_):
         raise Exception('Mock open exception')

     @staticmethod
-    async def mock_exponential_backoff_retry(*_, **__):
+    async def exponential_backoff_retry_fail_upload(fct: t.Callable[..., t.Any], *args, **kwargs):
         from aiida.common.exceptions import TransportTaskException

-        raise TransportTaskException
+        if 'do_upload' in fct.__name__:
+            raise TransportTaskException
+        else:
+            return await exponential_backoff_retry(fct, *args, **kwargs)
+
+    @staticmethod
+    async def exponential_backoff_retry_fail_kill(fct: t.Callable[..., t.Any], *args, **kwargs):
+        from aiida.common.exceptions import TransportTaskException
+
+        if 'do_kill' in fct.__name__:
+            raise TransportTaskException
+        else:
+            return await exponential_backoff_retry(fct, *args, **kwargs)


 @pytest.fixture(scope='function')
@@ -213,11 +227,12 @@ def make_a_builder(sleep_seconds=0):

 @pytest.mark.requires_rmq
 @pytest.mark.usefixtures('started_daemon_client')
-def test_process_kill_failng_ebm(
+def test_process_kill_failing_ebm_upload(
     fork_worker_context, submit_and_await, aiida_code_installed, run_cli_command, monkeypatch
 ):
-    """9) Kill a process that is paused after EBM (5 times failed). It should be possible to kill it normally.
-    # (e.g. in scenarios that transport is working again)
+    """Kill a process that is waiting after a failed EBM during upload. It should be possible to kill it normally.
+
+    This covers a process whose upload failed and that is then killed once the transport is working again.
     """
     from aiida.orm import Int

@@ -232,7 +247,10 @@ def make_a_builder(sleep_seconds=0):

     kill_timeout = 10

-    monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', MockFunctions.mock_exponential_backoff_retry)
+    monkeypatch_args = (
+        'aiida.engine.utils.exponential_backoff_retry',
+        MockFunctions.exponential_backoff_retry_fail_upload,
+    )
     with fork_worker_context(monkeypatch.setattr, monkeypatch_args):
         node = submit_and_await(make_a_builder(), ProcessState.WAITING)
         await_condition(
@@ -241,10 +259,60 @@ def make_a_builder(sleep_seconds=0):
             timeout=kill_timeout,
         )

+        # kill should start the EBM and then succeed in killing the process
         run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
         await_condition(lambda: node.is_killed, timeout=kill_timeout)


+@pytest.mark.requires_rmq
+@pytest.mark.usefixtures('started_daemon_client')
+def test_process_kill_failing_ebm_kill(
+    fork_worker_context, submit_and_await, aiida_code_installed, run_cli_command, monkeypatch
+):
+    """Kill a process with a failing EBM during the kill.
+
+    Killing a process tries to gracefully cancel the job on the remote node. If there are connection problems, it
+    retries using the EBM. If this fails, another kill command can be sent to restart the cancelation of the
+    scheduler job.
+    """
+    from aiida.orm import Int
+
+    code = aiida_code_installed(default_calc_job_plugin='core.arithmetic.add', filepath_executable='/bin/bash')
+
+    def make_a_builder(sleep_seconds=0):
+        builder = code.get_builder()
+        builder.x = Int(1)
+        builder.y = Int(1)
+        builder.metadata.options.sleep = sleep_seconds
+        return builder
+
+    kill_timeout = 10
+
+    monkeypatch_args = (
+        'aiida.engine.utils.exponential_backoff_retry',
+        MockFunctions.exponential_backoff_retry_fail_kill,
+    )
+    # from aiida.engine.utils import exponential_backoff_retry
+    # monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', exponential_backoff_retry)
+    with fork_worker_context(monkeypatch.setattr, monkeypatch_args):
+        node = submit_and_await(make_a_builder(kill_timeout + 10), ProcessState.WAITING, timeout=kill_timeout)
+        await_condition(
+            lambda: node.process_status == 'Monitoring scheduler: job state RUNNING',
+            timeout=kill_timeout,
+        )
+
+        # kill should start the EBM and not succeed within it
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
+        await_condition(lambda: not node.is_killed, timeout=kill_timeout)
+
+        # a second kill should restart the EBM and again not succeed within it
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
+        await_condition(lambda: not node.is_killed, timeout=kill_timeout)
+
+        # force kill should skip the EBM and successfully kill the process
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '-F', '--wait'])
+        await_condition(lambda: node.is_killed, timeout=kill_timeout)
+
+
 class TestVerdiProcess:
     """Tests for `verdi process`."""

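The tests above poll with `await_condition`; as a rough, hypothetical stand-in for that utility (not the actual implementation), the pattern is simply:

import time


def await_condition(condition, timeout=10, interval=0.1):
    """Poll ``condition`` until it returns truthy or ``timeout`` seconds pass."""
    start = time.monotonic()
    while not condition():
        if time.monotonic() - start > timeout:
            raise TimeoutError('condition not reached within timeout')
        time.sleep(interval)
    return True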