Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ dependencies:
- docstring_parser
- get-annotations~=0.1
- python-graphviz~=0.19
- plumpy~=0.25.0
- ipython>=7
- jedi<0.19
- jinja2~=3.0
- kiwipy[rmq]~=0.8.4
- importlib-metadata~=6.0
- numpy~=1.21
- paramiko~=3.0
- plumpy~=0.24.0
- pgsu~=0.3.0
- psutil~=5.6
- psycopg[binary]~=3.0
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ dependencies = [
'docstring-parser',
'get-annotations~=0.1;python_version<"3.10"',
'graphviz~=0.19',
'plumpy~=0.25.0',
'ipython>=7',
'jedi<0.19',
'jinja2~=3.0',
'kiwipy[rmq]~=0.8.4',
'importlib-metadata~=6.0',
'numpy~=1.21',
'paramiko~=3.0',
'plumpy~=0.24.0',
'pgsu~=0.3.0',
'psutil~=5.6',
'psycopg[binary]~=3.0',
Expand Down
28 changes: 25 additions & 3 deletions src/aiida/cmdline/commands/cmd_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from aiida.cmdline.commands.cmd_verdi import verdi
from aiida.cmdline.params import arguments, options, types
from aiida.cmdline.params.options.overridable import OverridableOption
from aiida.cmdline.utils import decorators, echo
from aiida.common.log import LOG_LEVELS, capture_logging

Expand Down Expand Up @@ -318,10 +319,25 @@ def process_status(call_link_label, most_recent_node, max_depth, processes):
@verdi_process.command('kill')
@arguments.PROCESSES()
@options.ALL(help='Kill all processes if no specific processes are specified.')
@options.TIMEOUT()
@OverridableOption(
'-t',
'--timeout',
type=click.FLOAT,
default=5.0,
show_default=True,
help='Time in seconds to wait for a response of the kill task before timing out.',
)()
@options.WAIT()
@OverridableOption(
'-F',
'--force-kill',
is_flag=True,
default=False,
help='Kills the process without waiting for a confirmation if the job has been killed.\n'
'Note: This may lead to orphaned jobs on your HPC and should be used with caution.',
)()
@decorators.with_dbenv()
def process_kill(processes, all_entries, timeout, wait):
def process_kill(processes, all_entries, timeout, wait, force_kill):
"""Kill running processes.

Kill one or multiple running processes."""
Expand All @@ -338,11 +354,17 @@ def process_kill(processes, all_entries, timeout, wait):
if all_entries:
click.confirm('Are you sure you want to kill all processes?', abort=True)

if force_kill:
echo.echo_warning('Force kill is enabled. This may lead to orphaned jobs on your HPC.')
msg_text = 'Force killed through `verdi process kill`'
else:
msg_text = 'Killed through `verdi process kill`'
with capture_logging() as stream:
try:
control.kill_processes(
processes,
msg_text='Killed through `verdi process kill`',
msg_text=msg_text,
force_kill=force_kill,
all_entries=all_entries,
timeout=timeout,
wait=wait,
Expand Down
7 changes: 6 additions & 1 deletion src/aiida/engine/daemon/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,10 @@ def get_worker_info(self, timeout: int | None = None) -> dict[str, t.Any]:
command = {'command': 'stats', 'properties': {'name': self.daemon_name}}
return self.call_client(command, timeout=timeout)

def get_number_of_workers(self, timeout: int | None = None) -> int:
"""Get number of workers."""
return len(self.get_worker_info(timeout).get('info', []))

def get_daemon_info(self, timeout: int | None = None) -> dict[str, t.Any]:
"""Get statistics about this daemon itself.

Expand Down Expand Up @@ -531,7 +535,8 @@ def start_daemon(
try:
subprocess.check_output(command, env=env, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as exception:
raise DaemonException('The daemon failed to start.') from exception
# CalledProcessError does not include the subprocess stderr in its message, so we add it to the DaemonException
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice 👍

raise DaemonException(f'The daemon failed to start with error:\n{exception.stdout.decode()}') from exception

if not wait:
return
Expand Down
7 changes: 5 additions & 2 deletions src/aiida/engine/daemon/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import logging
import signal
import sys
from typing import Union

from aiida.common.log import configure_logging
from aiida.engine.daemon.client import get_daemon_client
Expand All @@ -32,18 +33,20 @@ async def shutdown_worker(runner: Runner) -> None:
task.cancel()

await asyncio.gather(*tasks, return_exceptions=True)

runner.close()

LOGGER.info('Daemon worker stopped')


def start_daemon_worker(foreground: bool = False) -> None:
def start_daemon_worker(foreground: bool = False, profile_name: Union[str, None] = None) -> None:
"""Start a daemon worker for the currently configured profile.

:param foreground: If true, the logging will be configured to write to stdout, otherwise it will be configured to
write to the daemon log file.
"""
daemon_client = get_daemon_client()

daemon_client = get_daemon_client(profile_name)
configure_logging(with_orm=True, daemon=not foreground, daemon_log_file=daemon_client.daemon_log_file)

LOGGER.debug(f'sys.executable: {sys.executable}')
Expand Down
17 changes: 9 additions & 8 deletions src/aiida/engine/processes/calcjobs/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
from aiida.common.datastructures import CalcJobState
from aiida.common.exceptions import FeatureNotAvailable, TransportTaskException
from aiida.common.folders import SandboxFolder
from aiida.engine import utils
from aiida.engine.daemon import execmanager
from aiida.engine.processes.exit_code import ExitCode
from aiida.engine.transports import TransportQueue
from aiida.engine.utils import InterruptableFuture, exponential_backoff_retry, interruptable_task
from aiida.engine.utils import InterruptableFuture, interruptable_task
from aiida.manage.configuration import get_config_option
from aiida.orm.nodes.process.calculation.calcjob import CalcJobNode
from aiida.schedulers.datastructures import JobState
Expand Down Expand Up @@ -102,7 +103,7 @@
try:
logger.info(f'scheduled request to upload CalcJob<{node.pk}>')
ignore_exceptions = (plumpy.futures.CancelledError, PreSubmitException, plumpy.process_states.Interruption)
skip_submit = await exponential_backoff_retry(
skip_submit = await utils.exponential_backoff_retry(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these changes are important to be able to monkeypatch, otherwise one imports a local copy of the function that cannot be monkeypatched

do_upload, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
)
except PreSubmitException:
Expand Down Expand Up @@ -150,7 +151,7 @@
try:
logger.info(f'scheduled request to submit CalcJob<{node.pk}>')
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
result = await exponential_backoff_retry(
result = await utils.exponential_backoff_retry(
do_submit, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
)
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):
Expand Down Expand Up @@ -208,7 +209,7 @@
try:
logger.info(f'scheduled request to update CalcJob<{node.pk}>')
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
job_done = await exponential_backoff_retry(
job_done = await utils.exponential_backoff_retry(
do_update, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
)
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):
Expand Down Expand Up @@ -258,7 +259,7 @@
try:
logger.info(f'scheduled request to monitor CalcJob<{node.pk}>')
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
monitor_result = await exponential_backoff_retry(
monitor_result = await utils.exponential_backoff_retry(
do_monitor, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
)
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):
Expand Down Expand Up @@ -326,7 +327,7 @@
try:
logger.info(f'scheduled request to retrieve CalcJob<{node.pk}>')
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
result = await exponential_backoff_retry(
result = await utils.exponential_backoff_retry(
do_retrieve, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
)
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):
Expand Down Expand Up @@ -371,7 +372,7 @@
return await execmanager.stash_calculation(node, transport)

try:
await exponential_backoff_retry(
await utils.exponential_backoff_retry(

Check warning on line 375 in src/aiida/engine/processes/calcjobs/tasks.py

View check run for this annotation

Codecov / codecov/patch

src/aiida/engine/processes/calcjobs/tasks.py#L375

Added line #L375 was not covered by tests
do_stash,
initial_interval,
max_attempts,
Expand Down Expand Up @@ -419,7 +420,7 @@

try:
logger.info(f'scheduled request to kill CalcJob<{node.pk}>')
result = await exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger)
result = await utils.exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger)
except plumpy.process_states.Interruption:
raise
except Exception as exception:
Expand Down
19 changes: 11 additions & 8 deletions src/aiida/engine/processes/control.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@
processes: list[ProcessNode] | None = None,
*,
msg_text: str = 'Killed through `aiida.engine.processes.control.kill_processes`',
force_kill: bool = False,
all_entries: bool = False,
timeout: float = 5.0,
wait: bool = False,
Expand Down Expand Up @@ -201,7 +202,7 @@
return

controller = get_manager().get_process_controller()
action = functools.partial(controller.kill_process, msg_text=msg_text)
action = functools.partial(controller.kill_process, msg_text=msg_text, force_kill=force_kill)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion, this is one of the changes that needs to be tested. The CLI tests are calling this and should not test against using daemon, which crosses too many boundaries.

In essence, if I understand correctly to make verdi process kill --force work, it need:

  • --force can propagate to the kill_processes to make the force_kill set to True
  • kill_processes function should send the message through RMQ with the message resolved with force as an attribute of the function.
  • Then it is plumpy part (override by aiida-core/engine/processes/process.py::Process::kill, this I said the evil of inheritance) need to make sure it calls kill properly, so it is out the scope of the tests covered in aiida-core.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be now tested.

_perform_actions(processes, action, 'kill', 'killing', timeout, wait)


Expand Down Expand Up @@ -276,15 +277,17 @@
LOGGER.error(f'got unexpected response when {present} Process<{process.pk}>: {result}')

try:
for future in concurrent.futures.as_completed(futures.keys(), timeout=timeout):
process = futures[future]

for future, process in futures.items():
# unwrap is needed here since LoopCommunicator will also wrap a future
unwrapped = unwrap_kiwi_future(future)
try:
# unwrap is needed here since LoopCommunicator will also wrap a future
unwrapped = unwrap_kiwi_future(future)
result = unwrapped.result()
result = unwrapped.result(timeout=timeout)
except communications.TimeoutError:
LOGGER.error(f'call to {infinitive} Process<{process.pk}> timed out')
cancelled = unwrapped.cancel()
if cancelled:
LOGGER.error(f'call to {infinitive} Process<{process.pk}> timed out and was cancelled.')
else:
LOGGER.error(f'call to {infinitive} Process<{process.pk}> timed out but could not be cancelled.')

Check warning on line 290 in src/aiida/engine/processes/control.py

View check run for this annotation

Codecov / codecov/patch

src/aiida/engine/processes/control.py#L290

Added line #L290 was not covered by tests
except Exception as exception:
LOGGER.error(f'failed to {infinitive} Process<{process.pk}>: {exception}')
else:
Expand Down
4 changes: 2 additions & 2 deletions src/aiida/engine/processes/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def load_instance_state(

self.node.logger.info(f'Loaded process<{self.node.pk}> from saved state')

def kill(self, msg_text: str | None = None) -> Union[bool, plumpy.futures.Future]:
def kill(self, msg_text: str | None = None, force_kill: bool = False) -> Union[bool, plumpy.futures.Future]:
"""Kill the process and all the children calculations it called

:param msg: message
Expand All @@ -338,7 +338,7 @@ def kill(self, msg_text: str | None = None) -> Union[bool, plumpy.futures.Future

had_been_terminated = self.has_terminated()

result = super().kill(msg_text)
result = super().kill(msg_text, force_kill)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the other thing I think require a test, but from the change you made, it should be the plumpy's responsibility to make sure it kill properly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, you already did the change aiidateam/plumpy#320, I think there is the place require more test to make sure the behavior of kill works as expected.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have now several tests. Do you think something is still missing?


# Only kill children if we could be killed ourselves
if result is not False and not had_been_terminated:
Expand Down
1 change: 1 addition & 0 deletions src/aiida/manage/tests/pytest_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,7 @@
f'Daemon <{started_daemon_client.profile.name}|{daemon_status}> log file content: \n'
f'{daemon_log_file}'
)
time.sleep(0.1)

Check warning on line 763 in src/aiida/manage/tests/pytest_fixtures.py

View check run for this annotation

Codecov / codecov/patch

src/aiida/manage/tests/pytest_fixtures.py#L763

Added line #L763 was not covered by tests
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to not keep cpu spinning


return node

Expand Down
1 change: 1 addition & 0 deletions src/aiida/tools/pytest_fixtures/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def factory(
f'Daemon <{started_daemon_client.profile.name}|{daemon_status}> log file content: \n'
f'{daemon_log_file}'
)
time.sleep(0.1)

return node

Expand Down
Loading
Loading