-
Notifications
You must be signed in to change notification settings - Fork 231
Implement force-kill option in verdi process kill #6793
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4c6d52d
8d96493
d13e7a1
692adcf
831419a
dc2cb77
56ce11f
fd169a7
a26e782
f5c4be5
83e4d84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,10 +24,11 @@ | |
from aiida.common.datastructures import CalcJobState | ||
from aiida.common.exceptions import FeatureNotAvailable, TransportTaskException | ||
from aiida.common.folders import SandboxFolder | ||
from aiida.engine import utils | ||
from aiida.engine.daemon import execmanager | ||
from aiida.engine.processes.exit_code import ExitCode | ||
from aiida.engine.transports import TransportQueue | ||
from aiida.engine.utils import InterruptableFuture, exponential_backoff_retry, interruptable_task | ||
from aiida.engine.utils import InterruptableFuture, interruptable_task | ||
from aiida.manage.configuration import get_config_option | ||
from aiida.orm.nodes.process.calculation.calcjob import CalcJobNode | ||
from aiida.schedulers.datastructures import JobState | ||
|
@@ -102,7 +103,7 @@ | |
try: | ||
logger.info(f'scheduled request to upload CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, PreSubmitException, plumpy.process_states.Interruption) | ||
skip_submit = await exponential_backoff_retry( | ||
skip_submit = await utils.exponential_backoff_retry( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these changes are important to be able to monkeypatch, otherwise one imports a local copy of the function that cannot be monkeypatched |
||
do_upload, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
) | ||
except PreSubmitException: | ||
|
@@ -150,7 +151,7 @@ | |
try: | ||
logger.info(f'scheduled request to submit CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
result = await exponential_backoff_retry( | ||
result = await utils.exponential_backoff_retry( | ||
do_submit, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
) | ||
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption): | ||
|
@@ -208,7 +209,7 @@ | |
try: | ||
logger.info(f'scheduled request to update CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
job_done = await exponential_backoff_retry( | ||
job_done = await utils.exponential_backoff_retry( | ||
do_update, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
) | ||
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption): | ||
|
@@ -258,7 +259,7 @@ | |
try: | ||
logger.info(f'scheduled request to monitor CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
monitor_result = await exponential_backoff_retry( | ||
monitor_result = await utils.exponential_backoff_retry( | ||
do_monitor, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
) | ||
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption): | ||
|
@@ -326,7 +327,7 @@ | |
try: | ||
logger.info(f'scheduled request to retrieve CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
result = await exponential_backoff_retry( | ||
result = await utils.exponential_backoff_retry( | ||
do_retrieve, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
) | ||
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption): | ||
|
@@ -371,7 +372,7 @@ | |
return await execmanager.stash_calculation(node, transport) | ||
|
||
try: | ||
await exponential_backoff_retry( | ||
await utils.exponential_backoff_retry( | ||
do_stash, | ||
initial_interval, | ||
max_attempts, | ||
|
@@ -419,7 +420,7 @@ | |
|
||
try: | ||
logger.info(f'scheduled request to kill CalcJob<{node.pk}>') | ||
result = await exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger) | ||
result = await utils.exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger) | ||
except plumpy.process_states.Interruption: | ||
raise | ||
except Exception as exception: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -173,6 +173,7 @@ | |
processes: list[ProcessNode] | None = None, | ||
*, | ||
msg_text: str = 'Killed through `aiida.engine.processes.control.kill_processes`', | ||
force_kill: bool = False, | ||
all_entries: bool = False, | ||
timeout: float = 5.0, | ||
wait: bool = False, | ||
|
@@ -201,7 +202,7 @@ | |
return | ||
|
||
controller = get_manager().get_process_controller() | ||
action = functools.partial(controller.kill_process, msg_text=msg_text) | ||
action = functools.partial(controller.kill_process, msg_text=msg_text, force_kill=force_kill) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In my opinion, this is one of the changes need to be tested. The CLI tests are calling this and should not test against using daemon, which cross too many boundaries. In essence, if I understand correctly to make
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be now tested. |
||
_perform_actions(processes, action, 'kill', 'killing', timeout, wait) | ||
|
||
|
||
|
@@ -276,15 +277,17 @@ | |
LOGGER.error(f'got unexpected response when {present} Process<{process.pk}>: {result}') | ||
|
||
try: | ||
for future in concurrent.futures.as_completed(futures.keys(), timeout=timeout): | ||
process = futures[future] | ||
|
||
for future, process in futures.items(): | ||
# unwrap is need here since LoopCommunicator will also wrap a future | ||
unwrapped = unwrap_kiwi_future(future) | ||
try: | ||
agoscinski marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# unwrap is need here since LoopCommunicator will also wrap a future | ||
unwrapped = unwrap_kiwi_future(future) | ||
result = unwrapped.result() | ||
result = unwrapped.result(timeout=timeout) | ||
except communications.TimeoutError: | ||
LOGGER.error(f'call to {infinitive} Process<{process.pk}> timed out') | ||
cancelled = unwrapped.cancel() | ||
if cancelled: | ||
LOGGER.error(f'call to {infinitive} Process<{process.pk}> timed out and was cancelled.') | ||
else: | ||
LOGGER.error(f'call to {infinitive} Process<{process.pk}> timed out but could not be cancelled.') | ||
except Exception as exception: | ||
LOGGER.error(f'failed to {infinitive} Process<{process.pk}>: {exception}') | ||
else: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -329,7 +329,7 @@ def load_instance_state( | |
|
||
self.node.logger.info(f'Loaded process<{self.node.pk}> from saved state') | ||
|
||
def kill(self, msg_text: str | None = None) -> Union[bool, plumpy.futures.Future]: | ||
def kill(self, msg_text: str | None = None, force_kill: bool = False) -> Union[bool, plumpy.futures.Future]: | ||
"""Kill the process and all the children calculations it called | ||
|
||
:param msg: message | ||
|
@@ -338,7 +338,7 @@ def kill(self, msg_text: str | None = None) -> Union[bool, plumpy.futures.Future | |
|
||
had_been_terminated = self.has_terminated() | ||
|
||
result = super().kill(msg_text) | ||
result = super().kill(msg_text, force_kill) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the other thing I think require a test, but from the change you made, it should be the plumpy's responsibility to make sure it kill properly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, you already did the change aiidateam/plumpy#320, I think there is the place require more test to make sure the behavior of kill works as expected. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have now several tests. Do you think something is still missing? |
||
|
||
# Only kill children if we could be killed ourselves | ||
if result is not False and not had_been_terminated: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -760,6 +760,7 @@ | |
f'Daemon <{started_daemon_client.profile.name}|{daemon_status}> log file content: \n' | ||
f'{daemon_log_file}' | ||
) | ||
time.sleep(0.1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to not keep cpu spinning |
||
|
||
return node | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice 👍