-
Notifications
You must be signed in to change notification settings - Fork 230
WIP: Implement force-kill option in verdi process kill #6575
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
1e9fcdf
3cb8539
75bb430
cad6738
dbd0a06
144680d
bb31b69
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -101,9 +101,9 @@ async def do_upload(): | |
|
||
try: | ||
logger.info(f'scheduled request to upload CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, PreSubmitException, plumpy.process_states.Interruption) | ||
breaking_exceptions = (plumpy.futures.CancelledError, PreSubmitException, plumpy.process_states.Interruption) | ||
skip_submit = await exponential_backoff_retry( | ||
do_upload, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
do_upload, initial_interval, max_attempts, logger=node.logger, breaking_exceptions=breaking_exceptions | ||
) | ||
except PreSubmitException: | ||
raise | ||
|
@@ -149,9 +149,9 @@ async def do_submit(): | |
|
||
try: | ||
logger.info(f'scheduled request to submit CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
breaking_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
result = await exponential_backoff_retry( | ||
do_submit, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
do_submit, initial_interval, max_attempts, logger=node.logger, breaking_exceptions=breaking_exceptions | ||
) | ||
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption): | ||
raise | ||
|
@@ -207,9 +207,9 @@ async def do_update(): | |
|
||
try: | ||
logger.info(f'scheduled request to update CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
breaking_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
job_done = await exponential_backoff_retry( | ||
do_update, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
do_update, initial_interval, max_attempts, logger=node.logger, breaking_exceptions=breaking_exceptions | ||
) | ||
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption): | ||
raise | ||
|
@@ -258,9 +258,9 @@ async def do_monitor(): | |
|
||
try: | ||
logger.info(f'scheduled request to monitor CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
breaking_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
monitor_result = await exponential_backoff_retry( | ||
do_monitor, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
do_monitor, initial_interval, max_attempts, logger=node.logger, breaking_exceptions=breaking_exceptions | ||
) | ||
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption): | ||
raise | ||
|
@@ -334,9 +334,9 @@ async def do_retrieve(): | |
|
||
try: | ||
logger.info(f'scheduled request to retrieve CalcJob<{node.pk}>') | ||
ignore_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
breaking_exceptions = (plumpy.futures.CancelledError, plumpy.process_states.Interruption) | ||
result = await exponential_backoff_retry( | ||
do_retrieve, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions | ||
do_retrieve, initial_interval, max_attempts, logger=node.logger, breaking_exceptions=breaking_exceptions | ||
) | ||
except (plumpy.futures.CancelledError, plumpy.process_states.Interruption): | ||
raise | ||
|
@@ -385,7 +385,7 @@ async def do_stash(): | |
initial_interval, | ||
max_attempts, | ||
logger=node.logger, | ||
ignore_exceptions=plumpy.process_states.Interruption, | ||
breaking_exceptions=plumpy.process_states.Interruption, | ||
) | ||
except plumpy.process_states.Interruption: | ||
raise | ||
|
@@ -398,7 +398,9 @@ async def do_stash(): | |
return | ||
|
||
|
||
async def task_kill_job(node: CalcJobNode, transport_queue: TransportQueue, cancellable: InterruptableFuture): | ||
async def task_kill_job( | ||
node: CalcJobNode, transport_queue: TransportQueue, cancellable: InterruptableFuture, force_kill: bool = False | ||
): | ||
"""Transport task that will attempt to kill a job calculation. | ||
|
||
The task will first request a transport from the queue. Once the transport is yielded, the relevant execmanager | ||
|
@@ -426,13 +428,19 @@ async def do_kill(): | |
transport = await cancellable.with_interrupt(request) | ||
return execmanager.kill_calculation(node, transport) | ||
|
||
if force_kill: | ||
logger.warning(f'Process<{node.pk}> has been force killed! this may result in orphaned jobs.') | ||
raise plumpy.process_states.ForceKillInterruption('Force killing CalcJob') | ||
try: | ||
logger.info(f'scheduled request to kill CalcJob<{node.pk}>') | ||
result = await exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger) | ||
# Note: any exception raised here will result in the process being excepted, not killed! | ||
# Therefore, it can result in orphaned jobs! | ||
except plumpy.process_states.Interruption: | ||
logger.warning(f'killing CalcJob<{node.pk}> excepted, the job might be orphaned.') | ||
raise | ||
except Exception as exception: | ||
logger.warning(f'killing CalcJob<{node.pk}> failed') | ||
logger.warning(f'killing CalcJob<{node.pk}> excepted, the job might be orphaned.') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't think this warning is correct. When a There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I'm not wrong, what you are referring to is already in my test scenario There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't understand. The exception being caught here is thrown by There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see. |
||
raise TransportTaskException(f'kill_calculation failed {max_attempts} times consecutively') from exception | ||
else: | ||
logger.info(f'killing CalcJob<{node.pk}> successful') | ||
|
@@ -528,7 +536,7 @@ async def execute(self) -> plumpy.process_states.State: # type: ignore[override | |
monitor_result = await self._monitor_job(node, transport_queue, self.monitors) | ||
|
||
if monitor_result and monitor_result.action is CalcJobMonitorAction.KILL: | ||
await self._kill_job(node, transport_queue) | ||
await self._kill_job(node, transport_queue, force_kill=False) | ||
job_done = True | ||
|
||
if monitor_result and not monitor_result.retrieve: | ||
|
@@ -567,7 +575,11 @@ async def execute(self) -> plumpy.process_states.State: # type: ignore[override | |
except TransportTaskException as exception: | ||
raise plumpy.process_states.PauseInterruption(f'Pausing after failed transport task: {exception}') | ||
except plumpy.process_states.KillInterruption as exception: | ||
await self._kill_job(node, transport_queue) | ||
await self._kill_job(node, transport_queue, force_kill=False) | ||
node.set_process_status(str(exception)) | ||
return self.retrieve(monitor_result=self._monitor_result) | ||
except plumpy.process_states.ForceKillInterruption as exception: | ||
await self._kill_job(node, transport_queue, force_kill=True) | ||
khsrali marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
node.set_process_status(str(exception)) | ||
return self.retrieve(monitor_result=self._monitor_result) | ||
except (plumpy.futures.CancelledError, asyncio.CancelledError): | ||
|
@@ -607,9 +619,9 @@ async def _monitor_job(self, node, transport_queue, monitors) -> CalcJobMonitorR | |
|
||
return monitor_result | ||
|
||
async def _kill_job(self, node, transport_queue) -> None: | ||
async def _kill_job(self, node, transport_queue, force_kill) -> None: | ||
"""Kill the job.""" | ||
await self._launch_task(task_kill_job, node, transport_queue) | ||
await self._launch_task(task_kill_job, node, transport_queue, force_kill=force_kill) | ||
if self._killing is not None: | ||
self._killing.set_result(True) | ||
else: | ||
|
Uh oh!
There was an error while loading. Please reload this page.