from aiida.common.log import LOG_LEVEL_REPORT
from aiida.engine import Process, ProcessState
from aiida.engine.processes import control as process_control
+from aiida.engine.utils import exponential_backoff_retry
from aiida.orm import CalcJobNode, Group, WorkChainNode, WorkflowNode, WorkFunctionNode
from tests.utils.processes import WaitProcess
@@ -53,6 +54,7 @@ def start_daemon_worker_in_foreground_and_redirect_streams(
    try:
        pid = os.getpid()
+        # For easier debugging you can redirect these to stdout instead
        sys.stdout = open(log_dir / f'worker-{pid}.out', 'w')
        sys.stderr = open(log_dir / f'worker-{pid}.err', 'w')
        start_daemon_worker(False, aiida_profile_name)
@@ -72,10 +74,22 @@ def mock_open(_):
        raise Exception('Mock open exception')

    @staticmethod
-    async def mock_exponential_backoff_retry(*_, **__):
+    async def exponential_backoff_retry_fail_upload(fct: t.Callable[..., t.Any], *args, **kwargs):
        from aiida.common.exceptions import TransportTaskException

-        raise TransportTaskException
+        if 'do_upload' in fct.__name__:
+            raise TransportTaskException
+        else:
+            return await exponential_backoff_retry(fct, *args, **kwargs)
+
+    @staticmethod
+    async def exponential_backoff_retry_fail_kill(fct: t.Callable[..., t.Any], *args, **kwargs):
+        from aiida.common.exceptions import TransportTaskException
+
+        if 'do_kill' in fct.__name__:
+            raise TransportTaskException
+        else:
+            return await exponential_backoff_retry(fct, *args, **kwargs)


@pytest.fixture(scope='function')
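Aside (not part of the diff): the two wrappers above differ only in the transport task they target. A minimal sketch of a shared factory they could delegate to is shown below; the helper name `ebm_failing_for` and its module-level placement are illustrative assumptions, not code from this PR.

import typing as t

from aiida.common.exceptions import TransportTaskException
from aiida.engine.utils import exponential_backoff_retry


def ebm_failing_for(task_substring: str):
    """Return an exponential_backoff_retry stand-in that fails only tasks whose name contains `task_substring`."""

    async def _ebm(fct: t.Callable[..., t.Any], *args, **kwargs):
        # Fail the selected transport task; delegate every other task to the real EBM.
        if task_substring in fct.__name__:
            raise TransportTaskException
        return await exponential_backoff_retry(fct, *args, **kwargs)

    return _ebm


# hypothetical usage, mirroring the monkeypatch targets used in the tests below:
# monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', ebm_failing_for('do_kill'))

Keeping the failure condition in one factory would avoid duplicating the wrappers if further task types need to be covered later.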
@@ -213,11 +227,12 @@ def make_a_builder(sleep_seconds=0):
@pytest.mark.requires_rmq
@pytest.mark.usefixtures('started_daemon_client')
-def test_process_kill_failng_ebm(
+def test_process_kill_failing_ebm_upload(
    fork_worker_context, submit_and_await, aiida_code_installed, run_cli_command, monkeypatch
):
-    """9) Kill a process that is paused after EBM (5 times failed). It should be possible to kill it normally.
-    # (e.g. in scenarios that transport is working again)
+    """Kill a process that is waiting after a failed EBM during upload. It should be possible to kill it normally.
+
+    A process that failed during upload (e.g. in scenarios where the transport is working again) and is then killed
+    should terminate normally.
    """
    from aiida.orm import Int
@@ -232,7 +247,10 @@ def make_a_builder(sleep_seconds=0):
    kill_timeout = 10

-    monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', MockFunctions.mock_exponential_backoff_retry)
+    monkeypatch_args = (
+        'aiida.engine.utils.exponential_backoff_retry',
+        MockFunctions.exponential_backoff_retry_fail_upload,
+    )
    with fork_worker_context(monkeypatch.setattr, monkeypatch_args):
        node = submit_and_await(make_a_builder(), ProcessState.WAITING)
        await_condition(
@@ -241,10 +259,60 @@ def make_a_builder(sleep_seconds=0):
            timeout=kill_timeout,
        )

+        # the kill should start the EBM and successfully kill the process
        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
        await_condition(lambda: node.is_killed, timeout=kill_timeout)


+@pytest.mark.requires_rmq
+@pytest.mark.usefixtures('started_daemon_client')
+def test_process_kill_failing_ebm_kill(
+    fork_worker_context, submit_and_await, aiida_code_installed, run_cli_command, monkeypatch
+):
+    """Kill a process with a failing EBM during the kill.
+
+    Killing a process tries to gracefully cancel the job on the remote node. If there are connection problems, the
+    cancellation is retried using the EBM. If this also fails, another kill command can be sent to restart the
+    cancellation of the job through the scheduler. The manual CLI sequence this test automates is sketched at the
+    end of this diff.
+    """
+    from aiida.orm import Int
+
+    code = aiida_code_installed(default_calc_job_plugin='core.arithmetic.add', filepath_executable='/bin/bash')
+
+    def make_a_builder(sleep_seconds=0):
+        builder = code.get_builder()
+        builder.x = Int(1)
+        builder.y = Int(1)
+        builder.metadata.options.sleep = sleep_seconds
+        return builder
+
+    kill_timeout = 10
+
+    monkeypatch_args = (
+        'aiida.engine.utils.exponential_backoff_retry',
+        MockFunctions.exponential_backoff_retry_fail_kill,
+    )
+    # from aiida.engine.utils import exponential_backoff_retry
+    # monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry', exponential_backoff_retry)
+    with fork_worker_context(monkeypatch.setattr, monkeypatch_args):
+        node = submit_and_await(make_a_builder(kill_timeout + 10), ProcessState.WAITING, timeout=kill_timeout)
+        await_condition(
+            lambda: node.process_status == 'Monitoring scheduler: job state RUNNING',
+            timeout=kill_timeout,
+        )
+
+        # the kill should start the EBM, which keeps failing, so the process is not killed
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
+        await_condition(lambda: not node.is_killed, timeout=kill_timeout)
+
+        # a second kill should restart the EBM and again not succeed
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '--wait'])
+        await_condition(lambda: not node.is_killed, timeout=kill_timeout)
+
+        # a force kill should skip the EBM and successfully kill the process
+        run_cli_command(cmd_process.process_kill, [str(node.pk), '-F', '--wait'])
+        await_condition(lambda: node.is_killed, timeout=kill_timeout)
+
+
class TestVerdiProcess:
    """Tests for `verdi process`."""