25
25
from aiida .common .log import LOG_LEVEL_REPORT
26
26
from aiida .engine import Process , ProcessState
27
27
from aiida .engine .processes import control as process_control
28
- from aiida .orm import CalcJobNode , Group , WorkChainNode , WorkflowNode , WorkFunctionNode
28
+ from aiida .engine .utils import exponential_backoff_retry
29
+ from aiida .orm import CalcJobNode , Group , Int , WorkChainNode , WorkflowNode , WorkFunctionNode
29
30
from tests .utils .processes import WaitProcess
30
31
31
32
FuncArgs = tuple [t .Any , ...]
@@ -53,6 +54,7 @@ def start_daemon_worker_in_foreground_and_redirect_streams(
53
54
54
55
try :
55
56
pid = os .getpid ()
57
+ # For easier debugging you can change these to stdout
56
58
sys .stdout = open (log_dir / f'worker-{ pid } .out' , 'w' )
57
59
sys .stderr = open (log_dir / f'worker-{ pid } .err' , 'w' )
58
60
start_daemon_worker (False , aiida_profile_name )
@@ -72,10 +74,22 @@ def mock_open(_):
72
74
raise Exception ('Mock open exception' )
73
75
74
76
@staticmethod
75
- async def mock_exponential_backoff_retry ( * _ , ** __ ):
77
+ async def exponential_backoff_retry_fail_upload ( fct : t . Callable [..., t . Any ], * args , ** kwargs ):
76
78
from aiida .common .exceptions import TransportTaskException
77
79
78
- raise TransportTaskException
80
+ if 'do_upload' in fct .__name__ :
81
+ raise TransportTaskException
82
+ else :
83
+ return await exponential_backoff_retry (fct , * args , ** kwargs )
84
+
85
+ @staticmethod
86
+ async def exponential_backoff_retry_fail_kill (fct : t .Callable [..., t .Any ], * args , ** kwargs ):
87
+ from aiida .common .exceptions import TransportTaskException
88
+
89
+ if 'do_kill' in fct .__name__ :
90
+ raise TransportTaskException
91
+ else :
92
+ return await exponential_backoff_retry (fct , * args , ** kwargs )
79
93
80
94
81
95
@pytest .fixture (scope = 'function' )
@@ -138,7 +152,6 @@ def test_process_kill_failing_transport(
138
152
A failure in opening a transport connection results in the EBM being fired, blocking a regular kill command.
139
153
The force kill command will ignore the EBM and kill the process in any case."""
140
154
from aiida .cmdline .utils .common import get_process_function_report
141
- from aiida .orm import Int
142
155
143
156
code = aiida_code_installed (default_calc_job_plugin = 'core.arithmetic.add' , filepath_executable = '/bin/bash' )
144
157
@@ -179,7 +192,6 @@ def test_process_kill_failing_transport_failed_kill(
179
192
"""
180
193
181
194
from aiida .cmdline .utils .common import get_process_function_report
182
- from aiida .orm import Int
183
195
184
196
code = aiida_code_installed (default_calc_job_plugin = 'core.arithmetic.add' , filepath_executable = '/bin/bash' )
185
197
@@ -213,14 +225,14 @@ def make_a_builder(sleep_seconds=0):
213
225
214
226
@pytest .mark .requires_rmq
215
227
@pytest .mark .usefixtures ('started_daemon_client' )
216
- def test_process_kill_failng_ebm (
228
+ def test_process_kill_failing_ebm_transport (
217
229
fork_worker_context , submit_and_await , aiida_code_installed , run_cli_command , monkeypatch
218
230
):
219
- """9) Kill a process that is paused after EBM (5 times failed). It should be possible to kill it normally.
220
- # (e.g. in scenarios that transport is working again)
221
- """
222
- from aiida .orm import Int
231
+ """Kill a process that is waiting after failed EBM during a transport task.
223
232
233
+ It should be possible to kill it normally. This covers a process whose upload failed and that is then killed (e.g. in
234
+ scenarios where the transport is working again).
235
+ """
224
236
code = aiida_code_installed (default_calc_job_plugin = 'core.arithmetic.add' , filepath_executable = '/bin/bash' )
225
237
226
238
def make_a_builder (sleep_seconds = 0 ):
@@ -232,7 +244,10 @@ def make_a_builder(sleep_seconds=0):
232
244
233
245
kill_timeout = 10
234
246
235
- monkeypatch_args = ('aiida.engine.utils.exponential_backoff_retry' , MockFunctions .mock_exponential_backoff_retry )
247
+ monkeypatch_args = (
248
+ 'aiida.engine.utils.exponential_backoff_retry' ,
249
+ MockFunctions .exponential_backoff_retry_fail_upload ,
250
+ )
236
251
with fork_worker_context (monkeypatch .setattr , monkeypatch_args ):
237
252
node = submit_and_await (make_a_builder (), ProcessState .WAITING )
238
253
await_condition (
@@ -241,7 +256,60 @@ def make_a_builder(sleep_seconds=0):
241
256
timeout = kill_timeout ,
242
257
)
243
258
259
+ # kill should start EBM and should successfully kill
260
+ run_cli_command (cmd_process .process_kill , [str (node .pk ), '--wait' ])
261
+ await_condition (lambda : node .is_killed , timeout = kill_timeout )
262
+
263
+
264
+ @pytest .mark .requires_rmq
265
+ @pytest .mark .usefixtures ('started_daemon_client' )
266
+ def test_process_kill_failing_ebm_kill (
267
+ fork_worker_context , submit_and_await , aiida_code_installed , run_cli_command , monkeypatch
268
+ ):
269
+ """Kill a process that had previously failed with an EBM.
270
+
271
+ Killing a process tries to gracefully cancel the job on the remote node. If there are connection problems it retries
272
+ it using the EBM. If this fails, another kill command can be sent to restart the cancelation of the scheduler job.
273
+ """
274
+ from aiida .cmdline .utils .common import get_process_function_report
275
+
276
+ code = aiida_code_installed (default_calc_job_plugin = 'core.arithmetic.add' , filepath_executable = '/bin/bash' )
277
+
278
+ def make_a_builder (sleep_seconds = 0 ):
279
+ builder = code .get_builder ()
280
+ builder .x = Int (1 )
281
+ builder .y = Int (1 )
282
+ builder .metadata .options .sleep = sleep_seconds
283
+ return builder
284
+
285
+ kill_timeout = 10
286
+
287
+ monkeypatch_args = (
288
+ 'aiida.engine.utils.exponential_backoff_retry' ,
289
+ MockFunctions .exponential_backoff_retry_fail_kill ,
290
+ )
291
+ with fork_worker_context (monkeypatch .setattr , monkeypatch_args ):
292
+ node = submit_and_await (make_a_builder (kill_timeout + 10 ), ProcessState .WAITING , timeout = kill_timeout )
293
+ await_condition (
294
+ lambda : node .process_status == 'Monitoring scheduler: job state RUNNING' ,
295
+ timeout = kill_timeout ,
296
+ )
297
+
298
+ # kill should start the EBM and not succeed within the EBM
244
299
run_cli_command (cmd_process .process_kill , [str (node .pk ), '--wait' ])
300
+ await_condition (lambda : not node .is_killed , timeout = kill_timeout )
301
+
302
+ # kill should restart the EBM and not succeed within the EBM
303
+ # this tests if the old task is cancelled and restarted successfully
304
+ run_cli_command (cmd_process .process_kill , [str (node .pk ), '--wait' ])
305
+ await_condition (
306
+ lambda : 'Found active scheduler job cancelation that will be rescheduled.'
307
+ in get_process_function_report (node ),
308
+ timeout = kill_timeout ,
309
+ )
310
+
311
+ # force kill should skip EBM and successfully kill the process
312
+ run_cli_command (cmd_process .process_kill , [str (node .pk ), '-F' , '--wait' ])
245
313
await_condition (lambda : node .is_killed , timeout = kill_timeout )
246
314
247
315
@@ -758,8 +826,6 @@ def test_process_kill(submit_and_await, run_cli_command, aiida_code_installed):
758
826
assert result .exit_code == ExitCode .USAGE_ERROR
759
827
assert len (result .output_lines ) > 0
760
828
761
- from aiida .orm import Int
762
-
763
829
code = aiida_code_installed (default_calc_job_plugin = 'core.arithmetic.add' , filepath_executable = '/bin/bash' )
764
830
builder = code .get_builder ()
765
831
builder .x = Int (2 )
0 commit comments