Skip to content

Commit 7488283

Browse files
If a worker doesn't start successfully then retry (#99)
* WIP Require all workers start successfully before scheduling any test items * Retry worker creation * fixup! Retry worker creation * fixup! fixup! Retry worker creation * fixup! fixup! fixup! Retry worker creation * Add tests * typo * tidy up * Bump version * Loosen test to allow termsignal=typemin(Int32) * fixup! Loosen test to allow termsignal=typemin(Int32) * Improve comments * Use mktemp
1 parent f42369c commit 7488283

File tree

4 files changed

+125
-14
lines changed

4 files changed

+125
-14
lines changed

Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "ReTestItems"
22
uuid = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
3-
version = "1.14.0"
3+
version = "1.15.0"
44

55
[deps]
66
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

src/ReTestItems.jl

+55-13
Original file line numberDiff line numberDiff line change
@@ -311,16 +311,29 @@ function _runtests_in_current_env(
311311
end
312312
end
313313
elseif !isempty(testitems.testitems)
314-
# spawn a task per worker to start and manage the lifetime of the worker
315-
# get starting test items for each worker
314+
# Use the logger that was set before we eval'd any user code to avoid world age
315+
# issues when logging https://github.com/JuliaLang/julia/issues/33865
316+
original_logger = current_logger()
317+
# Wait for all workers to be started so we can throw as soon as possible if
318+
# we were unable to start the requested number of workers
319+
@info "Starting test workers"
320+
workers = Vector{Worker}(undef, nworkers)
321+
ntestitems = length(testitems.testitems)
322+
@sync for i in 1:nworkers
323+
@spawn begin
324+
with_logger(original_logger) do
325+
$workers[$i] = robust_start_worker($proj_name, $nworker_threads, $worker_init_expr, $ntestitems; worker_num=$i)
326+
end
327+
end
328+
end
329+
# Now all workers are started, we can begin processing test items.
330+
@info "Starting evaluating test items"
316331
starting = get_starting_testitems(testitems, nworkers)
317-
@sync for i = 1:nworkers
332+
@sync for (i, w) in enumerate(workers)
318333
ti = starting[i]
319334
@spawn begin
320-
# Wrapping with the logger that was set before we eval'd any user code to
321-
# avoid world age issues when logging https://github.com/JuliaLang/julia/issues/33865
322-
with_logger(current_logger()) do
323-
start_and_manage_worker($proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $testitem_timeout, $retries, $verbose_results, $debug, $report, $logs)
335+
with_logger(original_logger) do
336+
manage_worker($w, $proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $testitem_timeout, $retries, $verbose_results, $debug, $report, $logs)
324337
end
325338
end
326339
end
@@ -337,21 +350,51 @@ function _runtests_in_current_env(
337350
return nothing
338351
end
339352

340-
function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
353+
# Start a new `Worker` with `nworker_threads` threads and evaluate `worker_init_expr` on it.
354+
# The provided `worker_num` is only for logging purposes, and not persisted as part of the worker.
355+
function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; worker_num=nothing)
341356
w = Worker(; threads="$nworker_threads")
357+
i = worker_num == nothing ? "" : " $worker_num"
342358
# remote_fetch here because we want to make sure the worker is all setup before starting to eval testitems
343359
remote_fetch(w, quote
344360
using ReTestItems, Test
345361
Test.TESTSET_PRINT_ENABLE[] = false
346362
const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems)
347363
GLOBAL_TEST_CONTEXT.setups_evaled = ReTestItems.TestSetupModules()
348-
@info "Starting test item evaluations on pid = $(Libc.getpid()), with $(Threads.nthreads()) threads"
364+
@info "Starting test worker$($i) on pid = $(Libc.getpid()), with $(Threads.nthreads()) threads"
349365
$(worker_init_expr.args...)
350366
nothing
351367
end)
352368
return w
353369
end
354370

371+
# Want to be somewhat robust to workers possibly terminating during start up (e.g. due to
372+
# the `worker_init_expr`).
373+
# The number of retries and delay between retries is currently arbitrary...
374+
# we want to retry at least once, and we give a slight delay in case there are resources
375+
# that need to be cleaned up before a new worker would be able to start successfully.
376+
const _NRETRIES = 2
377+
const _RETRY_DELAY_SECONDS = 1
378+
379+
# Start a worker, retrying up to `_NRETRIES` times if it terminates unexpectedly,
380+
# with a delay of `_RETRY_DELAY_SECONDS` seconds between retries.
381+
# If we fail to start a worker successfully after `_NRETRIES` retries, or if we somehow hit
382+
# something other than a `WorkerTerminatedException`, then rethrow the exception.
383+
function robust_start_worker(args...; kwargs...)
384+
f = retry(start_worker; delays=fill(_RETRY_DELAY_SECONDS, _NRETRIES), check=_worker_terminated)
385+
f(args...; kwargs...)
386+
end
387+
388+
function _worker_terminated(state, exception)
389+
if exception isa WorkerTerminatedException
390+
retry_num = state - 1
391+
@error "$(exception.worker) terminated unexpectedly. Starting new worker (retry $retry_num/$_NRETRIES)."
392+
return true
393+
else
394+
return false
395+
end
396+
end
397+
355398
any_non_pass(ts::DefaultTestSet) = ts.anynonpass
356399

357400
function record_timeout!(testitem, run_number::Int, timeout_limit::Real)
@@ -396,12 +439,11 @@ function record_test_error!(testitem, msg, elapsed_seconds::Real=0.0)
396439
return testitem
397440
end
398441

399-
function start_and_manage_worker(
400-
proj_name, testitems, testitem, nworker_threads, worker_init_expr,
442+
function manage_worker(
443+
worker::Worker, proj_name, testitems, testitem, nworker_threads, worker_init_expr,
401444
timeout::Real, retries::Int, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol
402445
)
403446
ntestitems = length(testitems.testitems)
404-
worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
405447
run_number = 1
406448
while testitem !== nothing
407449
ch = Channel{TestItemResult}(1)
@@ -479,7 +521,7 @@ function start_and_manage_worker(
479521
end
480522
# The worker was terminated, so replace it unless there are no more testitems to run
481523
if testitem !== nothing
482-
worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
524+
worker = robust_start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
483525
end
484526
# Now loop back around to reschedule the testitem
485527
continue

test/integrationtests.jl

+58
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,8 @@ end
609609
@testset "test retrying failing testitem" begin
610610
file = joinpath(TEST_FILES_DIR, "_retry_tests.jl")
611611
# This directory must match what's set in `_retry_tests`
612+
# Use `/tmp` directly instead of `mktemp` to remove chance that files are cleaned up
613+
# as soon as the worker process crashes.
612614
tmpdir = joinpath("/tmp", "JL_RETESTITEMS_TEST_TMPDIR")
613615
# must run with `testitem_timeout < 20` for test to timeout as expected.
614616
# and must run with `nworkers > 0` for retries to be supported.
@@ -769,4 +771,60 @@ end
769771
@test ts.time_end - ts.time_start timeout
770772
end
771773

774+
@testset "worker always crashes immediately" begin
775+
file = joinpath(TEST_FILES_DIR, "_happy_tests.jl")
776+
777+
# We have occassionally seen the Process exist with the expected signal.
778+
@assert typemin(Int32) == -2147483648
779+
terminated_err_log_1 = r"Error: Worker\(pid=\d+, terminated=true, termsignal=(6|-2147483648)\) terminated unexpectedly. Starting new worker \(retry 1/2\)."
780+
terminated_err_log_2 = r"Error: Worker\(pid=\d+, terminated=true, termsignal=(6|-2147483648)\) terminated unexpectedly. Starting new worker \(retry 2/2\)."
781+
782+
worker_init_expr = :(@eval ccall(:abort, Cvoid, ()))
783+
# We don't use IOCapture for capturing logs as that seems to hang when the worker crashes.
784+
mktemp() do io, path
785+
results = redirect_stdio(stdout=io, stderr=io, stdin=devnull) do
786+
encased_testset() do
787+
runtests(file; nworkers=2, worker_init_expr)
788+
end
789+
end
790+
captured = read(path, String)
791+
# Test we retried starting a worker twice and saw the expected log each time.
792+
@test contains(captured, terminated_err_log_1)
793+
@test contains(captured, terminated_err_log_2)
794+
# Test that `runtests` errored overall, before any test items were run.
795+
@test n_tests(results) == 1
796+
@test length(errors(results)) == 1
797+
end
798+
end
799+
800+
@testset "worker crashes immediately but succeeds on retry" begin
801+
file = joinpath(TEST_FILES_DIR, "_happy_tests.jl")
802+
mktemp() do crash_io, _
803+
# At least one worker should crash, but all workers should succeed upon a retry.
804+
worker_init_expr = quote
805+
if isempty(read($crash_io))
806+
write($crash_io, "1")
807+
@eval ccall(:abort, Cvoid, ())
808+
end
809+
end
810+
# We have occassionally seen the Process exist with the expected signal.
811+
@assert typemin(Int32) == -2147483648
812+
terminated_err_log_1 = r"Error: Worker\(pid=\d+, terminated=true, termsignal=(6|-2147483648)\) terminated unexpectedly. Starting new worker \(retry 1/2\)."
813+
# We don't use IOCapture for capturing logs as that seems to hang when the worker crashes.
814+
mktemp() do log_io, _
815+
results = redirect_stdio(stdout=log_io, stderr=log_io, stdin=devnull) do
816+
encased_testset() do
817+
runtests(file; nworkers=2, worker_init_expr)
818+
end
819+
end
820+
captured = read(log_io, String)
821+
# Test we saw a worker crash and retried starting the worker.
822+
@test contains(captured, terminated_err_log_1)
823+
# Test we were then able to run all tests successfully.
824+
@test n_tests(results) == 3
825+
@test all_passed(results) == 1
826+
end
827+
end
828+
end
829+
772830
end # integrationtests.jl testset

test/testfiles/_happy_tests.jl

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
@testitem "happy 1" begin
2+
@test 1 == 1
3+
end
4+
5+
@testitem "happy 2" begin
6+
@test 2 == 2
7+
end
8+
9+
@testitem "happy 3" begin
10+
@test 3 == 3
11+
end

0 commit comments

Comments
 (0)