If a worker doesn't start successfully then retry (#99)

nickrobinson251 · web-flow · commit 74882838b63e · 2023-07-31T17:31:05.000+01:00
* WIP Require all workers start successfully before scheduling any test items

* Retry worker creation

* fixup! Retry worker creation

* fixup! fixup! Retry worker creation

* fixup! fixup! fixup! Retry worker creation

* Add tests

* typo

* tidy up

* Bump version

* Loosen test to allow termsignal=typemin(Int32)

* fixup! Loosen test to allow termsignal=typemin(Int32)

* Improve comments

* Use mktemp
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "ReTestItems"
 uuid = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
-version = "1.14.0"
+version = "1.15.0"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl
@@ -311,16 +311,29 @@ function _runtests_in_current_env(
                 end
             end
         elseif !isempty(testitems.testitems)
-            # spawn a task per worker to start and manage the lifetime of the worker
-            # get starting test items for each worker
+            # Use the logger that was set before we eval'd any user code to avoid world age
+            # issues when logging https://github.com/JuliaLang/julia/issues/33865
+            original_logger = current_logger()
+            # Wait for all workers to be started so we can throw as soon as possible if
+            # we were unable to start the requested number of workers
+            @info "Starting test workers"
+            workers = Vector{Worker}(undef, nworkers)
+            ntestitems = length(testitems.testitems)
+            @sync for i in 1:nworkers
+                @spawn begin
+                    with_logger(original_logger) do
+                        $workers[$i] = robust_start_worker($proj_name, $nworker_threads, $worker_init_expr, $ntestitems; worker_num=$i)
+                    end
+                end
+            end
+            # Now all workers are started, we can begin processing test items.
+            @info "Starting evaluating test items"
             starting = get_starting_testitems(testitems, nworkers)
-            @sync for i = 1:nworkers
+            @sync for (i, w) in enumerate(workers)
                 ti = starting[i]
                 @spawn begin
-                    # Wrapping with the logger that was set before we eval'd any user code to
-                    # avoid world age issues when logging https://github.com/JuliaLang/julia/issues/33865
-                    with_logger(current_logger()) do
-                        start_and_manage_worker($proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $testitem_timeout, $retries, $verbose_results, $debug, $report, $logs)
+                    with_logger(original_logger) do
+                        manage_worker($w, $proj_name, $testitems, $ti, $nworker_threads, $worker_init_expr, $testitem_timeout, $retries, $verbose_results, $debug, $report, $logs)
                     end
                 end
             end
@@ -337,21 +350,51 @@ function _runtests_in_current_env(
     return nothing
 end
 
-function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
+# Start a new `Worker` with `nworker_threads` threads and evaluate `worker_init_expr` on it.
+# The provided `worker_num` is only for logging purposes, and not persisted as part of the worker.
+function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems; worker_num=nothing)
     w = Worker(; threads="$nworker_threads")
+    i = worker_num === nothing ? "" : " $worker_num"
     # remote_fetch here because we want to make sure the worker is all setup before starting to eval testitems
     remote_fetch(w, quote
         using ReTestItems, Test
         Test.TESTSET_PRINT_ENABLE[] = false
         const GLOBAL_TEST_CONTEXT = ReTestItems.TestContext($proj_name, $ntestitems)
         GLOBAL_TEST_CONTEXT.setups_evaled = ReTestItems.TestSetupModules()
-        @info "Starting test item evaluations on pid = $(Libc.getpid()), with $(Threads.nthreads()) threads"
+        @info "Starting test worker$($i) on pid = $(Libc.getpid()), with $(Threads.nthreads()) threads"
         $(worker_init_expr.args...)
         nothing
     end)
     return w
 end
 
+# Want to be somewhat robust to workers possibly terminating during start up (e.g. due to
+# the `worker_init_expr`).
+# The number of retries and delay between retries is currently arbitrary...
+# we want to retry at least once, and we give a slight delay in case there are resources
+# that need to be cleaned up before a new worker would be able to start successfully.
+const _NRETRIES = 2
+const _RETRY_DELAY_SECONDS = 1
+
+# Start a worker via `start_worker`, forwarding all arguments. If the attempt throws a
+# `WorkerTerminatedException`, wait `_RETRY_DELAY_SECONDS` seconds and try again, for at
+# most `_NRETRIES` retries. Once the retries are used up, or on any other kind of
+# exception, the exception is rethrown to the caller.
+function robust_start_worker(args...; kwargs...)
+    start_with_retries = retry(start_worker; delays=fill(_RETRY_DELAY_SECONDS, _NRETRIES), check=_worker_terminated)
+    return start_with_retries(args...; kwargs...)
+end
+
+function _worker_terminated(state, exception)
+    # Anything other than a terminated worker is not retried, so `retry` rethrows it.
+    exception isa WorkerTerminatedException || return false
+    # `retry` passes the current iteration state of `delays`; for the `Vector` built in
+    # `robust_start_worker` that is the next index, so `state - 1` counts retries from 1.
+    attempt = state - 1
+    @error "$(exception.worker) terminated unexpectedly. Starting new worker (retry $attempt/$_NRETRIES)."
+    return true
+end
+
 any_non_pass(ts::DefaultTestSet) = ts.anynonpass
 
 function record_timeout!(testitem, run_number::Int, timeout_limit::Real)
@@ -396,12 +439,11 @@ function record_test_error!(testitem, msg, elapsed_seconds::Real=0.0)
     return testitem
 end
 
-function start_and_manage_worker(
-    proj_name, testitems, testitem, nworker_threads, worker_init_expr,
+function manage_worker(
+    worker::Worker, proj_name, testitems, testitem, nworker_threads, worker_init_expr,
     timeout::Real, retries::Int, verbose_results::Bool, debug::Int, report::Bool, logs::Symbol
 )
     ntestitems = length(testitems.testitems)
-    worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
     run_number = 1
     while testitem !== nothing
         ch = Channel{TestItemResult}(1)
@@ -479,7 +521,7 @@ function start_and_manage_worker(
             end
             # The worker was terminated, so replace it unless there are no more testitems to run
             if testitem !== nothing
-                worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
+                worker = robust_start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
             end
             # Now loop back around to reschedule the testitem
             continue
diff --git a/test/integrationtests.jl b/test/integrationtests.jl
@@ -609,6 +609,8 @@ end
 @testset "test retrying failing testitem" begin
     file = joinpath(TEST_FILES_DIR, "_retry_tests.jl")
     # This directory must match what's set in `_retry_tests`
+    # Use `/tmp` directly instead of `mktemp` to remove the chance that files are cleaned up
+    # as soon as the worker process crashes.
     tmpdir = joinpath("/tmp", "JL_RETESTITEMS_TEST_TMPDIR")
     # must run with `testitem_timeout < 20` for test to timeout as expected.
     # and must run with `nworkers > 0` for retries to be supported.
@@ -769,4 +771,60 @@ end
     @test ts.time_end - ts.time_start ≈ timeout
 end
 
+@testset "worker always crashes immediately" begin
+    file = joinpath(TEST_FILES_DIR, "_happy_tests.jl")
+
+    # We have occasionally seen the Process exit with `typemin(Int32)` instead of the expected signal (6).
+    @assert typemin(Int32) == -2147483648
+    terminated_err_log_1 = r"Error: Worker\(pid=\d+, terminated=true, termsignal=(6|-2147483648)\) terminated unexpectedly. Starting new worker \(retry 1/2\)."
+    terminated_err_log_2 = r"Error: Worker\(pid=\d+, terminated=true, termsignal=(6|-2147483648)\) terminated unexpectedly. Starting new worker \(retry 2/2\)."
+
+    worker_init_expr = :(@eval ccall(:abort, Cvoid, ()))
+    # We don't use IOCapture for capturing logs as that seems to hang when the worker crashes.
+    mktemp() do path, _  # `mktemp(f)` calls `f(path, io)`: the path comes first
+        results = redirect_stdio(stdout=path, stderr=path, stdin=devnull) do
+            encased_testset() do
+                runtests(file; nworkers=2, worker_init_expr)
+            end
+        end
+        captured = read(path, String)
+        # Test we retried starting a worker twice and saw the expected log each time.
+        @test contains(captured, terminated_err_log_1)
+        @test contains(captured, terminated_err_log_2)
+        # Test that `runtests` errored overall, before any test items were run.
+        @test n_tests(results) == 1
+        @test length(errors(results)) == 1
+    end
+end
+
+@testset "worker crashes immediately but succeeds on retry" begin
+    file = joinpath(TEST_FILES_DIR, "_happy_tests.jl")
+    mktemp() do crash_path, _  # `mktemp(f)` calls `f(path, io)`: the path comes first
+        # At least one worker should crash, but all workers should succeed upon a retry.
+        worker_init_expr = quote
+            if isempty(read($crash_path))
+                write($crash_path, "1")
+                @eval ccall(:abort, Cvoid, ())
+            end
+        end
+        # We have occasionally seen the Process exit with `typemin(Int32)` instead of the expected signal (6).
+        @assert typemin(Int32) == -2147483648
+        terminated_err_log_1 = r"Error: Worker\(pid=\d+, terminated=true, termsignal=(6|-2147483648)\) terminated unexpectedly. Starting new worker \(retry 1/2\)."
+        # We don't use IOCapture for capturing logs as that seems to hang when the worker crashes.
+        mktemp() do log_path, _
+            results = redirect_stdio(stdout=log_path, stderr=log_path, stdin=devnull) do
+                encased_testset() do
+                    runtests(file; nworkers=2, worker_init_expr)
+                end
+            end
+            captured = read(log_path, String)
+            # Test we saw a worker crash and retried starting the worker.
+            @test contains(captured, terminated_err_log_1)
+            # Test we were then able to run all tests successfully.
+            @test n_tests(results) == 3
+            @test all_passed(results) == 1
+        end
+    end
+end
+
 end # integrationtests.jl testset
diff --git a/test/testfiles/_happy_tests.jl b/test/testfiles/_happy_tests.jl
@@ -0,0 +1,11 @@
+@testitem "happy 1" begin
+    @test 1 == 1
+end
+
+@testitem "happy 2" begin
+    @test 2 == 2
+end
+
+@testitem "happy 3" begin
+    @test 3 == 3
+end