Skip to content

Commit 536e9f1

Browse files
d3flexperlpunk
andcommitted
Add linear backoff in hook script
Use the _delay_ which comes from FinalizeResults to override the default Minion delay behavior and force a linear backoff calculation based on the retries in case of job failure and until it reaches the max retries. This should offer a less agressive pattern, giving the Minion tasks more time to recover from various issues and prevent overloading. Linear backoff is selected over exponential as the later is already applied to the Minion tasks in general. https://progress.opensuse.org/issues/181427 Co-authored-by: perlpunk <[email protected]>
1 parent 72510c7 commit 536e9f1

File tree

2 files changed

+37
-3
lines changed

2 files changed

+37
-3
lines changed

lib/OpenQA/Task/Job/HookScript.pm

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,17 @@ sub _hook_script ($job, $hook, $openqa_job_id, $options) {
1313
my $ensure_task_retry_on_termination_signal_guard = OpenQA::Task::SignalGuard->new($job);
1414
my $timeout = $options->{timeout};
1515
my $kill_timeout = $options->{kill_timeout};
16-
my $delay = $options->{delay};
1716
my $retries = $options->{retries};
17+
my $delay = $options->{delay} * ($job->retries + 1);
1818
my $skip_rc = $options->{skip_rc};
1919

2020
$ensure_task_retry_on_termination_signal_guard->abort(1);
2121
my ($rc, $out) = _run_hook($hook, $openqa_job_id, $timeout, $kill_timeout);
2222
$job->note(hook_cmd => $hook, hook_result => $out, hook_rc => $rc);
2323

24-
if ($retries && $skip_rc) {
25-
$job->retry($delay ? {delay => $delay} : {}) if defined $rc && $rc == $skip_rc && $job->retries < $retries;
24+
if ($retries && $skip_rc && defined $rc && $rc == $skip_rc && $job->retries < $retries) {
25+
$job->app->log->debug("Job $job->id: Retrying $job->retries with linear delay: ${delay}s");
26+
$job->retry($delay ? {delay => $delay} : {});
2627
}
2728
}
2829

t/10-jobs.t

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,39 @@ subtest 'carry over, including soft-fails' => sub {
696696
is $notes->{hook_rc}, 142, 'exit code of the hook cmd is as expected';
697697
is $job_info->{retries}, 1, 'hook script has been retried once because of delay';
698698
};
699+
700+
subtest 'There is a linear delay for each hook script which exits with 142' => sub {
701+
$job->settings->create({key => '_TRIGGER_JOB_DONE_RETRIES', value => '4'});
702+
$hooks->{job_done_hook_failed} = 'echo delayed && exit 142;';
703+
$job->update({state => UPLOADING});
704+
$job->discard_changes;
705+
$job->done;
706+
perform_minion_jobs($t->app->minion);
707+
$job_info = $t->app->minion->jobs({tasks => ['hook_script']})->next;
708+
is_deeply(
709+
$job_info->{args}[2],
710+
{delay => 60, retries => 4, skip_rc => 142, kill_timeout => '30s', timeout => '5m'},
711+
'args contain expected parameters'
712+
);
713+
is $job_info->{retries}, 1, 'hook script has been already tried once since has been queued';
714+
is $job_info->{state}, 'inactive', 'job ends up inactive due to retry';
715+
note "we expect 4 retries * the default one minute delay";
716+
my $linear_delay = $job_info->{delayed} - $job_info->{retried};
717+
is $linear_delay, 60, 'default delay for first retry applied correctly';
718+
# force some retries with delay 0 to avoid having to wait in the test
719+
$t->app->minion->job($job_info->{id})->retry({delay => 0}) for 1, 2;
720+
perform_minion_jobs($t->app->minion);
721+
$job_info = $t->app->minion->jobs({tasks => ['hook_script']})->next;
722+
is $job_info->{retries}, 4, 'hook script has been retried the expected number of times';
723+
$linear_delay = $job_info->{delayed} - $job_info->{retried};
724+
is $linear_delay, 240, 'linear delay calculated correctly for 4th retry';
725+
note "Retry one last time to validate the final state";
726+
$t->app->minion->job($job_info->{id})->retry({delay => 0});
727+
perform_minion_jobs($t->app->minion);
728+
$job_info = $t->app->minion->jobs({tasks => ['hook_script']})->next;
729+
is $job_info->{retries}, 5, 'hook script retried for last time, exceeding the retry limit';
730+
is $job_info->{state}, 'finished', 'hook script at the end of the retries has finished state';
731+
};
699732
};
700733
};
701734

0 commit comments

Comments
 (0)