Skip to content

Commit ee3d2ab

Browse files
committed
attempt to handle ooms
1 parent c36a277 commit ee3d2ab

File tree

1 file changed

+46
-2
lines changed

1 file changed

+46
-2
lines changed

scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -790,13 +790,34 @@ task GenerateVepAndLofteeAnnotations {
790790
File loftee_phylo_csf_database
791791
File input_vcf
792792
File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"
793+
Float memory_mib = 8 * 1024
794+
# The memory headroom left for other processes including the Batch agent.
795+
Float overhead_memory_mib = 1.6 * 1024
793796
}
794797

795798
command <<<
796799
# Prepend date, time and pwd to xtrace log entries.
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace

# MEM_SIZE / MEM_UNIT are read from the environment and may be absent. Default them to empty for
# logging so that `nounset` does not abort the script before the fallback branch below can run.
echo "MEM_SIZE is ${MEM_SIZE:-}"
echo "MEM_UNIT is ${MEM_UNIT:-}"

# Compute the memory VEP may use, in KiB: total task memory minus the headroom reserved in
# overhead_memory_mib. All arithmetic is done in MiB and converted to KiB at the end.
if [[ -z "${MEM_UNIT:-}" ]]
then
    # No unit supplied by the environment: fall back to the task's memory_mib input.
    vep_memory_kib=$(python3 -c "from math import floor; print(floor((~{memory_mib} - ~{overhead_memory_mib}) * 1024))")
elif [[ "${MEM_UNIT}" == "GB" ]]
then
    # MEM_SIZE is expressed in GB while the overhead is in MiB, so convert MEM_SIZE to MiB before
    # subtracting; mixing the two units yields a negative limit.
    # NOTE(review): this treats 1 GB as 1024 MiB, matching the `8 * 1024` convention used for
    # memory_mib -- confirm against how the backend defines "GB".
    vep_memory_kib=$(python3 -c "from math import floor; print(floor((${MEM_SIZE} * 1024 - ~{overhead_memory_mib}) * 1024))")
else
    echo "Unexpected memory unit: ${MEM_UNIT}" 1>&2
    exit 1
fi

# A zero or negative limit means the inputs are inconsistent; fail early with a clear message
# rather than handing a nonsensical value to ulimit later.
if (( vep_memory_kib <= 0 ))
then
    echo "Computed VEP memory limit is not positive: ${vep_memory_kib} KiB" 1>&2
    exit 1
fi

echo "memory_mib is ~{memory_mib}"
echo "overhead_memory_mib is ~{overhead_memory_mib}"
echo "vep_memory_kib is ${vep_memory_kib}"

# Start the monitoring script in the background, capturing its output in monitoring.log.
bash ~{monitoring_script} > monitoring.log &
801822
802823
if { grep -E -v '^#' ~{input_vcf} 2>&1 > /dev/null; }
@@ -839,8 +860,31 @@ task GenerateVepAndLofteeAnnotations {
839860
--output_file vep_loftee_raw_output.txt
840861
)
841862
863+
# Limit the amount of memory the VEP process uses, expressed in KiB.
864+
# If we don't do this it seems that the Batch agent is often (though not always) starved for memory and
865+
# unable to check in with the Batch service. If this happens the job fails for reasons that appear to
866+
# Cromwell to be unretryable, and thus the whole workflow fails. e.g.
867+
#
868+
# Task GvsCreateVATfromVDS.GenerateVepAndLofteeAnnotations:150:4 failed. The job was stopped before the command finished. GCP Batch task exited with VMReportingTimeout(50002).
869+
#
870+
# Apply the resident-set-size limit (KiB) computed earlier in the script.
# NOTE(review): the Bash manual states that many systems do not honor `ulimit -m`; confirm the
# limit is actually enforced on the worker image.
ulimit -m "${vep_memory_kib}"

# Run VEP and record its exit status without letting errexit abort the script. The status must be
# captured in the same command list as the invocation: reading $? after a separate
# `set -o errexit` would observe the (always zero) status of `set`, not of VEP.
VEP_RC=0
vep "${args[@]}" || VEP_RC=$?

if (( VEP_RC == 137 ))
then
    # Cromwell does not currently consider the value in the rc file when determining retryability, though
    # there are PRs open that would enable this.
    # https://github.com/broadinstitute/cromwell/pull/7786/files
    echo "VEP + LOFTEE appears to have OOMed with exit code 137, writing messages to stderr to hopefully trigger Cromwell to retry with more memory."
    echo "Killed" >&2
    echo "java.lang.OutOfMemoryError" >&2
    exit 1
elif (( VEP_RC != 0 ))
then
    # Any other failure must still fail the task; do not report it as a completed run.
    echo "VEP + LOFTEE failed with exit code ${VEP_RC}." >&2
    exit "${VEP_RC}"
else
    echo "VEP + LOFTEE run complete."
fi
844888
else
845889
echo "No data found for processing in VCF, exit 0."
846890
touch "vep_loftee_raw_output.txt"
@@ -853,7 +897,7 @@ task GenerateVepAndLofteeAnnotations {
853897
maxRetries: 3
854898
docker: vep_loftee_docker
855899
memory: "16 GB"
856-
disks: "local-disk 1000 HDD"
900+
disks: "local-disk 500 HDD"
857901
}
858902

859903
output {

0 commit comments

Comments
 (0)