
Commit 67d0dda

data loading rework
1 parent c42d390 commit 67d0dda

File tree

1 file changed: +38 -16 lines changed


scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl

Lines changed: 38 additions & 16 deletions
@@ -873,6 +873,12 @@ task BigQueryLoadRawVepAndLofteeAnnotations {
         File raw_data_table_schema
     }
 
+    parameter_meta {
+        vep_loftee_raw_output: {
+            localization_optional: true
+        }
+    }
+
     command <<<
         # Prepend date, time and pwd to xtrace log entries.
         PS4='\D{+%F %T} \w $ '
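
Note on the new parameter_meta block: marking vep_loftee_raw_output as localization_optional lets Cromwell skip copying the (potentially large) VEP shard files onto the worker VM and hand the task their cloud paths instead; the reworked command block in the next hunk then fetches them explicitly with gcloud storage cp. A minimal sketch of that hand-off, with placeholder gs:// paths standing in for ~{sep=' ' vep_loftee_raw_output}:

    # Sketch only; the bucket and file names below are hypothetical placeholders.
    # With localization_optional the task receives cloud paths and copies them itself.
    files="gs://example-bucket/vep_shard_0001.tsv gs://example-bucket/vep_shard_0002.tsv"
    gcloud storage cp ${files} .
    ls -l ./*.tsv   # local copies now exist under their basenames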
@@ -891,26 +897,35 @@ task BigQueryLoadRawVepAndLofteeAnnotations {
             bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} ~{raw_data_table_schema}
         fi
 
-        for file in ~{sep=' ' vep_loftee_raw_output}
-        do
-            if [ -s $file ]
-            then
-                # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data
-                # - Remove lines beginning with '##'.
-                # - Remove the leading '#' from the one line that should be left with a single leading '#' so the line can
-                #   serve as a TSV header.
-                sed -E '/^##/d' $file | sed -E 's/^#//' > vep_loftee_load_file.txt
-
-                bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' --skip_leading_rows=1 \
-                    --null_marker="-" ~{dataset_name}.~{raw_data_table} vep_loftee_load_file.txt
-            else
-                echo "File $file is empty, skipping."
-            fi
-        done
+        num_rows=$(bq --apilog=false show --project_id=~{project_id} ~{dataset_name}.~{raw_data_table} --format json | jq -r .numRows)
+        if ((num_rows != 0))
+        then
+            echo "Found preexisting table with data, not adding more raw data."
+        else
+            echo "Raw data table is empty, copying VEP output to be loaded."
+            gcloud storage cp ~{sep=' ' vep_loftee_raw_output} .
+            for file in ~{sep=' ' vep_loftee_raw_output}
+            do
+                filename=$(basename $file)
+                if [ ! -e load_file.txt ]
+                then
+                    # Do a wee bit of processing of the raw output to create a load file for raw VEP + LOFTEE data
+                    # - Remove lines beginning with '##'.
+                    # - Remove the leading '#' from the one line that should be left with a single leading '#' so
+                    #   the line can serve as a TSV header.
+                    sed -E '/^##/d' $filename | sed -E 's/^#//' > load_file.txt
+                fi
+                grep -E -v '^#' $filename >> load_file.txt
+            done
+
+            bq --apilog=false load --project_id=~{project_id} --source_format=CSV --field_delimiter='\t' \
+                --skip_leading_rows=1 --null_marker="-" ~{dataset_name}.~{raw_data_table} load_file.txt
+        fi
     >>>
 
     runtime {
         docker: variants_docker
+        preemptible: 2
         memory: "7 GB"
         disks: "local-disk 1000 HDD"
     }
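
Note on the load-file assembly above: only the first shard contributes a header (the sed pair drops the '##' metadata lines and strips the leading '#' so the remaining line can serve as a TSV header), grep then appends data rows from each shard, and everything is loaded with a single bq load instead of one load per shard. A quick illustration of what the two text-processing steps produce, run on a hypothetical toy shard (the real inputs are the vep_loftee_raw_output files):

    # Toy VEP/LOFTEE-style shard; the content here is made up purely for illustration.
    printf '## VEP metadata line\n#Uploaded_variation\tLocation\tAllele\nrs1\tchr1:100\tA\n' > shard.tsv

    # Header handling: drop '##' lines, strip the leading '#' so the line works as a TSV header.
    sed -E '/^##/d' shard.tsv | sed -E 's/^#//'

    # Data handling: keep only non-'#' rows, suitable for appending to an existing load_file.txt.
    grep -E -v '^#' shard.tsv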
@@ -949,6 +964,12 @@ task BigQueryCookVepAndLofteeRawAnnotations {
             bq --apilog=false mk --expiration=$DATE --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} ~{cooked_data_table_schema}
         fi
 
+        num_rows=$(bq --apilog=false show --project_id=~{project_id} ~{dataset_name}.~{cooked_data_table} --format json | jq -r .numRows)
+        if ((num_rows != 0))
+        then
+            echo "Found preexisting table with data, not adding more cooked data."
+        else
+
         bq --apilog=false query --nouse_legacy_sql --destination_table=~{dataset_name}.~{cooked_data_table} --replace \
             --project_id=~{project_id} '
 
@@ -1021,6 +1042,7 @@ task BigQueryCookVepAndLofteeRawAnnotations {
         WHERE ROW_NUMBER = 1
 
         '
+        fi
 
     >>>
 
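Both tasks now check the destination table's row count up front and skip the load if data is already present, which keeps reruns (for example after the newly allowed preemptions) from appending duplicate rows. The guard pulled out as a standalone sketch, with placeholder project, dataset and table names:

    # Standalone sketch of the idempotency guard used by both tasks; names are placeholders.
    project_id="my-project"
    dataset_name="my_dataset"
    table="vep_loftee_raw"

    num_rows=$(bq --apilog=false show --project_id=${project_id} ${dataset_name}.${table} --format json | jq -r .numRows)
    if ((num_rows != 0))
    then
        echo "Found preexisting table with data, not adding more data."
    else
        echo "Table is empty, safe to load."
        # ... bq load / bq query steps would go here ...
    fi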