Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automated WDL Testing #420

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/workflows/carrot_push.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Runs workflow tests from the branch when commits have been pushed to a PR.
# The workflows to be tested are specified by the "test_names"
# parameter as strings separated by spaces.

name: carrot-test-on-push

# NOTE(review): `on: [push]` fires on every push to every branch, not only
# pushes to PR branches — confirm this is intended, or narrow it with
# `on: pull_request` / branch filters.
on: [push]

jobs:
  publish-test:
    runs-on: ubuntu-latest
    steps:

      # Authenticate to GCP using a service-account key stored as a secret.
      # https://github.com/google-github-actions/setup-gcloud#service-account-key-json
      - id: auth
        uses: google-github-actions/auth@v0
        with:
          credentials_json: ${{ secrets.CARROT_SA_KEY }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v0

      # Publish a JSON message describing this commit to the CARROT
      # Pub/Sub topic so the test runner can pick it up.
      # https://cloud.google.com/pubsub/docs/publisher#publish_messages
      - name: Use gcloud CLI
        run: >
          gcloud pubsub topics publish ${{ secrets.CARROT_TOPIC_NAME }}
          --message='{"source":"github",
          "author":"${{ github.triggering_actor }}",
          "owner":"${{ github.repository_owner }}",
          "wdl_tests_dir":"wdl_test",
          "repo_url":"${{ github.repositoryUrl }}",
          "branch_name":"${{ github.ref_name }}",
          "commit":"${{ github.sha }}",
          "repo":"${{ github.repository }}",
          "test_names": "PBCCSWholeGenome",
          "issue_number":"",
          "software_name":""
          }'
36 changes: 36 additions & 0 deletions .github/workflows/carrot_weekly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Runs all workflow tests every Sunday.
# An empty "test_names" means the runner executes every test.

name: carrot-test-weekly

on:
  schedule:
    - cron: '0 7 * * 0'  # Run every Sunday at 7am (UTC)

jobs:
  publish-test:
    runs-on: ubuntu-latest
    steps:

      # Authenticate to GCP using a service-account key stored as a secret.
      # https://github.com/google-github-actions/setup-gcloud#service-account-key-json
      - id: auth
        uses: google-github-actions/auth@v0
        with:
          credentials_json: ${{ secrets.CARROT_SA_KEY }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v0

      # Publish a JSON message describing this commit to the CARROT
      # Pub/Sub topic so the test runner can pick it up.
      # https://cloud.google.com/pubsub/docs/publisher#publish_messages
      - name: Use gcloud CLI
        run: >
          gcloud pubsub topics publish ${{ secrets.CARROT_TOPIC_NAME }}
          --message='{"source":"github",
          "author":"${{ github.triggering_actor }}",
          "owner":"${{ github.repository_owner }}",
          "wdl_tests_dir":"wdl_test",
          "repo_url":"${{ github.repositoryUrl }}",
          "branch_name":"${{ github.ref_name }}",
          "commit":"${{ github.sha }}",
          "repo":"${{ github.repository }}",
          "test_names": "",
          "issue_number":"",
          "software_name":""
          }'
53 changes: 53 additions & 0 deletions wdl_test/FileMetadata.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
version 1.0

task CheckFileUpdatedDateGCP {

    meta {
        description: "Checks if file was updated within a specified time (default: 1 day)"
        note: "Specific to GCP"
    }

    input {
        Array[String] file_paths  # gs:// object paths to check
        Int days_back = 1         # maximum allowed object age, in days
        # NOTE(review): per PR feedback, pin this to an immutable tag/digest
        # rather than an arbitrary caller-supplied image name, for security.
        # Only `gsutil` needs to be available in the image.
        String image_to_use       # docker image providing gsutil
    }

    # For each gs:// path: read the object's "Update time" from `gsutil stat`,
    # normalize it to YYYY-MM-DD, and require it to be on/after the cutoff.
    command <<<
        set -euo pipefail

        GS_BUCKET_PATHS=("~{sep='" "' file_paths}")
        MINIMUM_DATE=$(date -d '~{days_back} day ago' +%Y-%m-%d)
        return_code=0
        echo -e "FileName\tFileUpdatedDate\tMinimumUpdateDate\tValid"

        for GS_FILE in "${GS_BUCKET_PATHS[@]}"
        do
            # `gsutil stat` prints e.g. "Update time: Tue, 08 Mar 2022 23:27:35 GMT".
            # Take everything after the first ':' and let date(1) reparse it,
            # instead of the original's awk-built command piped into `bash`.
            FILE_DATE=$(date -d "$(gsutil stat "${GS_FILE}" | grep 'Update time' | cut -d: -f2-)" +%Y-%m-%d)

            # Lexicographic comparison is safe for YYYY-MM-DD strings.
            # Fixed: the original used `>=`, which is not a valid `[[ ]]`
            # operator and causes a conditional-expression syntax error.
            if [[ ! "${FILE_DATE}" < "${MINIMUM_DATE}" ]] ; then
                echo -e "${GS_FILE}\t${FILE_DATE}\t${MINIMUM_DATE}\tTrue"
            else
                echo "ERROR: Date for ${GS_FILE}" >&2
                echo "${FILE_DATE} is less than ${MINIMUM_DATE}" >&2

                echo -e "${GS_FILE}\t${FILE_DATE}\t${MINIMUM_DATE}\tFalse"
                return_code=1
            fi
        done

        if [ ${return_code} == 1 ]; then
            exit 1
        fi

    >>>
    runtime {
        docker: image_to_use
    }
    output {
        # TSV report (one row per file) captured from stdout.
        File file_date_result = stdout()
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"PBCCSWholeGenome.aligned_bais": [
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64020e_220303_2002560.01.downsample.bai",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64218e_220328_1613170.01.downsample.bai",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64218e_220330_0132120.01.downsample.bai",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220215_1930240.01.downsample.bai",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220218_1550340.01.downsample.bai",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220220_0052040.01.downsample.bai",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220303_1959350.01.downsample.bai"
],

"PBCCSWholeGenome.aligned_bams": [
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64020e_220303_2002560.01.downsample.bam",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64218e_220328_1613170.01.downsample.bam",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64218e_220330_0132120.01.downsample.bam",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220215_1930240.01.downsample.bam",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220218_1550340.01.downsample.bam",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220220_0052040.01.downsample.bam",
"gs://broad-dsp-lrma-pipeline-test-data/NA24385/downsampled_bam_01/m64297e_220303_1959350.01.downsample.bam"
]
}
229 changes: 229 additions & 0 deletions wdl_test/PBCCSWholeGenome/basic_output_valdation/eval.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
version 1.0

workflow eval_workflow {

    # Checker workflow for PBCCSWholeGenome outputs: verifies that summary
    # metrics are non-zero and that output files exist and are fresh in GCS.
    input {

        # Floats: summary metrics produced by the pipeline under test.
        Float aligned_read_length_N50
        Float aligned_num_reads
        Float aligned_frac_bases
        Float aligned_num_bases
        Float aligned_read_length_stdev
        Float average_identity
        Float aligned_est_fold_cov
        Float aligned_read_length_mean
        Float median_identity
        Float aligned_read_length_median

        # Files: gs:// paths of pipeline outputs, passed as Strings so the
        # objects are inspected in place rather than localized.
        String pbsv_tbi
        String sniffles_vcf
        String clair_gtbi
        String dvp_tbi
        String dvp_g_tbi
        String dvp_vcf
        String clair_vcf
        String pbsv_vcf
        String aligned_pbi
        String aligned_bai
        String dvp_phased_vcf
        #String bed_cov_summary # this ends up being 'null' so not including in array of files for now
        String dvp_phased_tbi
        String clair_tbi
        String clair_gvcf
        String aligned_bam
        String sniffles_tbi
        String dvp_g_vcf

    }

    Array[Float] workflow_out_floats = [
        aligned_read_length_N50,
        aligned_num_reads,
        aligned_frac_bases,
        aligned_num_bases,
        aligned_read_length_stdev,
        average_identity,
        aligned_est_fold_cov,
        aligned_read_length_mean,
        median_identity,
        aligned_read_length_median
    ]
    Array[String] workflow_out_files = [
        pbsv_tbi,
        sniffles_vcf,
        clair_gtbi,
        dvp_tbi,
        dvp_g_tbi,
        dvp_vcf,
        clair_vcf,
        pbsv_vcf,
        aligned_pbi,
        aligned_bai,
        dvp_phased_vcf,
        dvp_phased_tbi,
        clair_tbi,
        clair_gvcf,
        aligned_bam,
        sniffles_tbi,
        dvp_g_vcf
    ]

    String ubuntu_image = "marketplace.gcr.io/google/ubuntu2004:latest"
    String gcloud_slim_image = "gcr.io/google.com/cloudsdktool/cloud-sdk:slim"

    ################
    ## Comparing test and expected Floats
    ## Json inputs like "eval_workflow.workflow_out_floats": [{"Left":1.0,"Right":1.0},{"Left":2.2,"Right":3.2}]
    ## With Array[Pair[Float,Float]] workflow_out_floats

    # Array[Boolean] scattered_float_match = []
    # scatter (pair in workflow_out_floats){
    #     if (pair.left != pair.right) {
    #         Boolean scattered_float_match = false
    #         call CheckerWorkflowError{
    #             input:
    #                 message = "Expected Float "+pair.right+" but got "+pair.left,
    #                 image_to_use = ubuntu_image
    #         }
    #     }
    # }
    ################

    ## Confirm each output float is non-zero; fail the checker otherwise.
    # Fixed: removed the workflow-level `Array[Boolean] scattered_float_match = []`
    # declaration — declarations inside a scatter are exported to the workflow
    # scope as arrays, so it collided with the `Boolean scattered_float_match`
    # declared below (duplicate-declaration error).
    scatter (in_float in workflow_out_floats){
        if (in_float == 0.0) {
            Boolean scattered_float_match = false
            call CheckerWorkflowError{
                input:
                    message = "Error: Expected a non-zero float but got "+in_float+" .",
                    image_to_use = ubuntu_image
            }
        }
    }

    # Validate the output files in GCS (see task note: this checks MD5s).
    call CheckFileUpdatedDateGCP {
        input:
            file_paths = workflow_out_files,
            image_to_use = gcloud_slim_image
    }
}

task CheckFileUpdatedDateGCP {

    # NOTE(review): despite its name (kept so the existing call in
    # eval_workflow still resolves), this task validates that each file's MD5
    # differs from the empty-file MD5 — consider renaming task and call together.
    meta {
        description: "Checks that each GCS file's MD5 is not the md5sum of an empty file"
        note: "Specific to GCP"
    }

    input {
        Array[String] file_paths  # gs:// object paths to check
        Int days_back = 1         # unused by this body; kept for interface compatibility
        String image_to_use       # docker image providing gsutil
    }

    # For each gs:// path: read its MD5 via `gsutil hash` and fail if it
    # matches the well-known md5sum of a zero-byte file.
    command <<<
        set -euo pipefail

        GS_BUCKET_PATHS=("~{sep='" "' file_paths}")
        EMPTY_MD5="d41d8cd98f00b204e9800998ecf8427e"
        return_code=0
        echo -e "FilePath\tFileMD5\tValid"

        for GS_FILE in "${GS_BUCKET_PATHS[@]}"
        do
            # `gsutil hash -hm` prints a line like "Hash (md5): <hex>";
            # take the last whitespace-delimited field, which is robust to
            # the mix of tabs/spaces gsutil uses for alignment.
            FILE_MD5=$(gsutil hash -hm "${GS_FILE}" | grep "md5" | awk '{print $NF}')

            if [[ "${FILE_MD5}" != "${EMPTY_MD5}" ]] ; then
                echo -e "${GS_FILE}\t${FILE_MD5}\tTrue"
            else
                echo "ERROR: MD5 for ${GS_FILE} equals md5sum of an empty file: ${EMPTY_MD5}" >&2

                echo -e "${GS_FILE}\t${FILE_MD5}\tFalse"
                return_code=1
            fi
        done

        if [ ${return_code} == 1 ]; then
            exit 1
        fi

    >>>
    runtime {
        docker: image_to_use
    }
    output {
        # TSV report captured from stdout (name kept for interface
        # compatibility; it contains MD5 results, not dates).
        File file_date_result = stdout()
    }
}

task CheckerWorkflowError {

    # Deliberately failing task: logs `message` and exits non-zero so the
    # enclosing checker workflow reports an error.
    input {
        String message       # human-readable error description to surface in the log
        String image_to_use  # any image with a POSIX shell
    }
    command <<<
        set -euo pipefail

        echo "~{message}"
        exit 1

    >>>
    runtime {
        docker: image_to_use
    }
    output {
        # Fixed: was `Boolean errmessage = stdout()` — stdout() yields a File,
        # which does not coerce to Boolean and fails WDL validation.
        File errmessage = stdout()
    }
}

# NOTE(review): this task appears to be unused scaffolding for the
# commented-out pairwise float comparison in eval_workflow — confirm whether
# it should be completed or removed.
task ValidFloatOutput {

input {
# NOTE(review): an Array[Pair[Float,Float]] cannot be interpolated directly
# with ~{...} in a command (no serialization for Pair in a placeholder) —
# this likely fails validation; verify with the target engine.
Array[Pair[Float,Float]] workflow_out_floats
String image_to_use
}
command <<<

echo ~{workflow_out_floats}

>>>
runtime {
docker: image_to_use
}
output {
# NOTE(review): read_boolean expects stdout to contain exactly "true" or
# "false"; the echoed array above would not satisfy that — confirm intent.
Boolean comparison_result = read_boolean(stdout())
}
}

task ValidMd5SumOutput {

    # Compares the md5sum of `data_file` against `expectedMd5sum` and writes
    # "true" or "false" to stdout.
    input {
        File data_file         # file whose checksum is verified
        String expectedMd5sum  # expected 32-character hex MD5 digest
    }
    command <<<
        set -euo pipefail

        # Fixed: the original hashed a hard-coded "helloworld.txt" instead of
        # the input file, used double-quoted awk (so the shell expanded $1/$8
        # to empty before awk ran), piped into `read` (subshell — the variable
        # was lost), and wrote `[$filemd5 == ...]` without the required spaces.
        filemd5=$(md5sum "~{data_file}" | awk '{print $1}')

        if [ "${filemd5}" == "~{expectedMd5sum}" ]
        then
            echo "true"
        else
            echo "false"
        fi
    >>>
    runtime {
        docker: "quay.io/agduncan94/my-md5sum"
    }
    output {
        # "true"/"false" verdict captured from stdout.
        File comparison_result = stdout()
    }
}
Loading
Loading