From 35d3cd4b32aa76eab236763b1d30991e5d36e6c8 Mon Sep 17 00:00:00 2001
From: John SJ Anderson
Date: Mon, 22 Jul 2024 15:50:18 -0700
Subject: [PATCH] Add ingest-to-phylogenetic GitHub Action... [#14]

...to fully automate `ingest` and subsequent rebuild of
`phylogenetic` when data has changed.

Includes related configuration updates, as follows:

* copy github action and associated build-configs from template
* customize nextstrain-automation/config.yaml files for `ingest` and
  `phylogenetic` builds
* add custom file loading to top-level Snakefiles in `ingest` and
  `phylogenetic` builds
* update .gitignore to ignore newly-used `phylogenetic/data`
* update phylogenetic/defaults/config.yaml to remove unused keys
* update values of various `metadata` and `sequences` inputs/outputs
  in subsidiary 'phylogenetic/rules/*.smk' files
---
 .github/workflows/ingest-to-phylogenetic.yaml | 140 ++++++++++++++++++
 .gitignore                                    |   1 +
 ingest/Snakefile                              |   7 +
 .../nextstrain-automation/config.yaml         |  30 ++++
 .../nextstrain-automation/upload.smk          |  48 ++++++
 phylogenetic/Snakefile                        |   7 +
 .../nextstrain-automation/config.yaml         |   4 +
 .../nextstrain-automation/deploy.smk          |  18 +++
 phylogenetic/defaults/config.yaml             |   8 -
 phylogenetic/rules/construct_phylogeny.smk    |   2 +-
 phylogenetic/rules/export.smk                 |   2 +-
 phylogenetic/rules/prepare_sequences.smk      |  32 +++-
 12 files changed, 287 insertions(+), 12 deletions(-)
 create mode 100644 .github/workflows/ingest-to-phylogenetic.yaml
 create mode 100644 ingest/build-configs/nextstrain-automation/config.yaml
 create mode 100644 ingest/build-configs/nextstrain-automation/upload.smk
 create mode 100644 phylogenetic/build-configs/nextstrain-automation/config.yaml
 create mode 100644 phylogenetic/build-configs/nextstrain-automation/deploy.smk

diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
new file mode 100644
index 0000000..6f9d5f6
--- /dev/null
+++ b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -0,0 +1,140 @@
+name: Ingest to phylogenetic
+
+defaults:
+  run:
+    # This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+on:
+  schedule:
+    # Note times are in UTC, which is 1 or 2 hours behind CET depending on daylight savings.
+    #
+    # Note the actual runs might be late.
+    # Numerous people were confused, about that, including me:
+    # - https://github.community/t/scheduled-action-running-consistently-late/138025/11
+    # - https://github.com/github/docs/issues/3059
+    #
+    # Note, '*' is a special character in YAML, so you have to quote this string.
+    #
+    # Docs:
+    # - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
+    #
+    # Tool that deciphers this particular format of crontab string:
+    # - https://crontab.guru/
+    #
+    # Runs at 5:30pm UTC (1:30pm EDT/10:30am PDT) since curation by NCBI happens on the East Coast.
+    # We were running into invalid zip archive errors at 9am PDT, so hoping an hour-thirty
+    # delay will lower the error frequency
+    - cron: '30 17 * * *'
+
+  workflow_dispatch:
+    inputs:
+      ingest_image:
+        description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
+        required: false
+      phylogenetic_image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+
+jobs:
+  ingest:
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # Starting with the default docker runtime
+      # We can migrate to AWS Batch when/if we need to for more resources or if
+      # the job runs longer than the GH Action limit of 6 hours.
+      runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.ingest_image }}
+      run: |
+        nextstrain build \
+          ingest \
+            upload_all \
+            --configfile build-configs/nextstrain-automation/config.yaml
+      # Specifying artifact name to differentiate ingest build outputs from
+      # the phylogenetic build outputs
+      artifact-name: ingest-build-output
+      artifact-paths: |
+        ingest/results/
+        ingest/benchmarks/
+        ingest/logs/
+        ingest/.snakemake/log/
+
+  # Check if ingest results include new data by checking for the cache
+  # of the file with the results' Metadata.sha256sum (which should have been added within upload-to-s3)
+  # GitHub will remove any cache entries that have not been accessed in over 7 days,
+  # so if the workflow has not been run over 7 days then it will trigger phylogenetic.
+  check-new-data:
+    needs: [ingest]
+    runs-on: ubuntu-latest
+    outputs:
+      cache-hit: ${{ steps.check-cache.outputs.cache-hit }}
+    steps:
+      - name: Get sha256sum
+        id: get-sha256sum
+        env:
+          AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }}
+        run: |
+          # Unquoted braces expand to one URL per virus uploaded by the ingest job
+          s3_urls=(
+            "s3://nextstrain-data/files/workflows/seasonal-cov/metadata_"{229e,hku1,nl63,oc43}".tsv.zst"
+            "s3://nextstrain-data/files/workflows/seasonal-cov/sequences_"{229e,hku1,nl63,oc43}".fasta.zst"
+          )
+
+          # Code below is modified from ingest/upload-to-s3
+          # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29
+
+          no_hash=0000000000000000000000000000000000000000000000000000000000000000
+
+          for s3_url in "${s3_urls[@]}"; do
+            s3path="${s3_url#s3://}"
+            bucket="${s3path%%/*}"
+            key="${s3path#*/}"
+
+            s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
+            echo "${s3_hash}" | tee -a ingest-output-sha256sum
+          done
+
+      - name: Check cache
+        id: check-cache
+        uses: actions/cache@v4
+        with:
+          path: ingest-output-sha256sum
+          key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
+          lookup-only: true
+
+  phylogenetic:
+    needs: [check-new-data]
+    if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # Starting with the default docker runtime
+      # We can migrate to AWS Batch when/if we need to for more resources or if
+      # the job runs longer than the GH Action limit of 6 hours.
+      runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.phylogenetic_image }}
+      run: |
+        nextstrain build \
+          phylogenetic \
+            deploy_all \
+            --configfile build-configs/nextstrain-automation/config.yaml
+      # Specifying artifact name to differentiate ingest build outputs from
+      # the phylogenetic build outputs
+      artifact-name: phylogenetic-build-output
+      artifact-paths: |
+        phylogenetic/auspice/
+        phylogenetic/results/
+        phylogenetic/benchmarks/
+        phylogenetic/logs/
+        phylogenetic/.snakemake/log/
diff --git a/.gitignore b/.gitignore
index 81e9728..8cbbc7c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ ingest/logs
 ingest/results
 phylogenetic/auspice
 phylogenetic/benchmarks
+phylogenetic/data
 phylogenetic/logs
 phylogenetic/results
 
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 6e38657..398171e 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -45,3 +45,10 @@ rule clean:
         """
         rm -rfv {params.targets}
         """
+
+
+# Import custom rules provided via the config.
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
new file mode 100644
index 0000000..fa676d9
--- /dev/null
+++ b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -0,0 +1,30 @@
+# This configuration file should contain all required configuration parameters
+# for the ingest workflow to run with additional Nextstrain automation rules.
+
+# Custom rules to run as part of the Nextstrain automated workflow
+# The paths should be relative to the ingest directory.
+custom_rules:
+  - build-configs/nextstrain-automation/upload.smk
+
+# Nextstrain CloudFront domain to ensure that we invalidate CloudFront after the S3 uploads
+# This is required as long as we are using the AWS CLI for uploads
+cloudfront_domain: "data.nextstrain.org"
+
+# Nextstrain AWS S3 Bucket with pathogen prefix
+# Replace with the pathogen repo name.
+s3_dst: "s3://nextstrain-data/files/workflows/seasonal-cov"
+
+# Mapping of files to upload
+files_to_upload:
+  ncbi_229e.ndjson.zst: data/229e/ncbi.ndjson
+  ncbi_hku1.ndjson.zst: data/hku1/ncbi.ndjson
+  ncbi_nl63.ndjson.zst: data/nl63/ncbi.ndjson
+  ncbi_oc43.ndjson.zst: data/oc43/ncbi.ndjson
+  metadata_229e.tsv.zst: results/229e/metadata.tsv
+  sequences_229e.fasta.zst: results/229e/sequences.fasta
+  metadata_hku1.tsv.zst: results/hku1/metadata.tsv
+  sequences_hku1.fasta.zst: results/hku1/sequences.fasta
+  metadata_nl63.tsv.zst: results/nl63/metadata.tsv
+  sequences_nl63.fasta.zst: results/nl63/sequences.fasta
+  metadata_oc43.tsv.zst: results/oc43/metadata.tsv
+  sequences_oc43.fasta.zst: results/oc43/sequences.fasta
diff --git a/ingest/build-configs/nextstrain-automation/upload.smk b/ingest/build-configs/nextstrain-automation/upload.smk
new file mode 100644
index 0000000..0f99c12
--- /dev/null
+++ b/ingest/build-configs/nextstrain-automation/upload.smk
@@ -0,0 +1,48 @@
+"""
+This part of the workflow handles uploading files to AWS S3.
+
+Files to upload must be defined in the `files_to_upload` config param, where
+the keys are the remote files and the values are the local filepaths
+relative to the ingest directory.
+
+Produces a single file for each uploaded file:
+    "results/upload/{remote_file}.upload"
+
+The rule `upload_all` can be used as a target to upload all files.
+"""
+
+import os
+
+slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ
+send_notifications = (
+    config.get("send_slack_notifications", False) and slack_envvars_defined
+)
+
+
+rule upload_to_s3:
+    input:
+        file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file],
+    output:
+        "results/upload/{remote_file}.upload",
+    params:
+        quiet="" if send_notifications else "--quiet",
+        s3_dst=config["s3_dst"],
+        cloudfront_domain=config["cloudfront_domain"],
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            {params.quiet} \
+            {input.file_to_upload:q} \
+            {params.s3_dst:q}/{wildcards.remote_file:q} \
+            {params.cloudfront_domain} 2>&1 | tee {output}
+        """
+
+
+rule upload_all:
+    input:
+        uploads=[
+            f"results/upload/{remote_file}.upload"
+            for remote_file in config["files_to_upload"].keys()
+        ],
+    output:
+        touch("results/upload_all.done"),
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 3ddfda3..b2536b6 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -26,3 +26,10 @@ rule clean:
         """
         rm -rfv {params.targets}
         """
+
+
+# Import custom rules provided via the config.
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
diff --git a/phylogenetic/build-configs/nextstrain-automation/config.yaml b/phylogenetic/build-configs/nextstrain-automation/config.yaml
new file mode 100644
index 0000000..e549405
--- /dev/null
+++ b/phylogenetic/build-configs/nextstrain-automation/config.yaml
@@ -0,0 +1,4 @@
+custom_rules:
+  - build-configs/nextstrain-automation/deploy.smk
+
+deploy_url: "s3://nextstrain-data"
diff --git a/phylogenetic/build-configs/nextstrain-automation/deploy.smk b/phylogenetic/build-configs/nextstrain-automation/deploy.smk
new file mode 100644
index 0000000..c4d4423
--- /dev/null
+++ b/phylogenetic/build-configs/nextstrain-automation/deploy.smk
@@ -0,0 +1,18 @@
+"""
+This part of the workflow handles automatic deployments of the seasonal-cov builds.
+Uploads the build defined as the default output of the workflow through
+the `all` rule from Snakefile
+"""
+
+
+rule deploy_all:
+    input:
+        *rules.all.input,
+    output:
+        touch("results/deploy_all.done"),
+    params:
+        deploy_url=config["deploy_url"],
+    shell:
+        """
+        nextstrain remote upload {params.deploy_url} {input}
+        """
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
index fea179f..4e0b6b7 100644
--- a/phylogenetic/defaults/config.yaml
+++ b/phylogenetic/defaults/config.yaml
@@ -33,9 +33,7 @@ viruses:
 229e:
   reference: "defaults/229e/reference.fasta"
   genemap: "defaults/229e/genemap.gff"
-  metadata: "../ingest/results/229e/metadata.tsv"
   prepare_sequences:
-    sequences: "../ingest/results/229e/sequences.fasta"
     group_by: "country"
     subsample_max_sequences: 4000
     min_length: 20000
@@ -50,9 +48,7 @@ viruses:
 nl63:
   reference: "defaults/nl63/reference.fasta"
   genemap: "defaults/nl63/genemap.gff"
-  metadata: "../ingest/results/nl63/metadata.tsv"
   prepare_sequences:
-    sequences: "../ingest/results/nl63/sequences.fasta"
     group_by: "country"
     subsample_max_sequences: 4000
     min_length: 20000
@@ -67,9 +63,7 @@ nl63:
 oc43:
   reference: "defaults/oc43/reference.fasta"
   genemap: "defaults/oc43/genemap.gff"
-  metadata: "../ingest/results/oc43/metadata.tsv"
   prepare_sequences:
-    sequences: "../ingest/results/oc43/sequences.fasta"
     group_by: "country"
     subsample_max_sequences: 4000
     min_length: 20000
@@ -84,9 +78,7 @@ oc43:
 hku1:
   reference: "defaults/hku1/reference.fasta"
   genemap: "defaults/hku1/genemap.gff"
-  metadata: "../ingest/results/hku1/metadata.tsv"
   prepare_sequences:
-    sequences: "../ingest/results/hku1/sequences.fasta"
     group_by: "country"
     subsample_max_sequences: 4000
     min_length: 20000
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
index 5fadecd..849e0f3 100644
--- a/phylogenetic/rules/construct_phylogeny.smk
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -30,7 +30,7 @@ rule refine:
     input:
         tree="results/{virus}/tree_raw.nwk",
         alignment="results/{virus}/aligned.fasta",
-        metadata=lambda wildcards: config[wildcards.virus]["metadata"],
+        metadata="data/{virus}/metadata.tsv",
     output:
         tree="results/{virus}/tree.nwk",
         node_data="results/{virus}/branch_lengths.json",
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index 8149395..28f29a9 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -9,7 +9,7 @@ tree and at least one node data JSON.
 rule export:
     input:
         tree="results/{virus}/tree.nwk",
-        metadata=lambda wildcards: config[wildcards.virus]["metadata"],
+        metadata="data/{virus}/metadata.tsv",
         branch_lengths="results/{virus}/branch_lengths.json",
         nt_muts="results/{virus}/nt_muts.json",
         aa_muts="results/{virus}/aa_muts.json",
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index 3be3a0e..018daa0 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -11,10 +11,38 @@ and will produce an aligned FASTA file of subsampled sequences as an output.
 """
 
 
+rule download:
+    output:
+        sequences="data/{virus}/sequences.fasta.zst",
+        metadata="data/{virus}/metadata.tsv.zst",
+    params:
+        sequences_url="https://data.nextstrain.org/files/workflows/seasonal-cov/sequences_{virus}.fasta.zst",
+        metadata_url="https://data.nextstrain.org/files/workflows/seasonal-cov/metadata_{virus}.tsv.zst",
+    shell:
+        """
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+        """
+
+
+rule decompress:
+    input:
+        sequences="data/{virus}/sequences.fasta.zst",
+        metadata="data/{virus}/metadata.tsv.zst",
+    output:
+        sequences="data/{virus}/sequences.fasta",
+        metadata="data/{virus}/metadata.tsv",
+    shell:
+        """
+        zstd -d -c {input.sequences} > {output.sequences}
+        zstd -d -c {input.metadata} > {output.metadata}
+        """
+
+
 rule filter:
     input:
-        sequences=lambda wildcards: config[wildcards.virus]["prepare_sequences"]["sequences"],
-        metadata=lambda wildcards: config[wildcards.virus]["metadata"],
+        sequences="data/{virus}/sequences.fasta",
+        metadata="data/{virus}/metadata.tsv",
         exclude="defaults/{virus}/dropped_strains.txt",
     output:
         sequences="results/{virus}/filtered.fasta",