-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ingest-to-phylogenetic GitHub Action... [#14]
...to fully automate `ingest` and subsequent rebuild of `phylogenetic` when data has changed. Includes related configuration updates, as follows: * copy github action and associated build-configs from template * customize nextstrain-automation/config.yaml files for `ingest` and `phylogenetic` builds * add custom file loading to top-level Snakefiles in `ingest` and `phylogenetic` builds * update .gitignore to ignore newly-used `phylogenetic/data` * update phylogenetic/defaults/config.yaml to remove unused keys * update values of various `metadata` and `sequences` inputs/outputs in subsidiary 'phylogenetic/rules/*.smk' files
- Loading branch information
Showing
12 changed files
with
287 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
name: Ingest to phylogenetic | ||
|
||
defaults: | ||
run: | ||
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023: | ||
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell | ||
# | ||
# Completely spelling it out here so that GitHub can't change it out from under us | ||
# and we don't have to refer to the docs to know the expected behavior. | ||
shell: bash --noprofile --norc -eo pipefail {0} | ||
|
||
on: | ||
schedule: | ||
# Note times are in UTC, which is 1 or 2 hours behind CET depending on daylight savings. | ||
# | ||
# Note the actual runs might be late. | ||
    # Numerous people were confused about that, including me: | ||
# - https://github.community/t/scheduled-action-running-consistently-late/138025/11 | ||
# - https://github.com/github/docs/issues/3059 | ||
# | ||
# Note, '*' is a special character in YAML, so you have to quote this string. | ||
# | ||
# Docs: | ||
# - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule | ||
# | ||
# Tool that deciphers this particular format of crontab string: | ||
# - https://crontab.guru/ | ||
# | ||
# Runs at 5:30pm UTC (1:30pm EDT/10:30am PDT) since curation by NCBI happens on the East Coast. | ||
# We were running into invalid zip archive errors at 9am PDT, so hoping an hour-thirty | ||
# delay will lower the error frequency | ||
- cron: '30 17 * * *' | ||
|
||
workflow_dispatch: | ||
inputs: | ||
ingest_image: | ||
description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")' | ||
required: false | ||
phylogenetic_image: | ||
description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")' | ||
required: false | ||
|
||
jobs: | ||
ingest: | ||
permissions: | ||
id-token: write | ||
uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master | ||
secrets: inherit | ||
with: | ||
# Starting with the default docker runtime | ||
# We can migrate to AWS Batch when/if we need to for more resources or if | ||
# the job runs longer than the GH Action limit of 6 hours. | ||
runtime: docker | ||
env: | | ||
NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.ingest_image }} | ||
run: | | ||
nextstrain build \ | ||
ingest \ | ||
upload_all \ | ||
--configfile build-configs/nextstrain-automation/config.yaml | ||
# Specifying artifact name to differentiate ingest build outputs from | ||
# the phylogenetic build outputs | ||
artifact-name: ingest-build-output | ||
artifact-paths: | | ||
ingest/results/ | ||
ingest/benchmarks/ | ||
ingest/logs/ | ||
ingest/.snakemake/log/ | ||
# Check if ingest results include new data by checking for the cache | ||
# of the file with the results' Metadata.sh256sum (which should have been added within upload-to-s3) | ||
# GitHub will remove any cache entries that have not been accessed in over 7 days, | ||
# so if the workflow has not been run over 7 days then it will trigger phylogenetic. | ||
check-new-data: | ||
needs: [ingest] | ||
runs-on: ubuntu-latest | ||
outputs: | ||
cache-hit: ${{ steps.check-cache.outputs.cache-hit }} | ||
steps: | ||
- name: Get sha256sum | ||
id: get-sha256sum | ||
env: | ||
AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }} | ||
run: | | ||
s3_urls=( | ||
"s3://nextstrain-data/files/workflows/measles/metadata.tsv.zst" | ||
"s3://nextstrain-data/files/workflows/measles/sequences.fasta.zst" | ||
) | ||
# Code below is modified from ingest/upload-to-s3 | ||
# https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29 | ||
no_hash=0000000000000000000000000000000000000000000000000000000000000000 | ||
for s3_url in "${s3_urls[@]}"; do | ||
s3path="${s3_url#s3://}" | ||
bucket="${s3path%%/*}" | ||
key="${s3path#*/}" | ||
s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" | ||
echo "${s3_hash}" | tee -a ingest-output-sha256sum | ||
done | ||
- name: Check cache | ||
id: check-cache | ||
uses: actions/cache@v4 | ||
with: | ||
path: ingest-output-sha256sum | ||
key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }} | ||
lookup-only: true | ||
|
||
  phylogenetic: | ||
    needs: [check-new-data] | ||
    # Skip the rebuild only when the cache lookup in check-new-data reported a hit for the | ||
    # current checksums; any other value of the output lets the build run. | ||
    if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }} | ||
    permissions: | ||
      id-token: write | ||
    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master | ||
    secrets: inherit | ||
    with: | ||
      # Starting with the default docker runtime | ||
      # We can migrate to AWS Batch when/if we need to for more resources or if | ||
      # the job runs longer than the GH Action limit of 6 hours. | ||
      runtime: docker | ||
      env: | | ||
        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.phylogenetic_image }} | ||
      run: | | ||
        nextstrain build \ | ||
          phylogenetic \ | ||
            deploy_all \ | ||
            --configfile build-configs/nextstrain-automation/config.yaml | ||
      # Specifying artifact name to differentiate ingest build outputs from | ||
      # the phylogenetic build outputs | ||
      artifact-name: phylogenetic-build-output | ||
      artifact-paths: | | ||
        phylogenetic/auspice/ | ||
        phylogenetic/results/ | ||
        phylogenetic/benchmarks/ | ||
        phylogenetic/logs/ | ||
        phylogenetic/.snakemake/log/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# This configuration file should contain all required configuration parameters | ||
# for the ingest workflow to run with additional Nextstrain automation rules. | ||
|
||
# Custom rules to run as part of the Nextstrain automated workflow | ||
# The paths should be relative to the ingest directory. | ||
custom_rules: | ||
- build-configs/nextstrain-automation/upload.smk | ||
|
||
# Nextstrain CloudFront domain to ensure that we invalidate CloudFront after the S3 uploads | ||
# This is required as long as we are using the AWS CLI for uploads | ||
cloudfront_domain: "data.nextstrain.org" | ||
|
||
# Nextstrain AWS S3 Bucket with pathogen prefix | ||
# The "seasonal-cov" prefix is the pathogen repo name. | ||
s3_dst: "s3://nextstrain-data/files/workflows/seasonal-cov" | ||
|
||
# Mapping of files to upload | ||
files_to_upload: | ||
ncbi_229e.ndjson.zst: data/229e/ncbi.ndjson | ||
ncbi_hku1.ndjson.zst: data/hku1/ncbi.ndjson | ||
ncbi_nl63.ndjson.zst: data/nl63/ncbi.ndjson | ||
ncbi_oc43.ndjson.zst: data/oc43/ncbi.ndjson | ||
metadata_229e.tsv.zst: results/229e/metadata.tsv | ||
sequences_229e.fasta.zst: results/229e/sequences.fasta | ||
metadata_hku1.tsv.zst: results/hku1/metadata.tsv | ||
sequences_hku1.fasta.zst: results/hku1/sequences.fasta | ||
metadata_nl63.tsv.zst: results/nl63/metadata.tsv | ||
sequences_nl63.fasta.zst: results/nl63/sequences.fasta | ||
metadata_oc43.tsv.zst: results/oc43/metadata.tsv | ||
sequences_oc43.fasta.zst: results/oc43/sequences.fasta |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
""" | ||
This part of the workflow handles uploading files to AWS S3. | ||
Files to upload must be defined in the `files_to_upload` config param, where | ||
the keys are the remote files and the values are the local filepaths | ||
relative to the ingest directory. | ||
Produces a single file for each uploaded file: | ||
"results/upload/{remote_file}.upload" | ||
The rule `upload_all` can be used as a target to upload all files. | ||
""" | ||
|
||
import os | ||
|
||
slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ | ||
send_notifications = ( | ||
config.get("send_slack_notifications", False) and slack_envvars_defined | ||
) | ||
|
||
|
||
rule upload_to_s3: | ||
    # Upload a single file with the vendored `upload-to-s3` script. | ||
    # The `remote_file` wildcard selects the local path from config["files_to_upload"] | ||
    # and is also appended to config["s3_dst"] to form the destination. | ||
    input: | ||
        file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file], | ||
    output: | ||
        "results/upload/{remote_file}.upload", | ||
    params: | ||
        # `--quiet` is passed unless `send_notifications` is truthy, i.e. unless the | ||
        # `send_slack_notifications` config value is set and both Slack env vars exist. | ||
        quiet="" if send_notifications else "--quiet", | ||
        s3_dst=config["s3_dst"], | ||
        cloudfront_domain=config["cloudfront_domain"], | ||
    # stdout and stderr of the script are shown and also written to the output file via `tee`. | ||
    shell: | ||
        """ | ||
        ./vendored/upload-to-s3 \ | ||
            {params.quiet} \ | ||
            {input.file_to_upload:q} \ | ||
            {params.s3_dst:q}/{wildcards.remote_file:q} \ | ||
            {params.cloudfront_domain} 2>&1 | tee {output} | ||
        """ | ||
|
||
|
||
rule upload_all: | ||
    # Aggregate target: requests one "results/upload/<remote_file>.upload" file for every | ||
    # key in config["files_to_upload"], then touches a marker file as its own output. | ||
    input: | ||
        uploads=[ | ||
            f"results/upload/{remote_file}.upload" | ||
            for remote_file in config["files_to_upload"].keys() | ||
        ], | ||
    output: | ||
        touch("results/upload_all.done"), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
custom_rules: | ||
- build-configs/nextstrain-automation/deploy.smk | ||
|
||
deploy_url: "s3://nextstrain-data" |
18 changes: 18 additions & 0 deletions
18
phylogenetic/build-configs/nextstrain-automation/deploy.smk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
""" | ||
This part of the workflow handles automatic deployments of the seasonal-cov build. | ||
Uploads the build defined as the default output of the workflow through | ||
the `all` rule from the Snakefile. | ||
""" | ||
|
||
|
||
rule deploy_all: | ||
    # Takes every input of the `all` rule as its own input and uploads those files to | ||
    # config["deploy_url"] with `nextstrain remote upload`; the touched marker file is | ||
    # the rule's only output. | ||
    input: | ||
        *rules.all.input, | ||
    output: | ||
        touch("results/deploy_all.done"), | ||
    params: | ||
        deploy_url=config["deploy_url"], | ||
    shell: | ||
        """ | ||
        nextstrain remote upload {params.deploy_url} {input} | ||
        """ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters