Add ingest-to-phylogenetic GitHub Action... [#14]

...to fully automate `ingest` and subsequent rebuild of `phylogenetic` when data has changed. Includes related configuration updates, as follows: * copy github action and associated build-configs from template * customize nextstrain-automation/config.yaml files for `ingest` and `phylogenetic` builds * add custom file loading to top-level Snakefiles in `ingest` and `phylogenetic` builds * update .gitignore to ignore newly-used `phylogenetic/data` * update phylogenetic/defaults/config.yaml to remove unused keys * update values of various `metadata` and `sequences` inputs/outputs in subsidiary 'phylogenetic/rules/*.smk' files
nextstrain · Jul 22, 2024 · 35d3cd4 · 35d3cd4
1 parent 77b2720
commit 35d3cd4
Show file tree

Hide file tree

Showing 12 changed files with 287 additions and 12 deletions.
diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml
@@ -0,0 +1,140 @@
+name: Ingest to phylogenetic
+
+defaults:
+  run:
+    # This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
+    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
+    #
+    # Completely spelling it out here so that GitHub can't change it out from under us
+    # and we don't have to refer to the docs to know the expected behavior.
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+on:
+  schedule:
+    # Note times are in UTC, which is 1 or 2 hours behind CET depending on daylight savings.
+    #
+    # Note the actual runs might be late.
+    # Numerous people were confused about that, including me:
+    #  - https://github.community/t/scheduled-action-running-consistently-late/138025/11
+    #  - https://github.com/github/docs/issues/3059
+    #
+    # Note, '*' is a special character in YAML, so you have to quote this string.
+    #
+    # Docs:
+    #  - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
+    #
+    # Tool that deciphers this particular format of crontab string:
+    #  - https://crontab.guru/
+    #
+    # Runs at 5:30pm UTC (1:30pm EDT/10:30am PDT) since curation by NCBI happens on the East Coast.
+    # We were running into invalid zip archive errors at 9am PDT, so hoping an hour-thirty
+    # delay will lower the error frequency
+    - cron: '30 17 * * *'
+
+  workflow_dispatch:
+    inputs:
+      ingest_image:
+        description: 'Specific container image to use for ingest workflow (will override the default of "nextstrain build")'
+        required: false
+      phylogenetic_image:
+        description: 'Specific container image to use for phylogenetic workflow (will override the default of "nextstrain build")'
+        required: false
+
+jobs:
+  ingest:  # runs the ingest workflow's `upload_all` target through the shared pathogen-repo-build workflow
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # Use the default Docker runtime for now.
+      # We can migrate to AWS Batch if we need more resources or if
+      # the job runs longer than the GitHub Actions limit of 6 hours.
+      runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.ingest_image }}
+      run: |
+        nextstrain build \
+          ingest \
+            upload_all \
+            --configfile build-configs/nextstrain-automation/config.yaml
+      # Name the artifact explicitly so the ingest build outputs can be told
+      # apart from the phylogenetic build outputs.
+      artifact-name: ingest-build-output
+      artifact-paths: |
+        ingest/results/
+        ingest/benchmarks/
+        ingest/logs/
+        ingest/.snakemake/log/
+
+  # Check if ingest results include new data by checking for the cache
+  # of the file with the results' Metadata.sha256sum (which should have been added within upload-to-s3).
+  # GitHub will remove any cache entries that have not been accessed in over 7 days,
+  # so if the workflow has not been run over 7 days then it will trigger phylogenetic.
+  check-new-data:
+    needs: [ingest]
+    runs-on: ubuntu-latest
+    outputs:
+      cache-hit: ${{ steps.check-cache.outputs.cache-hit }}
+    steps:
+      - name: Get sha256sum
+        id: get-sha256sum
+        env:
+          AWS_DEFAULT_REGION: ${{ vars.AWS_DEFAULT_REGION }}
+        run: |
+          # Per-virus files that ingest uploads for this repo (see `s3_dst`/`files_to_upload` in the ingest automation config).
+          s3_prefix="s3://nextstrain-data/files/workflows/seasonal-cov"
+          s3_urls=()
+          for virus in 229e hku1 nl63 oc43; do
+            s3_urls+=("${s3_prefix}/metadata_${virus}.tsv.zst" "${s3_prefix}/sequences_${virus}.fasta.zst")
+          done
+
+          # Code below is modified from ingest/upload-to-s3
+          # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29
+          no_hash=0000000000000000000000000000000000000000000000000000000000000000
+
+          for s3_url in "${s3_urls[@]}"; do
+            s3path="${s3_url#s3://}"
+            bucket="${s3path%%/*}"
+            key="${s3path#*/}"
+
+            s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")"
+            echo "${s3_hash}" | tee -a ingest-output-sha256sum
+          done
+
+      - name: Check cache
+        id: check-cache
+        uses: actions/cache@v4
+        with:
+          path: ingest-output-sha256sum
+          key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }}
+          lookup-only: true
+
+  phylogenetic:  # runs the phylogenetic workflow's `deploy_all` target through the shared pathogen-repo-build workflow
+    needs: [check-new-data]
+    if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }}  # skip when the checksum cache was hit, i.e. no new ingest data
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # Use the default Docker runtime for now.
+      # We can migrate to AWS Batch if we need more resources or if
+      # the job runs longer than the GitHub Actions limit of 6 hours.
+      runtime: docker
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.phylogenetic_image }}
+      run: |
+        nextstrain build \
+          phylogenetic \
+            deploy_all \
+            --configfile build-configs/nextstrain-automation/config.yaml
+      # Name the artifact explicitly so the phylogenetic build outputs can be
+      # told apart from the ingest build outputs.
+      artifact-name: phylogenetic-build-output
+      artifact-paths: |
+        phylogenetic/auspice/
+        phylogenetic/results/
+        phylogenetic/benchmarks/
+        phylogenetic/logs/
+        phylogenetic/.snakemake/log/
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ ingest/logs
 ingest/results
 phylogenetic/auspice
 phylogenetic/benchmarks
+phylogenetic/data
 phylogenetic/logs
 phylogenetic/results
 

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -45,3 +45,10 @@ rule clean:
         """
         rm -rfv {params.targets}
         """
+
+
+# Import custom rules provided via the config.
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -0,0 +1,30 @@
+# This configuration file should contain all required configuration parameters
+# for the ingest workflow to run with additional Nextstrain automation rules.
+
+# Custom rules to run as part of the Nextstrain automated workflow
+# The paths should be relative to the ingest directory.
+custom_rules:
+  - build-configs/nextstrain-automation/upload.smk
+
+# Nextstrain CloudFront domain to ensure that we invalidate CloudFront after the S3 uploads
+# This is required as long as we are using the AWS CLI for uploads
+cloudfront_domain: "data.nextstrain.org"
+
+# Nextstrain AWS S3 Bucket with pathogen prefix
+# The prefix below is already set to this pathogen repo's name (seasonal-cov).
+s3_dst: "s3://nextstrain-data/files/workflows/seasonal-cov"
+
+# Mapping of files to upload
+files_to_upload:
+  ncbi_229e.ndjson.zst: data/229e/ncbi.ndjson
+  ncbi_hku1.ndjson.zst: data/hku1/ncbi.ndjson
+  ncbi_nl63.ndjson.zst: data/nl63/ncbi.ndjson
+  ncbi_oc43.ndjson.zst: data/oc43/ncbi.ndjson
+  metadata_229e.tsv.zst: results/229e/metadata.tsv
+  sequences_229e.fasta.zst: results/229e/sequences.fasta
+  metadata_hku1.tsv.zst: results/hku1/metadata.tsv
+  sequences_hku1.fasta.zst: results/hku1/sequences.fasta
+  metadata_nl63.tsv.zst: results/nl63/metadata.tsv
+  sequences_nl63.fasta.zst: results/nl63/sequences.fasta
+  metadata_oc43.tsv.zst: results/oc43/metadata.tsv
+  sequences_oc43.fasta.zst: results/oc43/sequences.fasta
diff --git a/ingest/build-configs/nextstrain-automation/upload.smk b/ingest/build-configs/nextstrain-automation/upload.smk
@@ -0,0 +1,48 @@
+"""
+This part of the workflow handles uploading files to AWS S3.
+
+Files to upload must be defined in the `files_to_upload` config param, where
+the keys are the remote files and the values are the local filepaths
+relative to the ingest directory.
+
+Produces a single file for each uploaded file:
+    "results/upload/{remote_file}.upload"
+
+The rule `upload_all` can be used as a target to upload all files.
+"""
+
+import os
+
+slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ
+send_notifications = (
+    config.get("send_slack_notifications", False) and slack_envvars_defined
+)
+
+
+rule upload_to_s3:
+    input:
+        file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file],  # local path mapped to this remote file name
+    output:
+        "results/upload/{remote_file}.upload",  # receives the uploader's combined stdout/stderr via tee
+    params:
+        quiet="" if send_notifications else "--quiet",  # kept unquoted in the command so "" adds no argument
+        s3_dst=config["s3_dst"],
+        cloudfront_domain=config["cloudfront_domain"],
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            {params.quiet} \
+            {input.file_to_upload:q} \
+            {params.s3_dst:q}/{wildcards.remote_file:q} \
+            {params.cloudfront_domain:q} 2>&1 | tee {output:q}
+        """
+
+
+rule upload_all:
+    input:
+        uploads=[
+            f"results/upload/{remote_file}.upload"  # one upload_to_s3 output per configured remote file
+            for remote_file in config["files_to_upload"].keys()
+        ],
+    output:
+        touch("results/upload_all.done"),  # marker file for this aggregate target
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
@@ -26,3 +26,10 @@ rule clean:
         """
         rm -rfv {params.targets}
         """
+
+
+# Import custom rules provided via the config.
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
diff --git a/phylogenetic/build-configs/nextstrain-automation/config.yaml b/phylogenetic/build-configs/nextstrain-automation/config.yaml
@@ -0,0 +1,4 @@
+custom_rules:
+  - build-configs/nextstrain-automation/deploy.smk
+
+deploy_url: "s3://nextstrain-data"
diff --git a/phylogenetic/build-configs/nextstrain-automation/deploy.smk b/phylogenetic/build-configs/nextstrain-automation/deploy.smk
@@ -0,0 +1,18 @@
+"""
+This part of the workflow handles automatic deployments of the seasonal-cov builds.
+Uploads the builds defined as the default output of the workflow through
+the `all` rule from the Snakefile.
+"""
+
+
+rule deploy_all:
+    input:
+        *rules.all.input,  # everything the workflow's `all` rule requests
+    output:
+        touch("results/deploy_all.done"),  # marker file for this deploy target
+    params:
+        deploy_url=config["deploy_url"],
+    shell:
+        """
+        nextstrain remote upload {params.deploy_url:q} {input:q}
+        """
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
@@ -33,9 +33,7 @@ viruses:
 229e:
   reference: "defaults/229e/reference.fasta"
   genemap: "defaults/229e/genemap.gff"
-  metadata: "../ingest/results/229e/metadata.tsv"
   prepare_sequences:
-    sequences: "../ingest/results/229e/sequences.fasta"
     group_by: "country"
     subsample_max_sequences: 4000
     min_length: 20000
@@ -50,9 +48,7 @@ viruses:
 nl63:
   reference: "defaults/nl63/reference.fasta"
   genemap: "defaults/nl63/genemap.gff"
-  metadata: "../ingest/results/nl63/metadata.tsv"
   prepare_sequences:
-    sequences: "../ingest/results/nl63/sequences.fasta"
     group_by: "country"
     subsample_max_sequences: 4000
     min_length: 20000
@@ -67,9 +63,7 @@ nl63:
 oc43:
   reference: "defaults/oc43/reference.fasta"
   genemap: "defaults/oc43/genemap.gff"
-  metadata: "../ingest/results/oc43/metadata.tsv"
   prepare_sequences:
-    sequences: "../ingest/results/oc43/sequences.fasta"
     group_by: "country"
     subsample_max_sequences: 4000
     min_length: 20000
@@ -84,9 +78,7 @@ oc43:
 hku1:
   reference: "defaults/hku1/reference.fasta"
   genemap: "defaults/hku1/genemap.gff"
-  metadata: "../ingest/results/hku1/metadata.tsv"
   prepare_sequences:
-    sequences: "../ingest/results/hku1/sequences.fasta"
     group_by: "country"
     subsample_max_sequences: 4000
     min_length: 20000

diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
@@ -30,7 +30,7 @@ rule refine:
     input:
         tree="results/{virus}/tree_raw.nwk",
         alignment="results/{virus}/aligned.fasta",
-        metadata=lambda wildcards: config[wildcards.virus]["metadata"],
+        metadata="data/{virus}/metadata.tsv",
     output:
         tree="results/{virus}/tree.nwk",
         node_data="results/{virus}/branch_lengths.json",

diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
@@ -9,7 +9,7 @@ tree and at least one node data JSON.
 rule export:
     input:
         tree="results/{virus}/tree.nwk",
-        metadata=lambda wildcards: config[wildcards.virus]["metadata"],
+        metadata="data/{virus}/metadata.tsv",
         branch_lengths="results/{virus}/branch_lengths.json",
         nt_muts="results/{virus}/nt_muts.json",
         aa_muts="results/{virus}/aa_muts.json",

diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
@@ -11,10 +11,38 @@ and will produce an aligned FASTA file of subsampled sequences as an output.
 """
 
 
+rule download:
+    output:
+        sequences="data/{virus}/sequences.fasta.zst",  # local copies live in a per-virus directory
+        metadata="data/{virus}/metadata.tsv.zst",
+    params:
+        sequences_url="https://data.nextstrain.org/files/workflows/seasonal-cov/sequences_{virus}.fasta.zst",  # remote names carry the virus as a suffix
+        metadata_url="https://data.nextstrain.org/files/workflows/seasonal-cov/metadata_{virus}.tsv.zst",
+    shell:
+        """
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+        """
+
+
+rule decompress:
+    input:
+        sequences="data/{virus}/sequences.fasta.zst",  # same paths the download rule writes
+        metadata="data/{virus}/metadata.tsv.zst",
+    output:
+        sequences="data/{virus}/sequences.fasta",  # same paths with the .zst suffix dropped
+        metadata="data/{virus}/metadata.tsv",
+    shell:
+        """
+        zstd -d -c {input.sequences} > {output.sequences}
+        zstd -d -c {input.metadata} > {output.metadata}
+        """
+
+
 rule filter:
     input:
-        sequences=lambda wildcards: config[wildcards.virus]["prepare_sequences"]["sequences"],
-        metadata=lambda wildcards: config[wildcards.virus]["metadata"],
+        sequences="data/{virus}/sequences.fasta",
+        metadata="data/{virus}/metadata.tsv",
         exclude="defaults/{virus}/dropped_strains.txt",
     output:
         sequences="results/{virus}/filtered.fasta",