Skip to content

calculate-times-pedestrian-2024-state-51-weighted #1049

calculate-times-pedestrian-2024-state-51-weighted

calculate-times-pedestrian-2024-state-51-weighted #1049

---
name: calculate-times
# Each run is titled after the inputs it was dispatched with
run-name: 'calculate-times-${{ inputs.mode }}-${{ inputs.year }}-${{ inputs.geography }}-${{ inputs.state }}-${{ inputs.centroid_type }}'

# Manually dispatched only. The allowed values of the choice inputs mirror
# those in params.yaml
on:
  workflow_dispatch:
    inputs:
      mode:
        description: Mode of travel
        required: true
        type: choice
        default: auto
        options:
          - auto
          - bicycle
          - pedestrian

      # Years are quoted so they stay strings rather than integers
      year:
        description: Census/OSM data year
        required: true
        type: choice
        default: '2020'
        options:
          - '2020'
          - '2021'
          - '2022'
          - '2023'
          - '2024'

      geography:
        description: Census data geography
        required: true
        type: choice
        default: county
        options:
          - state
          - county
          - county_subdivision
          - tract
          - zcta

      # Two-digit state codes, quoted so the leading zeros are preserved
      state:
        description: Target Census state
        required: true
        type: choice
        default: '01'
        options:
          - '01'
          - '02'
          - '04'
          - '05'
          - '06'
          - '08'
          - '09'
          - '10'
          - '11'
          - '12'
          - '13'
          - '15'
          - '16'
          - '17'
          - '18'
          - '19'
          - '20'
          - '21'
          - '22'
          - '23'
          - '24'
          - '25'
          - '26'
          - '27'
          - '28'
          - '29'
          - '30'
          - '31'
          - '32'
          - '33'
          - '34'
          - '35'
          - '36'
          - '37'
          - '38'
          - '39'
          - '40'
          - '41'
          - '42'
          - '44'
          - '45'
          - '46'
          - '47'
          - '48'
          - '49'
          - '50'
          - '51'
          - '53'
          - '54'
          - '55'
          - '56'

      centroid_type:
        description: Whether or not to use population-weighted locations
        required: true
        type: choice
        default: weighted
        options:
          - weighted
          - unweighted

      # Optional free-form subset of chunks; every entry is validated against
      # the generated chunk list in the setup-jobs job
      override_chunks:
        description: |
          Comma-separated chunks to run e.g. 0-5_000-100,6-11_000-100.
          Will run all chunks if null
        required: false
        type: string
env:
  # Workflow-level environment variables. Values are quoted so that they are
  # unambiguously strings rather than YAML booleans/integers
  AWS_DEFAULT_REGION: us-east-1
  # See: https://github.com/aws/aws-cli/issues/5262#issuecomment-705832151
  AWS_EC2_METADATA_DISABLED: "true"
  PYTHONUNBUFFERED: "1"
jobs:
  # Using the location data, split the origins into N jobs (max 256). The
  # resulting JSON array of chunk names is exposed as the `chunks` output and
  # drives the matrix of the run-job job
  setup-jobs:
    runs-on: ubuntu-24.04
    outputs:
      chunks: ${{ steps.create-job-chunks.outputs.chunks }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Cloudflare credentials
        uses: ./.github/actions/setup-cloudflare-s3
        with:
          CLOUDFLARE_S3_API_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_S3_API_ACCESS_KEY_ID }}
          CLOUDFLARE_S3_API_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_S3_API_SECRET_ACCESS_KEY }}

      # Persist the runner's UID/GID in GITHUB_ENV so later steps can read
      # them as env.USER_ID / env.GROUP_ID
      - name: Fetch GitHub user and group ID
        shell: bash
        id: fetch-ids
        run: |
          echo "USER_ID=$(id -u)" >> $GITHUB_ENV
          echo "GROUP_ID=$(id -g)" >> $GITHUB_ENV

      - name: Build Dockerized dependencies
        uses: ./.github/actions/build-docker
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Load Docker image
        run: |
          docker load --input /tmp/opentimes.tar
          docker image ls -a

      - name: Fetch locations data
        uses: ./.github/actions/fetch-locations

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true
          cache-suffix: "site-data"
          cache-dependency-glob: |
            pyproject.toml
            uv.lock

      - name: Install Python dependencies
        id: install-python-dependencies
        shell: bash
        run: |
          uv python install
          uv venv
          uv pip install ".[site,data]"

      - name: Create job chunks
        id: create-job-chunks
        working-directory: 'data'
        shell: bash
        env:
          # The free-form input is handed over via the environment instead of
          # being interpolated into the script, so its contents can never be
          # parsed as shell code
          OVERRIDE_CHUNKS: ${{ inputs.override_chunks }}
        run: |
          export USER_ID=${{ env.USER_ID }}
          export GROUP_ID=${{ env.GROUP_ID }}
          uv run ./src/split_chunks.py \
            --mode ${{ inputs.mode }} --year ${{ inputs.year }} \
            --geography ${{ inputs.geography }} \
            --state ${{ inputs.state }} > chunks.txt

          # Default to every generated chunk
          chunks_json=$(cat chunks.txt)
          chunks_parsed=($(jq -r '.[]' chunks.txt))

          # If override chunks are set, use those instead
          if [ -n "$OVERRIDE_CHUNKS" ]; then
            override_chunks_parsed=($(echo "$OVERRIDE_CHUNKS" | tr -d ' ' | tr ',' ' '))
            # Every override must exactly match one of the generated chunks
            for chunk in "${override_chunks_parsed[@]}"; do
              if [[ ! " ${chunks_parsed[*]} " =~ " ${chunk} " ]]; then
                echo "Error: Override chunk ${chunk} is not in the chunks for this origin"
                echo "Chunks include: ${chunks_parsed[*]}"
                exit 1
              fi
            done
            chunks_json=$(printf '%s\n' "${override_chunks_parsed[@]}" | jq -c -R . | jq -c -s .)
            echo "Creating jobs for chunks: ${override_chunks_parsed[*]}"
          else
            echo "Creating jobs for chunks: ${chunks_parsed[*]}"
          fi

          # Write the output exactly once, appending so nothing else in the
          # step's output file is ever truncated
          echo "chunks=$chunks_json" >> "$GITHUB_OUTPUT"

  # Runs one matrix job per chunk emitted by setup-jobs
  run-job:
    runs-on: ubuntu-24.04
    needs: setup-jobs
    strategy:
      # Don't fail all chunks if one fails
      fail-fast: false
      matrix:
        chunk: ${{ fromJSON(needs.setup-jobs.outputs.chunks) }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Cloudflare credentials
        uses: ./.github/actions/setup-cloudflare-s3
        with:
          CLOUDFLARE_S3_API_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_S3_API_ACCESS_KEY_ID }}
          CLOUDFLARE_S3_API_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_S3_API_SECRET_ACCESS_KEY }}

      - name: Fetch locations data
        uses: ./.github/actions/fetch-locations

      # Don't fetch tile data in setup-jobs because they're very large and
      # will churn the Actions cache. We want to wait to fetch it until jobs
      # have actually been picked up
      - name: Fetch Valhalla tile data
        uses: ./.github/actions/fetch-valhalla-tiles
        with:
          year: ${{ inputs.year }}
          state: ${{ inputs.state }}

      # Persist the runner's UID/GID in GITHUB_ENV so later steps can read
      # them as env.USER_ID / env.GROUP_ID
      - name: Fetch GitHub user and group ID
        shell: bash
        id: fetch-ids
        run: |
          echo "USER_ID=$(id -u)" >> $GITHUB_ENV
          echo "GROUP_ID=$(id -g)" >> $GITHUB_ENV

      # The artifact name is keyed on the Dockerfile and pyproject.toml
      # hashes (presumably uploaded by the build-docker action; verify there)
      - name: Fetch Docker image
        uses: actions/download-artifact@v4
        with:
          name: opentimes-docker-${{ hashFiles('./data/Dockerfile', './pyproject.toml') }}
          path: /tmp

      - name: Load Docker image
        run: |
          docker load --input /tmp/opentimes.tar
          docker image ls -a

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true
          cache-suffix: "site-data"
          cache-dependency-glob: |
            pyproject.toml
            uv.lock

      - name: Install Python dependencies
        id: install-python-dependencies
        shell: bash
        run: |
          uv python install
          uv venv
          uv pip install ".[site,data]"

      # Hard-link the state-level tile archive into ./build, unpack it there,
      # then drop the link
      - name: Extract tiles
        shell: bash
        working-directory: 'data'
        run: |
          tile_path="year=${{ inputs.year }}/geography=state/state=${{ inputs.state }}"
          ln ./intermediate/valhalla_tiles/"$tile_path"/valhalla_tiles.tar.zst ./build/
          tar -xf ./build/valhalla_tiles.tar.zst -C ./build
          rm -f ./build/valhalla_tiles.tar.zst

      # In rare cases the runner gets killed due to OoM errors. This bumps swap
      # to 90% of the space remaining on disk
      - name: Increase swapfile
        run: |
          space_left=$(df /dev/root -B 1 --output=avail | grep -v Avail)
          space_mult=0.9
          space_alloc=$(echo "${space_left}*${space_mult}" | bc)
          space_alloc_rnd=$(printf %.0f $(echo ${space_alloc}))
          sudo swapoff -a
          sudo fallocate -l ${space_alloc_rnd} /swapfile
          sudo chmod 600 /swapfile
          sudo mkswap /swapfile
          sudo swapon /swapfile
          sudo swapon --show

      # Start the two Valhalla services in the background, then compute
      # times for this matrix chunk and write the results to S3
      - name: Run job chunk
        shell: bash
        working-directory: 'data'
        run: |
          export USER_ID=${{ env.USER_ID }}
          export GROUP_ID=${{ env.GROUP_ID }}
          docker compose up --quiet-pull valhalla-run-fp valhalla-run-sp -d
          uv run ./src/calculate_times.py \
            --mode ${{ inputs.mode }} --year ${{ inputs.year }} \
            --geography ${{ inputs.geography }} --state ${{ inputs.state }} \
            --centroid-type ${{ inputs.centroid_type }} \
            --chunk ${{ matrix.chunk }} --write-to-s3

      # Clear the cache if we're one of the last running jobs. Using this instead
      # of a separate workflow step because the steps often come last in the job
      # queue and then won't run when many workflows are queued at the same time
      - name: Clear workflow cache
        if: always()
        continue-on-error: true
        shell: bash
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Left unquoted on purpose below: the value carries the extra
          # `--paginate -q` arguments and relies on word splitting
          endpoint='repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs --paginate -q'
          # With --paginate the filter is applied to each page, so
          # .total_count may be printed once per page; keep a single value
          total_jobs=$(gh api $endpoint '.total_count' | tail -n 1)
          complete_jobs=$(gh api $endpoint '.jobs[] | select(.status == "completed")' | wc -l)
          in_progress_jobs=$(gh api $endpoint '.jobs[] | select(.status == "in_progress")' | wc -l)
          n_remaining=$((total_jobs - complete_jobs))
          all_statuses=$((complete_jobs + in_progress_jobs))
          echo "Total number of jobs: $total_jobs"
          echo "Number of jobs complete: $complete_jobs"
          echo "Number of jobs remaining: $n_remaining"
          echo "Number of jobs run/running: $all_statuses"
          # Only clear once fewer than 2 jobs are unfinished and every job
          # has at least been picked up (none still queued)
          if [ "$n_remaining" -lt 2 ] && [ "$total_jobs" -eq "$all_statuses" ]; then
            echo "Fewer than 2 jobs remaining. Clearing workflow cache!"
            gh cache delete \
              valhalla-tiles-${{ inputs.year }}-${{ inputs.state }}-${{ hashFiles('./data/dvc.lock') }} || true
          fi