Skip to content

Commit

Permalink
Implement the create source data workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
p2004a committed Sep 9, 2024
1 parent edaae94 commit 23248ce
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 2 deletions.
64 changes: 62 additions & 2 deletions .github/workflows/create-source-data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
with:
token: ${{ steps.app-token.outputs.token }}
image_project: ubuntu-os-cloud
image_family: ubuntu-2204-lts
image_family: ubuntu-2404-lts-amd64
machine_zone: europe-west4-b
machine_type: e2-standard-4
runner_service_account: ${{ vars.RUNNER_GCP_SERVICE_ACCOUNT }}
Expand All @@ -40,4 +40,64 @@ jobs:
needs: create-runner
runs-on: ${{ needs.create-runner.outputs.label }}
steps:
- run: echo "This runs on the GCE VM"
# We are running on a barebones VM, so there is more scripting involved
# than would be needed on a standard GitHub Actions runner (which would
# normally use actions/checkout here).
- name: Checkout source
  run: |
    mkdir src
    cd src
    git init
    # GITHUB_SERVER_URL / GITHUB_REPOSITORY / GITHUB_REF are provided by
    # the Actions runner environment.
    git remote add origin $GITHUB_SERVER_URL/$GITHUB_REPOSITORY.git
    # Fetch only the ref that triggered the workflow and check it out.
    git fetch origin $GITHUB_REF
    git reset --hard FETCH_HEAD
    cd ..
# Install a local PostgreSQL server used as the restore target for the
# backups downloaded in the following steps.
- name: Set up PostgreSQL
  run: |
    # Refresh the package index first: on a freshly created VM the apt
    # lists can be stale or absent, which makes the install fail.
    sudo apt-get update
    sudo apt-get --yes install postgresql
    # Wait until the server accepts connections before configuring it.
    while ! pg_isready; do
      echo "waiting for postgres..."
      sleep 1
    done
    # Set a known password; the restore/export steps below authenticate
    # over TCP with PGPASSWORD=12345.
    sudo -u postgres psql -c "ALTER USER postgres PASSWORD '12345';"
# Install the DuckDB CLI, which performs the Postgres -> Parquet export.
- name: Setup DuckDB
  run: |
    sudo apt-get install --yes unzip
    # -f makes curl exit non-zero on an HTTP error instead of saving the
    # error page as duckdb.zip, which would otherwise only surface later
    # as a confusing unzip failure.
    curl -fL https://github.com/duckdb/duckdb/releases/download/v1.1.0/duckdb_cli-linux-amd64.zip > duckdb.zip
    unzip duckdb.zip duckdb
    sudo mv duckdb /usr/local/bin
# Download the newest replay-service backup from GCS and restore it into
# the local PostgreSQL server.
- name: Restore replay db
  run: |
    # Without pipefail a zstdcat failure (e.g. truncated download) would
    # be masked by pg_restore's exit status.
    set -o pipefail
    # Newest backup = lexicographically last object; assumes backup names
    # sort chronologically (e.g. date-prefixed) — TODO confirm.
    REPLAY_BACKUP="$(gcloud storage ls gs://${REPLAY_BACKUPS_GCS_BUCKET} | sort -r | head -n 1)"
    gcloud storage cp "$REPLAY_BACKUP" .
    # Pre-create the database so pg_restore's --clean DROP DATABASE has
    # something to drop and doesn't emit an error.
    psql -c "CREATE DATABASE bar;"
    time zstdcat "$(basename "$REPLAY_BACKUP")" | pg_restore -d postgres --clean --create --no-owner --no-privileges
  env:
    REPLAY_BACKUPS_GCS_BUCKET: ${{ vars.REPLAY_BACKUPS_GCS_BUCKET }}
    # Quoted: a bare 12345 is a YAML integer, not a string.
    PGPASSWORD: "12345"
    PGHOST: 127.0.0.1
    PGUSER: postgres
# Download the newest teiserver backup from GCS and restore it into the
# local PostgreSQL server (mirrors the "Restore replay db" step).
- name: Restore teiserver db
  # BUG FIX: the block-scalar indicator "|" was missing after "run:", so
  # the multi-line script was parsed as a single folded plain scalar and
  # was not a valid shell script.
  run: |
    # Without pipefail a zstdcat failure would be masked by pg_restore's
    # exit status.
    set -o pipefail
    # Newest backup = lexicographically last object; assumes backup names
    # sort chronologically — TODO confirm.
    TEI_BACKUP="$(gcloud storage ls gs://${TEISERVER_BACKUPS_GCS_BUCKET} | sort -r | head -n 1)"
    gcloud storage cp "$TEI_BACKUP" .
    # Pre-create the database so pg_restore's --clean DROP DATABASE has
    # something to drop and doesn't emit an error.
    psql -c "CREATE DATABASE teiserver_prod;"
    time zstdcat "$(basename "$TEI_BACKUP")" | pg_restore -d postgres --clean --create --no-owner --no-privileges
  env:
    TEISERVER_BACKUPS_GCS_BUCKET: ${{ vars.TEISERVER_BACKUPS_GCS_BUCKET }}
    # Quoted: a bare 12345 is a YAML integer, not a string.
    PGPASSWORD: "12345"
    PGHOST: 127.0.0.1
    PGUSER: postgres
# Run the DuckDB export script, which ATTACHes to the restored Postgres
# databases and writes Parquet files into data_export/.
- name: Export parquet files
  run: |
    mkdir data_export
    # The script lives in the repo checked out into src/ by the
    # "Checkout source" step. Connection parameters presumably come from
    # the libpq-style PG* env vars below — confirm DuckDB's postgres
    # extension honors them.
    duckdb < src/scripts/export_prod_data_source.sql
  env:
    PGPASSWORD: 12345
    PGHOST: 127.0.0.1
    PGUSER: postgres
# Upload the exported Parquet files to the data-mart GCS bucket.
- name: Save data export in GCS bucket
  run: |
    # NOTE(review): disabling the parallel composite upload compatibility
    # check — presumably to allow faster composite uploads of large
    # files; verify downloads of composite objects work for consumers.
    gcloud config set storage/parallel_composite_upload_compatibility_check False
    # Mirror data_export/ into the bucket, deleting remote objects that
    # no longer exist locally.
    gcloud storage rsync data_export/ gs://$DATA_MART_GCS_BUCKET/pgdumps --recursive --delete-unmatched-destination-objects
  env:
    DATA_MART_GCS_BUCKET: ${{ vars.DATA_MART_GCS_BUCKET }}
14 changes: 14 additions & 0 deletions scripts/export_prod_data_source.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- noqa: disable=all

-- Export production data sources as zstd-compressed Parquet files into
-- data_export/. Runs under the DuckDB CLI with the postgres extension;
-- connection settings (host/user/password) come from the libpq PG* env
-- vars set by the calling workflow step.

-- Teiserver database: matches, match memberships, rating logs.
ATTACH 'dbname=teiserver_prod' AS teiserver (TYPE POSTGRES, READ_ONLY);

COPY teiserver.public.teiserver_battle_matches TO 'data_export/teiserver_battle_matches.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY teiserver.public.teiserver_battle_match_memberships TO 'data_export/teiserver_battle_match_memberships.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY teiserver.public.teiserver_game_rating_logs TO 'data_export/teiserver_game_rating_logs.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);

-- Replay-service database (restored as "bar"): demos, teams, players, maps.
-- NOTE(review): the Postgres tables are capitalized (Demos, AllyTeams, ...);
-- assumes DuckDB matches these unquoted names case-insensitively — confirm.
ATTACH 'dbname=bar' AS replay (TYPE POSTGRES, READ_ONLY);

COPY replay.public.Demos TO 'data_export/replay_demos.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY replay.public.AllyTeams TO 'data_export/replay_ally_teams.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY replay.public.Players TO 'data_export/replay_players.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
COPY replay.public.Maps TO 'data_export/replay_maps.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);

0 comments on commit 23248ce

Please sign in to comment.