Commit 530563d
Implement the "Create Dumps from DBs" workflow
A periodically running workflow, hosted on GCE, that creates Parquet exports from Postgres databases restored from the teiserver and replay database backup files.
1 parent edaae94
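The exports land under the pgdumps/ prefix of the data-mart bucket (see the last workflow step below). A minimal sketch for inspecting a run's output, assuming an authenticated gcloud CLI and that DATA_MART_GCS_BUCKET holds the same value as the repository variable of that name:

# Hypothetical spot-check; the bucket name is an assumption taken from
# the repository variable used in the workflow below.
gcloud storage ls "gs://$DATA_MART_GCS_BUCKET/pgdumps/"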

File tree: 2 files changed (+81, -4 lines)
.github/workflows/create-source-data.yaml

Lines changed: 67 additions & 4 deletions
@@ -1,6 +1,8 @@
-name: Create Source Data
+name: Create Dumps from DBs
 on:
   workflow_dispatch:
+  schedule:
+    - cron: '0 10 * * 2,6'
 jobs:
   create-runner:
     permissions:
@@ -28,16 +30,77 @@ jobs:
         with:
           token: ${{ steps.app-token.outputs.token }}
           image_project: ubuntu-os-cloud
-          image_family: ubuntu-2204-lts
+          image_family: ubuntu-2404-lts-amd64
           machine_zone: europe-west4-b
           machine_type: e2-standard-4
           runner_service_account: ${{ vars.RUNNER_GCP_SERVICE_ACCOUNT }}
           preemptible: true
           ephemeral: true
           boot_disk_type: pd-ssd
           disk_size: 70GB
-  test:
+  export-pgdumps:
     needs: create-runner
     runs-on: ${{ needs.create-runner.outputs.label }}
     steps:
-      - run: echo "This runs on the GCE VM"
+      # We are running on a barebones VM, so there is more scripting involved
+      # than needed if we were running on a standard GitHub Actions runner.
+      - name: Checkout source
+        run: |
+          mkdir src
+          cd src
+          git init
+          git remote add origin $GITHUB_SERVER_URL/$GITHUB_REPOSITORY.git
+          git fetch origin $GITHUB_REF
+          git reset --hard FETCH_HEAD
+          cd ..
+      - name: Set up PostgreSQL
+        run: |
+          sudo apt-get --yes install postgresql
+          while ! pg_isready; do
+            echo "waiting for postgres..."
+            sleep 1
+          done
+          sudo -u postgres psql -c "ALTER USER postgres PASSWORD '12345';"
+      - name: Setup DuckDB
+        run: |
+          sudo apt-get install --yes unzip
+          curl -L https://github.com/duckdb/duckdb/releases/download/v1.1.0/duckdb_cli-linux-amd64.zip > duckdb.zip
+          unzip duckdb.zip duckdb
+          sudo mv duckdb /usr/local/bin
+          export HOME=$(pwd)
+          duckdb :memory: 'INSTALL postgres;'
+      - name: Restore databases
+        run: |
+          function restore {
+            local BACKUP="$(gcloud storage ls gs://$1 | sort -r | head -n 1)"
+            gcloud storage cp "$BACKUP" .
+            psql -c "CREATE DATABASE $2;"
+            time zstdcat "$(basename "$BACKUP")" \
+              | pg_restore -d postgres --clean --create --no-owner --no-privileges
+          }
+
+          restore "$REPLAY_BACKUPS_GCS_BUCKET" bar &
+          restore "$TEISERVER_BACKUPS_GCS_BUCKET" teiserver_prod &
+
+          wait %1 %2
+        env:
+          REPLAY_BACKUPS_GCS_BUCKET: ${{ vars.REPLAY_BACKUPS_GCS_BUCKET }}
+          TEISERVER_BACKUPS_GCS_BUCKET: ${{ vars.TEISERVER_BACKUPS_GCS_BUCKET }}
+          PGPASSWORD: 12345
+          PGHOST: 127.0.0.1
+          PGUSER: postgres
+      - name: Export parquet files
+        run: |
+          mkdir data_export
+          export HOME=$(pwd)
+          duckdb < src/scripts/export_prod_data_source.sql
+        env:
+          PGPASSWORD: 12345
+          PGHOST: 127.0.0.1
+          PGUSER: postgres
+      - name: Save data export in GCS bucket
+        run: |
+          gcloud config set storage/parallel_composite_upload_compatibility_check False
+          gcloud storage rsync data_export/ gs://$DATA_MART_GCS_BUCKET/pgdumps --recursive --delete-unmatched-destination-objects
+        env:
+          DATA_MART_GCS_BUCKET: ${{ vars.DATA_MART_GCS_BUCKET }}
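Besides the new cron schedule ('0 10 * * 2,6' fires at 10:00 UTC on Tuesdays and Saturdays), the workflow keeps workflow_dispatch, so a run can also be started by hand. A minimal sketch, assuming an authenticated gh CLI with access to this repository:

# Trigger the workflow manually; the file name comes from the diff above.
gh workflow run create-source-data.yaml

# Check on recent runs of the same workflow.
gh run list --workflow=create-source-data.yaml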

scripts/export_prod_data_source.sql

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+-- noqa: disable=all
+
+ATTACH 'dbname=teiserver_prod' AS teiserver (TYPE POSTGRES, READ_ONLY);
+
+COPY teiserver.public.teiserver_battle_matches TO 'data_export/teiserver_battle_matches.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
+COPY teiserver.public.teiserver_battle_match_memberships TO 'data_export/teiserver_battle_match_memberships.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
+COPY teiserver.public.teiserver_game_rating_logs TO 'data_export/teiserver_game_rating_logs.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
+
+ATTACH 'dbname=bar' AS replay (TYPE POSTGRES, READ_ONLY);
+
+COPY replay.public.Demos TO 'data_export/replay_demos.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
+COPY replay.public.AllyTeams TO 'data_export/replay_ally_teams.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
+COPY replay.public.Players TO 'data_export/replay_players.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
+COPY replay.public.Maps TO 'data_export/replay_maps.parquet' (FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 9);
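To sanity-check an export locally, the same DuckDB CLI the workflow installs can read the files back; a minimal sketch, with the file name taken from the script above:

# Count rows in one exported file; read_parquet is a built-in DuckDB function.
duckdb -c "SELECT count(*) FROM read_parquet('data_export/teiserver_battle_matches.parquet');"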
