Commit 71c1e28

feat: add export_schemas tool for bq (#515)
1 parent d28f069 commit 71c1e28

3 files changed: +171 -0

tools/export_schemas/README.md

# BigQuery Schema Export Tool

This tool exports the DDL (Data Definition Language) for all tables in a BigQuery project's region to local SQL files, organized by dataset.

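Each exported `.sql` file contains the table's `CREATE TABLE` statement as returned by the `ddl` column of `INFORMATION_SCHEMA.TABLES`. A hypothetical `dataset_a/table1.sql` might look like this (illustrative names and columns):

```sql
CREATE TABLE `my-data-project.dataset_a.table1`
(
  id INT64,
  created_at TIMESTAMP,
  payload STRING
)
PARTITION BY DATE(created_at);
```
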
## Use Cases

- **AI Context Provisioning**: Provide a snapshot of your current DDLs to AI models for high-fidelity schema context.
- **Version Control**: Quickly dump all table schemas to Git for version tracking.
- **Migration**: Export schemas to specific folders to assist in migrating datasets between projects or regions.
- **Backup**: Create a snapshot of your current DDLs for disaster recovery or audit purposes.
- **Local Development**: Analyze table structures offline without querying BigQuery repeatedly.
- **CI/CD**: Use as a step in a CI/CD pipeline to export schemas for testing or validation.

## Prerequisites

- Python 3.x
- Google Cloud BigQuery Client Library: `pip install google-cloud-bigquery`
- IAM permissions (a quick way to verify them is sketched below):
  - `roles/bigquery.jobUser`
  - `roles/bigquery.metadataViewer`
  - `roles/bigquery.dataViewer` (to list datasets)

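To check the roles before a full export, a quick probe like the following can help. This is a minimal sketch, assuming application-default credentials; the project ID is a placeholder.

```python
from google.cloud import bigquery

client = bigquery.Client(project="my-data-project")  # placeholder project ID

# metadataViewer / dataViewer: listing datasets should succeed
print([dataset.dataset_id for dataset in client.list_datasets()])

# jobUser: running a trivial query should succeed
print(list(client.query("SELECT 1").result()))
```
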
## Usage

```bash
python3 export_schemas.py --project_id <YOUR_PROJECT_ID> [--region <REGION>] [--output_dir <OUTPUT_DIR>]
```

### Arguments

- `--project_id`: (Required) The Google Cloud Project ID.
- `--region`: (Optional) The BigQuery region to query. Defaults to `us`. Plain region names (e.g. `us-east1`) and the `region-` prefixed form used by `INFORMATION_SCHEMA` are both accepted.
- `--output_dir`: (Optional) The directory to save the exported schemas. Defaults to `bq_schemas`.

### Example

```bash
python3 export_schemas.py --project_id my-data-project --region us-east1
```

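Console output for a run like this looks roughly as follows (illustrative; the table count and paths depend on your project):

```
--- Starting Bulk Export for my-data-project (region-us-east1) ---
Querying BigQuery metadata...
Found 42 tables. Writing 42 .sql files...
Zipping files...

----SUCCESS----!
File created: /home/user/bq_schema_export.zip
Schemas exported to: /home/user/bq_schemas
```
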
### Output

The tool will create a folder structure like this:

```
bq_schemas/
├── dataset_a/
│   ├── table1.sql
│   └── table2.sql
└── dataset_b/
    └── table3.sql
```

It will also generate a zip file `bq_schema_export.zip` containing all the exported schemas.
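
For the AI-context use case above, the per-table files can be stitched into one prompt-ready file. A small sketch, not part of the tool; the output file name is an assumption:

```python
import pathlib

# Gather every exported DDL under bq_schemas/ into a single file,
# labeling each statement with its dataset and table name.
parts = []
for sql_path in sorted(pathlib.Path("bq_schemas").rglob("*.sql")):
    parts.append(f"-- {sql_path.parent.name}.{sql_path.stem}\n{sql_path.read_text()}")
pathlib.Path("all_schemas.sql").write_text("\n\n".join(parts))
```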

tools/export_schemas/export_schemas.py

```python
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import concurrent.futures
import os
import shutil


def parse_args():
    parser = argparse.ArgumentParser(description="Export BigQuery table schemas and DDLs to local files.")
    parser.add_argument("--project_id", required=True, help="GCP Project ID")
    parser.add_argument("--region", default="us", help="BigQuery Region (default: us)")
    parser.add_argument("--output_dir", default="bq_schemas", help="Output directory for exported schemas (default: bq_schemas)")
    return parser.parse_args()


def write_ddl(table_metadata, output_dir):
    dataset = table_metadata['table_schema']
    table = table_metadata['table_name']
    ddl = table_metadata['ddl']

    # Create the dataset folder if it doesn't exist
    ds_path = os.path.join(output_dir, dataset)
    os.makedirs(ds_path, exist_ok=True)

    # Write the DDL
    with open(os.path.join(ds_path, f"{table}.sql"), "w") as sql_file:
        sql_file.write(ddl)


def main():
    args = parse_args()

    project_id = args.project_id
    region = args.region
    output_dir = args.output_dir

    # Construct the region scope for INFORMATION_SCHEMA ("region-" prefixed)
    # and the bare region used as the client's query location.
    if region.lower().startswith("region-"):
        region_scope = region.lower()
        location = region.lower()[len("region-"):]
    else:
        region_scope = f"region-{region}"
        location = region

    # 1. Set up a clean output directory
    if os.path.exists(output_dir):
        print(f"Cleaning existing directory: {output_dir}")
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    print(f"--- Starting Bulk Export for {project_id} ({region_scope}) ---")

    # 2. Run the query using the BigQuery client library
    query = f"""
        SELECT table_schema, table_name, ddl
        FROM `{project_id}.{region_scope}.INFORMATION_SCHEMA.TABLES`
        WHERE table_type = 'BASE TABLE'
    """

    print("Querying BigQuery metadata...")

    try:
        # Imported lazily so a missing dependency is reported with the
        # friendly message below rather than a traceback at startup.
        from google.cloud import bigquery

        client = bigquery.Client(project=project_id, location=location)
        query_job = client.query(query)
        tables = [dict(table_metadata) for table_metadata in query_job]
    except ImportError:
        print("\nError: google-cloud-bigquery module not found.")
        print("Please install it with: pip install google-cloud-bigquery (or use requirements.txt)")
        return
    except Exception as query_error:
        print("\nError running BigQuery query:")
        print(query_error)
        return

    if not tables:
        print("No tables found. Check your project ID and region.")
        return

    print(f"Found {len(tables)} tables. Writing {len(tables)} .sql files...")

    # 3. Write the files in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_table = {
            executor.submit(write_ddl, table_metadata, output_dir): table_metadata
            for table_metadata in tables
        }
        for future in concurrent.futures.as_completed(future_to_table):
            table_metadata = future_to_table[future]
            try:
                future.result()
            except Exception as exc:
                print(f"Table {table_metadata.get('table_name')} generated an exception: {exc}")

    # 4. Create the zip archive
    print("Zipping files...")
    shutil.make_archive("bq_schema_export", 'zip', output_dir)

    print("\n----SUCCESS----!")
    print(f"File created: {os.path.abspath('bq_schema_export.zip')}")
    print(f"Schemas exported to: {os.path.abspath(output_dir)}")


if __name__ == "__main__":
    main()
```
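
After a run, the archive can be sanity-checked with the standard library alone. A minimal sketch; the archive name comes from the script above:

```python
import zipfile

# List every entry written into the export archive.
with zipfile.ZipFile("bq_schema_export.zip") as archive:
    for name in archive.namelist():
        print(name)  # e.g. dataset_a/table1.sql
```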

tools/export_schemas/requirements.txt

```
google-cloud-bigquery>=3.0.0
```
