Commit 71c1e28

feat: add export_schemas tool for bq (#515)
1 parent d28f069 commit 71c1e28

3 files changed: +171 -0

tools/export_schemas/README.md

# BigQuery Schema Export Tool

This tool exports the DDL (Data Definition Language) for all tables in a BigQuery project's region to local SQL files, organized by dataset.

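Each exported `.sql` file contains the table's `CREATE TABLE` statement as returned by the `ddl` column of `INFORMATION_SCHEMA.TABLES`. A hypothetical `dataset_a/table1.sql` might look like this (illustrative names and columns):

```sql
CREATE TABLE `my-data-project.dataset_a.table1`
(
  id INT64,
  created_at TIMESTAMP,
  payload STRING
)
PARTITION BY DATE(created_at);
```
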
## Use Cases

- **AI Context Provisioning**: Provide a snapshot of your current DDLs to AI models for high-fidelity schema context.
- **Version Control**: Quickly dump all table schemas to Git for version tracking.
- **Migration**: Export schemas to specific folders to assist in migrating datasets between projects or regions.
- **Backup**: Create a snapshot of your current DDLs for disaster recovery or audit purposes.
- **Local Development**: Analyze table structures offline without querying BigQuery repeatedly.
- **CI/CD**: Use as a step in a CI/CD pipeline to export schemas for testing or validation.

## Prerequisites

- Python 3.x
- Google Cloud BigQuery Client Library: `pip install google-cloud-bigquery`
- IAM permissions (a quick way to verify them is sketched below):
  - `roles/bigquery.jobUser`
  - `roles/bigquery.metadataViewer`
  - `roles/bigquery.dataViewer` (to list datasets)

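To check the roles before a full export, a quick probe like the following can help. This is a minimal sketch, assuming application-default credentials; the project ID is a placeholder.

```python
from google.cloud import bigquery

client = bigquery.Client(project="my-data-project")  # placeholder project ID

# metadataViewer / dataViewer: listing datasets should succeed
print([dataset.dataset_id for dataset in client.list_datasets()])

# jobUser: running a trivial query should succeed
print(list(client.query("SELECT 1").result()))
```
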
## Usage

```bash
python3 export_schemas.py --project_id <YOUR_PROJECT_ID> [--region <REGION>] [--output_dir <OUTPUT_DIR>]
```

### Arguments

- `--project_id`: (Required) The Google Cloud Project ID.
- `--region`: (Optional) The BigQuery region to query. Defaults to `us`. Plain region names (e.g. `us-east1`) and the `region-` prefixed form used by `INFORMATION_SCHEMA` are both accepted.
- `--output_dir`: (Optional) The directory to save the exported schemas. Defaults to `bq_schemas`.

### Example

```bash
python3 export_schemas.py --project_id my-data-project --region us-east1
```

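Console output for a run like this looks roughly as follows (illustrative; the table count and paths depend on your project):

```
--- Starting Bulk Export for my-data-project (region-us-east1) ---
Querying BigQuery metadata...
Found 42 tables. Writing 42 .sql files...
Zipping files...

----SUCCESS----!
File created: /home/user/bq_schema_export.zip
Schemas exported to: /home/user/bq_schemas
```
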
### Output

The tool will create a folder structure like this:

```
bq_schemas/
├── dataset_a/
│   ├── table1.sql
│   └── table2.sql
└── dataset_b/
    └── table3.sql
```

It will also generate a zip file `bq_schema_export.zip` containing all the exported schemas.
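
For the AI-context use case above, the per-table files can be stitched into one prompt-ready file. A small sketch, not part of the tool; the output file name is an assumption:

```python
import pathlib

# Gather every exported DDL under bq_schemas/ into a single file,
# labeling each statement with its dataset and table name.
parts = []
for sql_path in sorted(pathlib.Path("bq_schemas").rglob("*.sql")):
    parts.append(f"-- {sql_path.parent.name}.{sql_path.stem}\n{sql_path.read_text()}")
pathlib.Path("all_schemas.sql").write_text("\n\n".join(parts))
```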

tools/export_schemas/export_schemas.py

```python
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import concurrent.futures
import os
import shutil


def parse_args():
    parser = argparse.ArgumentParser(description="Export BigQuery table schemas and DDLs to local files.")
    parser.add_argument("--project_id", required=True, help="GCP Project ID")
    parser.add_argument("--region", default="us", help="BigQuery Region (default: us)")
    parser.add_argument("--output_dir", default="bq_schemas", help="Output directory for exported schemas (default: bq_schemas)")
    return parser.parse_args()


def write_ddl(table_metadata, output_dir):
    dataset = table_metadata['table_schema']
    table = table_metadata['table_name']
    ddl = table_metadata['ddl']

    # Create the dataset folder if it doesn't exist
    ds_path = os.path.join(output_dir, dataset)
    os.makedirs(ds_path, exist_ok=True)

    # Write the DDL
    with open(os.path.join(ds_path, f"{table}.sql"), "w") as sql_file:
        sql_file.write(ddl)


def main():
    args = parse_args()

    project_id = args.project_id
    region = args.region
    output_dir = args.output_dir

    # Construct the region scope for INFORMATION_SCHEMA ("region-" prefixed)
    # and the bare region used as the client's query location.
    if region.lower().startswith("region-"):
        region_scope = region.lower()
        location = region.lower()[len("region-"):]
    else:
        region_scope = f"region-{region}"
        location = region

    # 1. Set up a clean output directory
    if os.path.exists(output_dir):
        print(f"Cleaning existing directory: {output_dir}")
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    print(f"--- Starting Bulk Export for {project_id} ({region_scope}) ---")

    # 2. Run the query using the BigQuery client library
    query = f"""
        SELECT table_schema, table_name, ddl
        FROM `{project_id}.{region_scope}.INFORMATION_SCHEMA.TABLES`
        WHERE table_type = 'BASE TABLE'
    """

    print("Querying BigQuery metadata...")

    try:
        # Imported lazily so a missing dependency is reported with the
        # friendly message below rather than a traceback at startup.
        from google.cloud import bigquery

        client = bigquery.Client(project=project_id, location=location)
        query_job = client.query(query)
        tables = [dict(table_metadata) for table_metadata in query_job]
    except ImportError:
        print("\nError: google-cloud-bigquery module not found.")
        print("Please install it with: pip install google-cloud-bigquery (or use requirements.txt)")
        return
    except Exception as query_error:
        print("\nError running BigQuery query:")
        print(query_error)
        return

    if not tables:
        print("No tables found. Check your project ID and region.")
        return

    print(f"Found {len(tables)} tables. Writing {len(tables)} .sql files...")

    # 3. Write the files in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_table = {
            executor.submit(write_ddl, table_metadata, output_dir): table_metadata
            for table_metadata in tables
        }
        for future in concurrent.futures.as_completed(future_to_table):
            table_metadata = future_to_table[future]
            try:
                future.result()
            except Exception as exc:
                print(f"Table {table_metadata.get('table_name')} generated an exception: {exc}")

    # 4. Create the zip archive
    print("Zipping files...")
    shutil.make_archive("bq_schema_export", 'zip', output_dir)

    print("\n----SUCCESS----!")
    print(f"File created: {os.path.abspath('bq_schema_export.zip')}")
    print(f"Schemas exported to: {os.path.abspath(output_dir)}")


if __name__ == "__main__":
    main()
```
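
After a run, the archive can be sanity-checked with the standard library alone. A minimal sketch; the archive name comes from the script above:

```python
import zipfile

# List every entry written into the export archive.
with zipfile.ZipFile("bq_schema_export.zip") as archive:
    for name in archive.namelist():
        print(name)  # e.g. dataset_a/table1.sql
```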

tools/export_schemas/requirements.txt

```
google-cloud-bigquery>=3.0.0
```
