
Create pudl.duckdb from parquet files #3741


Draft: wants to merge 17 commits into main.
Changes from 14 commits.
11 changes: 11 additions & 0 deletions docker/gcp_pudl_etl.sh
@@ -123,6 +123,13 @@ function distribute_parquet() {
    fi
}

function create_and_distribute_duckdb() {
    echo "Creating DuckDB file from parquet files and copying it to GCS"
    GCS_PATH="gs://superset.catalyst.coop"
    parquet_to_duckdb "$PUDL_OUTPUT/parquet" "$PUDL_OUTPUT/pudl.duckdb" && \
        gsutil -q -m -u "$GCP_BILLING_PROJECT" cp "$PUDL_OUTPUT/pudl.duckdb" "$GCS_PATH"
}

function copy_outputs_to_distribution_bucket() {
    # Only attempt to update outputs if we have a real value of BUILD_REF
    # This avoids accidentally blowing away the whole bucket if it's not set.
@@ -282,6 +289,9 @@ if [[ $ETL_SUCCESS == 0 ]]; then
    # Distribute Parquet outputs to a private bucket
    distribute_parquet 2>&1 | tee -a "$LOGFILE"
    DISTRIBUTE_PARQUET_SUCCESS=${PIPESTATUS[0]}
    # Create duckdb file from parquet files
    create_duckdb 2>&1 | tee -a "$LOGFILE"
Review comment (Member):
create_duckdb() or create_and_distribute_duckdb()?

(I have been loving the ShellCheck linter for VSCode)

    CREATE_DUCKDB_SUCCESS=${PIPESTATUS[0]}
    # Remove some cruft from the builds that we don't want to distribute
    clean_up_outputs_for_distribution 2>&1 | tee -a "$LOGFILE"
    CLEAN_UP_OUTPUTS_SUCCESS=${PIPESTATUS[0]}
@@ -313,6 +323,7 @@ if [[ $ETL_SUCCESS == 0 && \
      $UPDATE_STABLE_SUCCESS == 0 && \
      $DATASETTE_SUCCESS == 0 && \
      $DISTRIBUTE_PARQUET_SUCCESS == 0 && \
      $CREATE_DUCKDB_SUCCESS == 0 && \
      $CLEAN_UP_OUTPUTS_SUCCESS == 0 && \
      $DISTRIBUTION_BUCKET_SUCCESS == 0 && \
      $GCS_TEMPORARY_HOLD_SUCCESS == 0 && \
3 changes: 2 additions & 1 deletion environments/conda-linux-64.lock.yml (generated file; diff not rendered)

55 changes: 50 additions & 5 deletions environments/conda-lock.yml (generated file; diff not rendered)

3 changes: 2 additions & 1 deletion environments/conda-osx-64.lock.yml (generated file; diff not rendered)

3 changes: 2 additions & 1 deletion environments/conda-osx-arm64.lock.yml (generated file; diff not rendered)

@@ -0,0 +1,68 @@
"""Adjust some code table pk types

Revision ID: 7a16ce1fe774
Revises: 49d2f4f7d7b7
Create Date: 2024-07-25 19:09:50.613978

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '7a16ce1fe774'
down_revision = '49d2f4f7d7b7'
branch_labels = None
depends_on = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('core_eia__codes_sector_consolidated', schema=None) as batch_op:
        batch_op.alter_column('code',
            existing_type=sa.TEXT(),
            type_=sa.Integer(),
            existing_nullable=False,
            autoincrement=False)

    with op.batch_alter_table('core_eia__codes_steam_plant_types', schema=None) as batch_op:
        batch_op.alter_column('code',
            existing_type=sa.TEXT(),
            type_=sa.Integer(),
            existing_nullable=False,
            autoincrement=False)

    with op.batch_alter_table('core_eia__codes_wind_quality_class', schema=None) as batch_op:
        batch_op.alter_column('code',
            existing_type=sa.TEXT(),
            type_=sa.Integer(),
            existing_nullable=False,
            autoincrement=False)

    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('core_eia__codes_wind_quality_class', schema=None) as batch_op:
        batch_op.alter_column('code',
            existing_type=sa.Integer(),
            type_=sa.TEXT(),
            existing_nullable=False,
            autoincrement=False)

    with op.batch_alter_table('core_eia__codes_steam_plant_types', schema=None) as batch_op:
        batch_op.alter_column('code',
            existing_type=sa.Integer(),
            type_=sa.TEXT(),
            existing_nullable=False,
            autoincrement=False)

    with op.batch_alter_table('core_eia__codes_sector_consolidated', schema=None) as batch_op:
        batch_op.alter_column('code',
            existing_type=sa.Integer(),
            type_=sa.TEXT(),
            existing_nullable=False,
            autoincrement=False)

    # ### end Alembic commands ###
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
    "dask-expr", # Required for dask[dataframe]
    "datasette>=0.64",
    "doc8>=1.1",
    "duckdb-engine>=0.12.1",
    "email-validator>=1.0.3", # pydantic[email]
    "frictionless>=5,<6",
    "fsspec>=2024",
@@ -129,6 +130,7 @@ keywords = [
[project.scripts]
ferc_to_sqlite = "pudl.ferc_to_sqlite.cli:main"
metadata_to_rst = "pudl.convert.metadata_to_rst:metadata_to_rst"
parquet_to_duckdb = "pudl.convert.parquet_to_duckdb:parquet_to_duckdb"
pudl_check_fks = "pudl.etl.check_foreign_keys:pudl_check_fks"
pudl_datastore = "pudl.workspace.datastore:pudl_datastore"
pudl_etl = "pudl.etl.cli:pudl_etl"
95 changes: 95 additions & 0 deletions src/pudl/convert/parquet_to_duckdb.py
@@ -0,0 +1,95 @@
#! /usr/bin/env python
"""Script that creates a DuckDB database from a collection of PUDL Parquet files."""

import logging
from pathlib import Path

import click
import duckdb
import sqlalchemy as sa

from pudl.metadata import PUDL_PACKAGE
from pudl.metadata.classes import Package

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@click.command(
name="parquet_to_duckdb",
context_settings={"help_option_names": ["-h", "--help"]},
)
@click.argument("parquet_dir", type=click.Path(exists=True, resolve_path=True))
Review comment (Member):
I think you can require that it be a directory here too.
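
Something like the following would enforce the directory requirement with Click's built-in Path flags (a sketch, not part of the current diff):

@click.argument(
    "parquet_dir",
    type=click.Path(exists=True, resolve_path=True, file_okay=False, dir_okay=True),
)

With file_okay=False, Click rejects a path that points at a regular file before the command body ever runs.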

@click.argument(
"duckdb_path", type=click.Path(resolve_path=True, writable=True, allow_dash=False)
Review comment (Member):
What's the expected / desired behavior if the pudl.duckdb file already exists?

Reply (Member):
Oh I see below. I think you can require that it not already exist in this check too?

)
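
If the "must not already exist" check should also live at the CLI layer, a small callback on the argument could handle it, since click.Path itself has no flag for that (a sketch with a hypothetical _require_missing helper, not part of this PR):

def _require_missing(ctx, param, value):
    # Hypothetical helper: refuse to proceed if anything already exists at the path.
    if Path(value).exists():
        raise click.BadParameter(f"{value} already exists; please provide a new filename.")
    return value


@click.argument(
    "duckdb_path",
    type=click.Path(resolve_path=True, writable=True, allow_dash=False),
    callback=_require_missing,
)
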
@click.option(
    "--no-load",
    is_flag=True,
    show_default=True,
    default=False,
    help="Only create metadata, don't load data.",
)
@click.option(
    "--check-fks",
    is_flag=True,
    show_default=True,
    default=False,
    help="If true, enable foreign keys in the database. Currently, "
    "the parquet load process freezes up when foreign keys are enabled.",
)
def parquet_to_duckdb(
    parquet_dir: str, duckdb_path: str, no_load: bool, check_fks: bool
):
"""Convert a directory of Parquet files to a DuckDB database.

Args:
parquet_dir: Path to a directory of parquet files.
duckdb_path: Path to the new DuckDB database file (should not exist).
no_load: Only create metadata, don't load data.
check_fks: If true, enable foreign keys in the database. Currently,
the parquet load process freezes up when foreign keys are enabled.
Review comment (Member) on lines +51 to +52:
I dunno if you checked which tables were grinding to a halt, but I'm curious what happens if you don't load any of the hourly tables, which have 10s to 100s of millions of rows -- especially core_epacems__hourly_emissions which is nearly 1 billion rows and definitely has several implied FK relationships.
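
If it helps with that experiment, the load loop below could be pointed at a filtered table list instead of metadata.sorted_tables (a throwaway sketch, not proposed for this PR; the "hourly" substring test is just a heuristic for the large tables named above):

# Hypothetical experiment: skip the very large hourly tables (e.g.
# core_epacems__hourly_emissions), then re-run with --check-fks to see
# whether the foreign-key load still freezes.
tables_to_load = [
    table for table in metadata.sorted_tables if "hourly" not in table.name
]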


    Example:
        parquet_to_duckdb /path/to/parquet/directory duckdb.db
    """
    parquet_dir = Path(parquet_dir)
    duckdb_path = Path(duckdb_path)

    # Check if DuckDB file already exists
    if duckdb_path.exists():
        click.echo(
            f"Error: DuckDB file '{duckdb_path}' already exists. Please provide a new filename."
        )
        return

    # Create the DuckDB schema from the PUDL package
    resource_ids = (r.name for r in PUDL_PACKAGE.resources if len(r.name) <= 63)
    package = Package.from_resource_ids(resource_ids)

    metadata = package.to_sql(dialect="duckdb", check_foreign_keys=check_fks)
    engine = sa.create_engine(f"duckdb:///{duckdb_path}")
    metadata.create_all(engine)

    if not no_load:
        with duckdb.connect(database=str(duckdb_path)) as duckdb_conn:
Review comment (Member):
I messed around with this loop over the weekend to explicitly add the row and table comments to the schema, but later realized that:

  • there was a bug in Resource.to_sql() in that it didn't set the comment field.
  • the first and last columns that show up when you SELECT comment FROM duckdb_columns() aren't our columns -- they're duckdb internal stuff... and they have no comments. So it looked like there were no comments on the columns, but there actually were... so my explicit adding was unnecessary.

Thus, the only change that stuck around was switching to using a context manager.

Reply (Member):
What a journey - at least we learned something 😅 , thanks for doing the digging!
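
For anyone who wants to verify the comments are really there, they can be pulled from duckdb_columns() while filtering out DuckDB's internal bookkeeping columns (a small sketch; assumes a recent DuckDB, and the table name is just one of the tables touched in this PR):

import duckdb

con = duckdb.connect("pudl.duckdb")
# List user-defined columns and their comments for one table, skipping
# the internal columns that have no comments.
print(
    con.sql(
        "SELECT column_name, comment FROM duckdb_columns() "
        "WHERE NOT internal AND table_name = 'core_eia__codes_sector_consolidated'"
    )
)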

            duckdb_cursor = duckdb_conn.cursor()
            # Load data into the DuckDB database from parquet files, if requested:
            # Iterate through the tables in order of foreign key dependency
            for table in metadata.sorted_tables:
                parquet_file_path = parquet_dir / f"{table.name}.parquet"
                if parquet_file_path.exists():
                    logger.info(
                        f"Loading parquet file: {parquet_file_path} into {duckdb_path}"
                    )
                    sql_command = f"""
                        COPY {table.name} FROM '{parquet_file_path}' (FORMAT PARQUET);
                    """
Review comment (Member) on lines +86 to +88:
Even without the foreign keys turned on, I got an out-of-memory error while it was attempting to load the EPA CEMS parquet (5GB on disk). Do we need to do some kind of chunking? Maybe by row-groups?
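
One possible shape for that chunking, reading the Parquet file one record batch at a time instead of a single COPY (a sketch only; the batch size and the _batch view name are arbitrary, and it has not been benchmarked against the 5 GB CEMS file):

import pyarrow as pa
import pyarrow.parquet as pq

parquet_file = pq.ParquetFile(parquet_file_path)
for batch in parquet_file.iter_batches(batch_size=1_000_000):
    # Expose the Arrow batch to DuckDB as a temporary view and append it,
    # so only one batch needs to be materialized in memory at a time.
    duckdb_cursor.register("_batch", pa.Table.from_batches([batch]))
    duckdb_cursor.execute(f"INSERT INTO {table.name} SELECT * FROM _batch")
    duckdb_cursor.unregister("_batch")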

                    duckdb_cursor.execute(sql_command)
                else:
                    raise FileNotFoundError(f"Parquet file not found for: {table.name}")


if __name__ == "__main__":
    parquet_to_duckdb()
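
For a quick local sanity check of the new console script, a schema-only run avoids the long data load entirely (the flags are the ones defined above; the output path is just an example):

parquet_to_duckdb --no-load "$PUDL_OUTPUT/parquet" /tmp/pudl_schema.duckdb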