-
-
Notifications
You must be signed in to change notification settings - Fork 124
Create pudl.duckdb
from parquet files
#3741
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 14 commits
11faecb
e090808
218460d
6649cfd
7ed930f
c94abf7
0eee1e1
c727735
75883dc
7344506
6662078
7aa56d7
9fbf4b2
98a137b
faeae02
f0b61f6
221ce77
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
"""Adjust some code table pk types

Revision ID: 7a16ce1fe774
Revises: 49d2f4f7d7b7
Create Date: 2024-07-25 19:09:50.613978

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '7a16ce1fe774'
down_revision = '49d2f4f7d7b7'
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Change the ``code`` primary-key column of three EIA code tables from TEXT to Integer."""
    # ### commands auto generated by Alembic - please adjust! ###
    # The three tables get exactly the same alteration, applied in this order.
    for table_name in (
        'core_eia__codes_sector_consolidated',
        'core_eia__codes_steam_plant_types',
        'core_eia__codes_wind_quality_class',
    ):
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            batch_op.alter_column(
                'code',
                existing_type=sa.TEXT(),
                type_=sa.Integer(),
                existing_nullable=False,
                autoincrement=False,
            )

    # ### end Alembic commands ###
def downgrade() -> None:
    """Revert the ``code`` primary-key column of the three EIA code tables back to TEXT."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Same alteration reversed, applied in the opposite order from upgrade().
    for table_name in (
        'core_eia__codes_wind_quality_class',
        'core_eia__codes_steam_plant_types',
        'core_eia__codes_sector_consolidated',
    ):
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            batch_op.alter_column(
                'code',
                existing_type=sa.Integer(),
                type_=sa.TEXT(),
                existing_nullable=False,
                autoincrement=False,
            )

    # ### end Alembic commands ###
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#! /usr/bin/env python | ||
"""Script that creates a DuckDB database from a collection of PUDL Parquet files.""" | ||
|
||
import logging | ||
from pathlib import Path | ||
|
||
import click | ||
import duckdb | ||
import sqlalchemy as sa | ||
|
||
from pudl.metadata import PUDL_PACKAGE | ||
from pudl.metadata.classes import Package | ||
|
||
# Configure logging | ||
logging.basicConfig(level=logging.INFO) | ||
logger = logging.getLogger(__name__) | ||
|
||
|
||
@click.command(
    name="parquet_to_duckdb",
    context_settings={"help_option_names": ["-h", "--help"]},
)
@click.argument(
    "parquet_dir",
    # Must be an existing directory: reject plain files up front.
    type=click.Path(exists=True, resolve_path=True, file_okay=False, dir_okay=True),
)
@click.argument(
    "duckdb_path", type=click.Path(resolve_path=True, writable=True, allow_dash=False)
)
@click.option(
    "--no-load",
    is_flag=True,
    show_default=True,
    default=False,
    help="Only create metadata, don't load data.",
)
@click.option(
    "--check-fks",
    is_flag=True,
    show_default=True,
    default=False,
    help="If true, enable foreign keys in the database. Currently, "
    "the parquet load process freezes up when foreign keys are enabled.",
)
def parquet_to_duckdb(
    parquet_dir: str, duckdb_path: str, no_load: bool, check_fks: bool
):
    """Convert a directory of Parquet files to a DuckDB database.

    Args:
        parquet_dir: Path to a directory of parquet files.
        duckdb_path: Path to the new DuckDB database file (should not exist).
        no_load: Only create metadata, don't load data.
        check_fks: If true, enable foreign keys in the database. Currently,
            the parquet load process freezes up when foreign keys are enabled.

    Raises:
        click.ClickException: If ``duckdb_path`` already exists.
        FileNotFoundError: If a table in the schema has no matching parquet file.

    Example:
        parquet_to_duckdb /path/to/parquet/directory duckdb.db
    """
    parquet_dir = Path(parquet_dir)
    duckdb_path = Path(duckdb_path)

    # Refuse to clobber an existing database. ClickException (rather than a
    # plain return) makes the process exit non-zero so callers can detect it.
    if duckdb_path.exists():
        raise click.ClickException(
            f"DuckDB file '{duckdb_path}' already exists. Please provide a new filename."
        )

    # Create the DuckDB schema from the PUDL package metadata. Resources with
    # names longer than 63 characters are skipped — presumably an identifier
    # length limit in the target database; confirm before relying on it.
    resource_ids = (r.name for r in PUDL_PACKAGE.resources if len(r.name) <= 63)
    package = Package.from_resource_ids(resource_ids)

    metadata = package.to_sql(dialect="duckdb", check_foreign_keys=check_fks)
    engine = sa.create_engine(f"duckdb:///{duckdb_path}")
    metadata.create_all(engine)

    if not no_load:
        # Load data into the DuckDB database from parquet files, iterating
        # tables in foreign-key dependency order so that referenced tables are
        # populated before the tables that reference them.
        with duckdb.connect(database=str(duckdb_path)) as duckdb_conn:
            duckdb_cursor = duckdb_conn.cursor()
            for table in metadata.sorted_tables:
                parquet_file_path = parquet_dir / f"{table.name}.parquet"
                if not parquet_file_path.exists():
                    raise FileNotFoundError(
                        f"Parquet file not found for table: {table.name}"
                    )
                logger.info(
                    f"Loading parquet file: {parquet_file_path} into {duckdb_path}"
                )
                # Table names come from PUDL's own metadata, not user input,
                # so interpolating them into the SQL text is safe here.
                duckdb_cursor.execute(
                    f"COPY {table.name} FROM '{parquet_file_path}' (FORMAT PARQUET);"
                )


if __name__ == "__main__":
    parquet_to_duckdb()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`create_duckdb()` or `create_and_distribute_duckdb()`?
(I have been loving the ShellCheck linter for VSCode)