retool ingest validation commands for cli

fvankrieken · fvankrieken · commit b2348b6e3b29 · 2024-11-13T15:41:12.000-05:00
diff --git a/dcpy/data/compare.py b/dcpy/data/compare.py
@@ -234,10 +234,11 @@ def get_sql_keyed_report(
     client: postgres.PostgresClient,
     *,
     ignore_columns: list[str] | None = None,
-) -> comparison.Report:
+) -> comparison.SqlReport:
     left_rows = client.execute_select_query(f"SELECT count(*) AS count FROM {left}")
     right_rows = client.execute_select_query(f"SELECT count(*) AS count FROM {right}")
-    return comparison.Report(
+    return comparison.SqlReport(
+        tables=comparison.Simple[str](left=left, right=right),
         row_count=comparison.Simple[int](
             left=left_rows["count"][0], right=right_rows["count"][0]
         ),
@@ -258,10 +259,11 @@ def get_sql_report(
     client: postgres.PostgresClient,
     *,
     ignore_columns: list[str] | None = None,
-) -> comparison.Report:
+) -> comparison.SqlReport:
     left_rows = client.execute_select_query(f"SELECT count(*) AS count FROM {left}")
     right_rows = client.execute_select_query(f"SELECT count(*) AS count FROM {right}")
-    return comparison.Report(
+    return comparison.SqlReport(
+        tables=comparison.Simple[str](left=left, right=right),
         row_count=comparison.Simple[int](
             left=left_rows["count"][0], right=right_rows["count"][0]
         ),
diff --git a/dcpy/lifecycle/scripts/_cli.py b/dcpy/lifecycle/scripts/_cli.py
@@ -2,9 +2,11 @@
 from .package_and_distribute import app as package_dist_app
 from .product_metadata import app as product_metadata_app
 from .ingest_with_library_fallback import run as ingest_or_library_archive
+from .ingest_validation import app as ingest_validation_app
 
 app = typer.Typer()
 
 app.add_typer(package_dist_app, name="package_and_dist")
 app.add_typer(product_metadata_app, name="product_metadata")
+app.add_typer(ingest_validation_app, name="validate_ingest")
 app.command(name="ingest_or_library_archive")(ingest_or_library_archive)
diff --git a/dcpy/lifecycle/scripts/ingest_validation.py b/dcpy/lifecycle/scripts/ingest_validation.py
@@ -1,50 +1,95 @@
+import os
 from pathlib import Path
 import shutil
+import typer
+from typing import Literal
 
 from dcpy.utils import postgres
+from dcpy.utils.collections import indented_report
 from dcpy.models.data import comparison
 from dcpy.data import compare
 from dcpy.connectors.edm import recipes
 from dcpy.lifecycle.ingest import run as ingest
+from dcpy.lifecycle.builds import metadata as build_metadata
+
+DATABASE = "sandbox"
+LIBRARY_PATH = recipes.LIBRARY_DEFAULT_PATH / "datasets"
+print(os.environ.get("BUILD_NAME"))
+SCHEMA = build_metadata.build_name(os.environ.get("BUILD_NAME"))
+print(SCHEMA)
+
+
+def call_library(dataset: str, version: str | None = None, file_type="pgdump"):
+    # BEWARE: library is imported lazily here because once it is imported, parquet file writing fails
+    # (seems to be caused by gdal's interaction with the parquet file driver)
+    from dcpy.library.archive import Archive
+
+    a = Archive()
+    config = a(name=dataset, output_format=file_type, version=version)
+    # We're running ingest too, so rename the archived version folder to "library" after the fact
+    # Can't just pass "library" as the version to the archive call because of datasets that template in the version
+    target_dir = LIBRARY_PATH / dataset / "library"
+    if target_dir.is_dir():
+        shutil.rmtree(target_dir)
+    os.rename(LIBRARY_PATH / dataset / config.version, target_dir)
+
+
+def call_ingest(
+    dataset: str, version: str | None = None, ingest_parent_dir: Path = ingest.TMP_DIR
+) -> None:
+    ingest_dir = ingest_parent_dir / dataset / "staging"
+    if ingest_dir.is_dir():
+        shutil.rmtree(ingest_dir)
+    ingest.run(dataset, version=version, staging_dir=ingest_dir, skip_archival=True)
+
+    ingest_output_path = ingest_dir / f"{dataset}.parquet"
+    ingest_path = LIBRARY_PATH / dataset / "ingest" / f"{dataset}.parquet"
+
+    ingest_path.parent.mkdir(exist_ok=True, parents=True)
+    shutil.copy(ingest_output_path, ingest_path)
+
+
+def load_recipe(
+    dataset: str,
+    version: Literal["library", "ingest"],
+    file_type: recipes.DatasetType | None = None,
+) -> None:
+    if not file_type:
+        if version == "library":
+            file_type = recipes.DatasetType.pg_dump
+        else:
+            file_type = recipes.DatasetType.parquet
+
+    target_table = f"{dataset}_{version}"
+
+    client = postgres.PostgresClient(schema=SCHEMA, database=DATABASE)
+    client.drop_table(dataset)
+    client.drop_table(target_table)
+
+    left_ds = recipes.Dataset(id=dataset, version=version, file_type=file_type)
+    recipes.import_dataset(
+        left_ds,
+        client,
+        import_as=target_table,
+    )
 
 
 def compare_recipes_in_postgres(
     dataset: str,
-    left_version: str,
-    right_version: str,
+    left_version: str = "library",
+    right_version: str = "ingest",
     *,
-    build_name: str,
     key_columns: list[str] | None = None,
     ignore_columns: list[str] | None = None,
-    local_library_dir: Path = recipes.LIBRARY_DEFAULT_PATH,
-    left_type: recipes.DatasetType = recipes.DatasetType.pg_dump,
-    right_type: recipes.DatasetType = recipes.DatasetType.pg_dump,
-) -> comparison.Report:
+) -> comparison.SqlReport:
     ignore_columns = ignore_columns or []
+    ignore_columns.append("ogc_fid")
     ignore_columns.append("data_library_version")
-    left_table = dataset + "_left"
-    right_table = dataset + "_right"
 
-    client = postgres.PostgresClient(schema=build_name, database="sandbox")
-    client.drop_table(dataset)
-    client.drop_table(left_table)
-    client.drop_table(right_table)
-
-    left_ds = recipes.Dataset(id=dataset, version=left_version, file_type=left_type)
-    right_ds = recipes.Dataset(id=dataset, version=right_version, file_type=right_type)
+    client = postgres.PostgresClient(schema=SCHEMA, database="sandbox")
+    left_table = dataset + "_" + left_version
+    right_table = dataset + "_" + right_version
 
-    recipes.import_dataset(
-        left_ds,
-        client,
-        import_as=left_table,
-        local_library_dir=local_library_dir,
-    )
-    recipes.import_dataset(
-        right_ds,
-        client,
-        import_as=right_table,
-        local_library_dir=local_library_dir,
-    )
     if key_columns:
         return compare.get_sql_keyed_report(
             left_table,
@@ -62,50 +107,48 @@ def compare_recipes_in_postgres(
         )
 
 
-def run_ingest_and_library(
-    dataset: str,
-    ingest_parent_dir: Path = Path("."),
-    library_file_type: str = "pg_dump",
-) -> None:
-    ingest_dir = ingest_parent_dir / dataset / "special_folder"
-    ingest.run(dataset, staging_dir=ingest_dir, skip_archival=True)
+app = typer.Typer()
 
-    # BEWARE: once you import library, parquet file writing fails
-    # Something to do with gdal's interaction with parquet file driver
-    from dcpy.library.archive import Archive
 
-    a = Archive()
-    a(name=dataset, output_format=library_file_type, version="library")
+@app.command("load_single")
+def load_single(
+    tool: str = typer.Argument(),
+    dataset: str = typer.Argument(),
+    version: str | None = typer.Option(None, "--version", "-v"),
+):
+    if tool == "library":
+        call_library(dataset, version)
+    elif tool == "ingest":
+        call_ingest(dataset, version)
+    else:
+        raise NotImplementedError("'tool' must be either 'library' or 'ingest'")
 
-    ingest_output_path = ingest_dir / f"{dataset}.parquet"
-    ingest_path = (
-        Path(".library") / "datasets" / dataset / "ingest" / f"{dataset}.parquet"
-    )
-    ingest_path.parent.mkdir(exist_ok=True, parents=True)
-    shutil.copy(ingest_output_path, ingest_path)
+    load_recipe(dataset, tool)  # type: ignore
 
 
-def compare_ingest_and_library(
-    dataset: str,
-    key_columns: list[str] | None,
-    build_name: str,
-    *,
-    ignore_columns: list[str] | None = None,
-    library_file_type: str = "pgdump",
-    ingest_parent_dir: Path = Path("."),
-) -> comparison.Report:
-    run_ingest_and_library(
-        dataset,
-        ingest_parent_dir=ingest_parent_dir,
-        library_file_type=library_file_type,
+@app.command("load")
+def _load_both(
+    dataset: str = typer.Argument(),
+    version: str | None = typer.Option(None, "--version", "-v"),
+):
+    call_ingest(dataset, version)
+    call_library(dataset, version)
+
+    load_recipe(dataset, "library")
+    load_recipe(dataset, "ingest")
+
+
+@app.command("compare")
+def _compare(
+    dataset: str = typer.Argument(),
+    key_columns: list[str] = typer.Option(None, "-k", "--key"),
+    ignore_columns: list[str] = typer.Option(None, "-i", "--ignore"),
+):
+    report = compare_recipes_in_postgres(
+        dataset, key_columns=key_columns, ignore_columns=ignore_columns
     )
-    return compare_recipes_in_postgres(
-        dataset,
-        "library",
-        "ingest",
-        key_columns=key_columns,
-        build_name=build_name,
-        left_type=recipes.DatasetType.pg_dump,
-        right_type=recipes.DatasetType.parquet,
-        ignore_columns=ignore_columns,
+    print(
+        indented_report(
+            report.model_dump(), pretty_print_fields=True, include_line_breaks=True
+        )
     )
diff --git a/dcpy/models/data/comparison.py b/dcpy/models/data/comparison.py
@@ -2,7 +2,7 @@
 from pydantic import BaseModel, Field
 from typing import TypeVar, Generic
 
-from dcpy.models.base import ModelWithDataFrame
+from dcpy.models.base import SortedSerializedBase, ModelWithDataFrame
 
 T = TypeVar("T")
 
@@ -37,7 +37,21 @@ class SimpleTable(ModelWithDataFrame):
     right_only: pd.DataFrame | None
 
 
-class Report(BaseModel):
+class Report(SortedSerializedBase):
     row_count: Simple[int]
     column_comparison: Columns
     data_comparison: KeyedTable | SimpleTable
+
+    _exclude_falsey_values: bool = False
+    _head_sort_order: list[str] = ["row_count", "column_comparison", "data_comparison"]
+
+
+class SqlReport(Report):
+    tables: Simple[str]
+
+    _head_sort_order: list[str] = [
+        "tables",
+        "row_count",
+        "column_comparison",
+        "data_comparison",
+    ]