Skip to content

Commit

Permalink
retool ingest validation commands for cli
Browse files Browse the repository at this point in the history
  • Loading branch information
fvankrieken committed Nov 13, 2024
1 parent 3804b49 commit b2348b6
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 74 deletions.
10 changes: 6 additions & 4 deletions dcpy/data/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,10 +234,11 @@ def get_sql_keyed_report(
client: postgres.PostgresClient,
*,
ignore_columns: list[str] | None = None,
) -> comparison.Report:
) -> comparison.SqlReport:
left_rows = client.execute_select_query(f"SELECT count(*) AS count FROM {left}")
right_rows = client.execute_select_query(f"SELECT count(*) AS count FROM {right}")
return comparison.Report(
return comparison.SqlReport(

Check warning on line 240 in dcpy/data/compare.py

View check run for this annotation

Codecov / codecov/patch

dcpy/data/compare.py#L240

Added line #L240 was not covered by tests
tables=comparison.Simple[str](left=left, right=right),
row_count=comparison.Simple[int](
left=left_rows["count"][0], right=right_rows["count"][0]
),
Expand All @@ -258,10 +259,11 @@ def get_sql_report(
client: postgres.PostgresClient,
*,
ignore_columns: list[str] | None = None,
) -> comparison.Report:
) -> comparison.SqlReport:
left_rows = client.execute_select_query(f"SELECT count(*) AS count FROM {left}")
right_rows = client.execute_select_query(f"SELECT count(*) AS count FROM {right}")
return comparison.Report(
return comparison.SqlReport(

Check warning on line 265 in dcpy/data/compare.py

View check run for this annotation

Codecov / codecov/patch

dcpy/data/compare.py#L265

Added line #L265 was not covered by tests
tables=comparison.Simple[str](left=left, right=right),
row_count=comparison.Simple[int](
left=left_rows["count"][0], right=right_rows["count"][0]
),
Expand Down
2 changes: 2 additions & 0 deletions dcpy/lifecycle/scripts/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from .package_and_distribute import app as package_dist_app
from .product_metadata import app as product_metadata_app
from .ingest_with_library_fallback import run as ingest_or_library_archive
from .ingest_validation import app as ingest_validation_app

Check warning on line 5 in dcpy/lifecycle/scripts/_cli.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/_cli.py#L5

Added line #L5 was not covered by tests

app = typer.Typer()

app.add_typer(package_dist_app, name="package_and_dist")
app.add_typer(product_metadata_app, name="product_metadata")
app.add_typer(ingest_validation_app, name="validate_ingest")

Check warning on line 11 in dcpy/lifecycle/scripts/_cli.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/_cli.py#L11

Added line #L11 was not covered by tests
app.command(name="ingest_or_library_archive")(ingest_or_library_archive)
179 changes: 111 additions & 68 deletions dcpy/lifecycle/scripts/ingest_validation.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,95 @@
import os

Check warning on line 1 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L1

Added line #L1 was not covered by tests
from pathlib import Path
import shutil
import typer
from typing import Literal

Check warning on line 5 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L4-L5

Added lines #L4 - L5 were not covered by tests

from dcpy.utils import postgres
from dcpy.utils.collections import indented_report
from dcpy.models.data import comparison
from dcpy.data import compare

Check warning on line 10 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L8-L10

Added lines #L8 - L10 were not covered by tests
from dcpy.connectors.edm import recipes
from dcpy.lifecycle.ingest import run as ingest
from dcpy.lifecycle.builds import metadata as build_metadata

Check warning on line 13 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L13

Added line #L13 was not covered by tests

# Postgres database name handed to PostgresClient by the loaders in this module.
DATABASE = "sandbox"
# Per-dataset root under the default recipes library path; the "library" and
# "ingest" outputs of a dataset are placed beneath LIBRARY_PATH / <dataset>.
LIBRARY_PATH = recipes.LIBRARY_DEFAULT_PATH / "datasets"
# NOTE(review): the two print() calls below execute at import time, so they also
# fire whenever the lifecycle CLI imports this module. They look like leftover
# debugging output -- confirm whether they should be removed or moved to logging.
print(os.environ.get("BUILD_NAME"))
# Postgres schema used by this module, derived from the BUILD_NAME environment
# variable (which may be unset, in which case None is passed to build_name).
SCHEMA = build_metadata.build_name(os.environ.get("BUILD_NAME"))
print(SCHEMA)

Check warning on line 19 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L15-L19

Added lines #L15 - L19 were not covered by tests


def call_library(dataset: str, version: str | None = None, file_type="pgdump"):
    """Archive ``dataset`` via library and move its output to a fixed "library" path.

    Runs ``Archive`` for the dataset with the requested output format and
    version, then renames ``LIBRARY_PATH / dataset / <config.version>`` to
    ``LIBRARY_PATH / dataset / "library"``, removing any directory that is
    already at that destination.

    Args:
        dataset: Name of the dataset to archive.
        version: Version passed through to the archive call; may be None.
        file_type: Output format passed through to the archive call.
    """
    # BEWARE: once you import library, parquet file writing fails
    # Something to do with gdal's interaction with parquet file driver
    from dcpy.library.archive import Archive

    archive = Archive()
    config = archive(name=dataset, output_format=file_type, version=version)

    # We're running ingest too, so change version after the fact
    # Can't just feed this version to archive call because of datasets that
    # template in the version
    dataset_dir = LIBRARY_PATH / dataset
    target_dir = dataset_dir / "library"
    if target_dir.is_dir():
        shutil.rmtree(target_dir)
    os.rename(dataset_dir / config.version, target_dir)

Check warning on line 34 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L33-L34

Added lines #L33 - L34 were not covered by tests


def call_ingest(
    dataset: str, version: str | None = None, ingest_parent_dir: Path = ingest.TMP_DIR
) -> None:
    """Run ingest for ``dataset`` and copy its parquet output into the library tree.

    Ingest is run with archival skipped, staging into a freshly cleared
    ``ingest_parent_dir / dataset / "staging"`` directory. The resulting
    ``<dataset>.parquet`` is then copied to
    ``LIBRARY_PATH / dataset / "ingest" / <dataset>.parquet``.

    Args:
        dataset: Name of the dataset to ingest.
        version: Version passed through to ``ingest.run``; may be None.
        ingest_parent_dir: Parent directory under which the staging folder lives.
    """
    staging_dir = ingest_parent_dir / dataset / "staging"
    # Start from an empty staging folder so nothing from a prior run lingers.
    if staging_dir.is_dir():
        shutil.rmtree(staging_dir)
    ingest.run(dataset, version=version, staging_dir=staging_dir, skip_archival=True)

    parquet_name = f"{dataset}.parquet"
    destination = LIBRARY_PATH / dataset / "ingest" / parquet_name
    destination.parent.mkdir(exist_ok=True, parents=True)
    shutil.copy(staging_dir / parquet_name, destination)

Check warning on line 49 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L48-L49

Added lines #L48 - L49 were not covered by tests


def load_recipe(
    dataset: str,
    version: Literal["library", "ingest"],
    file_type: recipes.DatasetType | None = None,
) -> None:
    """Import one archived copy of ``dataset`` into postgres as ``<dataset>_<version>``.

    Args:
        dataset: Dataset id to import.
        version: Which copy to load; also used as the table-name suffix.
        file_type: Format of the archived file. When falsy, it defaults to
            pg_dump for the "library" copy and parquet otherwise.
    """
    if not file_type:
        file_type = (
            recipes.DatasetType.pg_dump
            if version == "library"
            else recipes.DatasetType.parquet
        )

    target_table = f"{dataset}_{version}"
    client = postgres.PostgresClient(schema=SCHEMA, database=DATABASE)

    # Clear both the bare dataset table and the suffixed target before importing.
    # NOTE(review): presumably the bare name is dropped because the import may
    # pass through it before ending up as `import_as` -- confirm.
    for table in (dataset, target_table):
        client.drop_table(table)

    recipes.import_dataset(
        recipes.Dataset(id=dataset, version=version, file_type=file_type),
        client,
        import_as=target_table,
    )


def compare_recipes_in_postgres(
dataset: str,
left_version: str,
right_version: str,
left_version: str = "library",
right_version: str = "ingest",
*,
build_name: str,
key_columns: list[str] | None = None,
ignore_columns: list[str] | None = None,
local_library_dir: Path = recipes.LIBRARY_DEFAULT_PATH,
left_type: recipes.DatasetType = recipes.DatasetType.pg_dump,
right_type: recipes.DatasetType = recipes.DatasetType.pg_dump,
) -> comparison.Report:
) -> comparison.SqlReport:
ignore_columns = ignore_columns or []
ignore_columns.append("ogc_fid")

Check warning on line 86 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L86

Added line #L86 was not covered by tests
ignore_columns.append("data_library_version")
left_table = dataset + "_left"
right_table = dataset + "_right"

client = postgres.PostgresClient(schema=build_name, database="sandbox")
client.drop_table(dataset)
client.drop_table(left_table)
client.drop_table(right_table)

left_ds = recipes.Dataset(id=dataset, version=left_version, file_type=left_type)
right_ds = recipes.Dataset(id=dataset, version=right_version, file_type=right_type)
client = postgres.PostgresClient(schema=SCHEMA, database="sandbox")
left_table = dataset + "_" + left_version
right_table = dataset + "_" + right_version

Check warning on line 91 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L89-L91

Added lines #L89 - L91 were not covered by tests

recipes.import_dataset(
left_ds,
client,
import_as=left_table,
local_library_dir=local_library_dir,
)
recipes.import_dataset(
right_ds,
client,
import_as=right_table,
local_library_dir=local_library_dir,
)
if key_columns:
return compare.get_sql_keyed_report(
left_table,
Expand All @@ -62,50 +107,48 @@ def compare_recipes_in_postgres(
)


def run_ingest_and_library(
dataset: str,
ingest_parent_dir: Path = Path("."),
library_file_type: str = "pg_dump",
) -> None:
ingest_dir = ingest_parent_dir / dataset / "special_folder"
ingest.run(dataset, staging_dir=ingest_dir, skip_archival=True)
app = typer.Typer()

Check warning on line 110 in dcpy/lifecycle/scripts/ingest_validation.py

View check run for this annotation

Codecov / codecov/patch

dcpy/lifecycle/scripts/ingest_validation.py#L110

Added line #L110 was not covered by tests

# BEWARE: once you import library, parquet file writing fails
# Something to do with gdal's interaction with parquet file driver
from dcpy.library.archive import Archive

a = Archive()
a(name=dataset, output_format=library_file_type, version="library")
@app.command("load_single")
def load_single(
tool: str = typer.Argument(),
dataset: str = typer.Argument(),
version: str | None = typer.Option(None, "--version", "-v"),
):
if tool == "library":
call_library(dataset, version)
elif tool == "ingest":
call_ingest(dataset, version)
else:
raise NotImplementedError("'tool' must be either 'library' or 'ingest'")

ingest_output_path = ingest_dir / f"{dataset}.parquet"
ingest_path = (
Path(".library") / "datasets" / dataset / "ingest" / f"{dataset}.parquet"
)
ingest_path.parent.mkdir(exist_ok=True, parents=True)
shutil.copy(ingest_output_path, ingest_path)
load_recipe(dataset, tool) # type: ignore


def compare_ingest_and_library(
dataset: str,
key_columns: list[str] | None,
build_name: str,
*,
ignore_columns: list[str] | None = None,
library_file_type: str = "pgdump",
ingest_parent_dir: Path = Path("."),
) -> comparison.Report:
run_ingest_and_library(
dataset,
ingest_parent_dir=ingest_parent_dir,
library_file_type=library_file_type,
@app.command("load")
def _load_both(
    dataset: str = typer.Argument(),
    version: str | None = typer.Option(None, "--version", "-v"),
) -> None:
    # CLI command "load": produce both the ingest and the library copy of
    # `dataset`, then import each into postgres as <dataset>_library and
    # <dataset>_ingest.
    #
    # NOTE(review): ingest is run before library; presumably this ordering is
    # deliberate, since importing library is reported to break parquet writing
    # afterwards -- confirm before reordering.
    call_ingest(dataset, version)
    call_library(dataset, version)

    load_recipe(dataset, "library")
    load_recipe(dataset, "ingest")


@app.command("compare")
def _compare(
dataset: str = typer.Argument(),
key_columns: list[str] = typer.Option(None, "-k", "--key"),
ignore_columns: list[str] = typer.Option(None, "-i", "--ignore"),
):
report = compare_recipes_in_postgres(
dataset, key_columns=key_columns, ignore_columns=ignore_columns
)
return compare_recipes_in_postgres(
dataset,
"library",
"ingest",
key_columns=key_columns,
build_name=build_name,
left_type=recipes.DatasetType.pg_dump,
right_type=recipes.DatasetType.parquet,
ignore_columns=ignore_columns,
print(
indented_report(
report.model_dump(), pretty_print_fields=True, include_line_breaks=True
)
)
18 changes: 16 additions & 2 deletions dcpy/models/data/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from pydantic import BaseModel, Field
from typing import TypeVar, Generic

from dcpy.models.base import ModelWithDataFrame
from dcpy.models.base import SortedSerializedBase, ModelWithDataFrame

T = TypeVar("T")

Expand Down Expand Up @@ -37,7 +37,21 @@ class SimpleTable(ModelWithDataFrame):
right_only: pd.DataFrame | None


class Report(BaseModel):
class Report(SortedSerializedBase):
    # Result of comparing two tables: row counts, column-level differences and
    # data-level differences (keyed or simple, depending on the comparison run).
    row_count: Simple[int]
    column_comparison: Columns
    data_comparison: KeyedTable | SimpleTable

    # Serialization settings.
    # NOTE(review): presumably read by SortedSerializedBase when dumping the
    # model (keep falsey values; emit these fields first, in this order) -- confirm.
    _exclude_falsey_values: bool = False
    _head_sort_order: list[str] = [
        "row_count",
        "column_comparison",
        "data_comparison",
    ]


class SqlReport(Report):
    # Report for a comparison run in SQL; additionally records the names of the
    # left and right tables that were compared.
    tables: Simple[str]

    # Same ordering as Report, with `tables` placed first.
    # NOTE(review): presumably consumed by SortedSerializedBase -- confirm.
    _head_sort_order: list[str] = [
        "tables",
        "row_count",
        "column_comparison",
        "data_comparison",
    ]

0 comments on commit b2348b6

Please sign in to comment.