Finalize/clean up ingest models, add additional preprocessing #1166

Merged: 13 commits, merged Oct 16, 2024
10 changes: 5 additions & 5 deletions dcpy/connectors/edm/recipes.py
@@ -73,7 +73,7 @@ def _archive_dataset(config: ingest.Config, file_path: Path, s3_path: str) -> None:
         BUCKET,
         tmp_dir_path,
         Path(s3_path),
-        acl=config.acl,
+        acl=config.archival.acl,
         contents_only=True,
     )

@@ -103,7 +103,7 @@ def archive_dataset(config: ingest.Config, file_path: Path, *, latest: bool = False):
     s3_path = s3_folder_path(config.dataset_key)
     _archive_dataset(config, file_path, s3_path)
     if latest:
-        set_latest(config.dataset_key, config.acl)
+        set_latest(config.dataset_key, config.archival.acl)


 def update_freshness(ds: DatasetKey, timestamp: datetime) -> datetime:
@@ -113,16 +113,16 @@ def update_freshness(ds: DatasetKey, timestamp: datetime) -> datetime:
         raise TypeError(
             f"Cannot update freshness of dataset {ds} as it was archived by library, not ingest"
         )
-    config.check_timestamps.append(timestamp)
+    config.archival.check_timestamps.append(timestamp)
     config_str = json.dumps(config.model_dump(mode="json"))
     s3.upload_file_obj(
         BytesIO(config_str.encode()),
         BUCKET,
         path,
-        config.acl,
+        config.archival.acl,
         metadata=s3.get_custom_metadata(BUCKET, path),
     )
-    return config.archival_timestamp
+    return config.archival.archival_timestamp


 def get_config(name: str, version="latest") -> library.Config | ingest.Config:
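These accessor changes (config.acl becoming config.archival.acl, config.archival_timestamp becoming config.archival.archival_timestamp) follow from the new nested ingest models. A minimal sketch of what they might look like, inferred from this diff rather than copied from dcpy.models.lifecycle.ingest:

```python
# Sketch only: field names inferred from this PR's diff, not the actual
# definitions in dcpy.models.lifecycle.ingest.
from datetime import datetime
from typing import Any

from pydantic import BaseModel


class ArchivalMetadata(BaseModel):
    archival_timestamp: datetime
    raw_filename: str
    acl: str  # e.g. "public-read"
    check_timestamps: list[datetime] = []  # appended to by update_freshness


class Ingestion(BaseModel):
    target_crs: str | None = None
    source: Any  # the real model is a union of typed sources (LocalFileSource, S3Source, ...)
    file_format: Any
    processing_mode: str | None = None
    processing_steps: list[Any] = []


class Config(BaseModel):
    id: str
    version: str
    archival: ArchivalMetadata  # was flat: config.acl is now config.archival.acl
    ingestion: Ingestion  # was flat: config.processing_steps is now config.ingestion.processing_steps
```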
53 changes: 38 additions & 15 deletions dcpy/lifecycle/ingest/configure.py
@@ -7,6 +7,8 @@
 import yaml

 from dcpy.models.lifecycle.ingest import (
+    ArchivalMetadata,
+    Ingestion,
     LocalFileSource,
     S3Source,
     ScriptSource,
@@ -97,19 +99,25 @@ def get_filename(source: Source, ds_id: str) -> str:


 def get_config(
-    dataset_id: str, version: str | None = None, mode: str | None = None
+    dataset_id: str,
+    version: str | None = None,
+    *,
+    mode: str | None = None,
+    template_dir: Path = TEMPLATE_DIR,
 ) -> Config:
     """Generate config object for dataset and optional version"""
     run_details = metadata.get_run_details()
-    template = read_template(dataset_id, version=version)
-    filename = get_filename(template.source, template.id)
-    version = version or get_version(template.source, run_details.timestamp)
-    template = read_template(dataset_id, version=version)
-    processing_steps = template.processing_steps
+    template = read_template(dataset_id, version=version, template_dir=template_dir)
+
+    filename = get_filename(template.ingestion.source, template.id)
+    version = version or get_version(template.ingestion.source, run_details.timestamp)
+    template = read_template(dataset_id, version=version, template_dir=template_dir)
+
+    processing_steps = template.ingestion.processing_steps

-    if template.target_crs:
+    if template.ingestion.target_crs:
         reprojection = PreprocessingStep(
-            name="reproject", args={"target_crs": template.target_crs}
+            name="reproject", args={"target_crs": template.ingestion.target_crs}
         )
         processing_steps = [reprojection] + processing_steps
@@ -121,24 +129,39 @@
         )
         processing_steps.append(clean_column_names)

+    if "multi" not in processing_step_names and template.has_geom:
+        multi = PreprocessingStep(name="multi")
+        processing_steps.append(multi)
+
     if mode:
         modes = {s.mode for s in processing_steps}
         if mode not in modes:
             raise ValueError(f"mode '{mode}' is not present in template '{dataset_id}'")

     processing_steps = [s for s in processing_steps if s.mode is None or s.mode == mode]

-    # create config object
-    return Config(
-        id=template.id,
-        version=version,
+    archival = ArchivalMetadata(
         archival_timestamp=run_details.timestamp,
         raw_filename=filename,
         acl=template.acl,
-        target_crs=template.target_crs,
-        source=template.source,
-        file_format=template.file_format,
+    )
+
+    ingestion = Ingestion(
+        target_crs=template.ingestion.target_crs,
+        source=template.ingestion.source,
+        file_format=template.ingestion.file_format,
         processing_mode=mode,
         processing_steps=processing_steps,
+    )
+
+    # create config object
+    return Config(
+        id=template.id,
+        version=version,
+        crs=ingestion.target_crs,
+        attributes=template.attributes,
+        archival=archival,
+        ingestion=ingestion,
         columns=template.columns,
         run_details=run_details,
     )
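Under the new signature, mode and template_dir are keyword-only arguments. A hypothetical call (the dataset id and directory below are illustrative choices, not values mandated by this PR):

```python
from pathlib import Path

# Hypothetical usage; get_config falls back to TEMPLATE_DIR when no
# template_dir is passed, and resolves the version from the source when omitted.
config = get_config(
    "dcp_commercialoverlay",
    mode=None,
    template_dir=Path("dcpy/lifecycle/ingest/templates"),
)
assert config.ingestion.target_crs == "EPSG:4326"  # set by this PR's template
```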
37 changes: 26 additions & 11 deletions dcpy/lifecycle/ingest/run.py
@@ -1,3 +1,4 @@
+import json
 import pandas as pd
 from pathlib import Path
 import typer
@@ -28,11 +29,11 @@ def update_freshness(
     comparison = recipes.read_df(config.dataset)
     if new.equals(comparison):
         original_archival_timestamp = recipes.update_freshness(
-            config.dataset_key, config.archival_timestamp
+            config.dataset_key, config.archival.archival_timestamp
         )
-        config.archival_timestamp = original_archival_timestamp
+        config.archival.archival_timestamp = original_archival_timestamp
         if latest:
-            recipes.set_latest(config.dataset_key, config.acl)
+            recipes.set_latest(config.dataset_key, config.archival.acl)
         return config
     else:
         raise FileExistsError(
@@ -49,34 +50,46 @@ def run(
     latest: bool = False,
     skip_archival: bool = False,
     output_csv: bool = False,
+    template_dir: Path = configure.TEMPLATE_DIR,
 ) -> Config:
-    config = configure.get_config(dataset_id, version=version, mode=mode)
-    transform.validate_processing_steps(config.id, config.processing_steps)
+    config = configure.get_config(
+        dataset_id, version=version, mode=mode, template_dir=template_dir
+    )
+    transform.validate_processing_steps(config.id, config.ingestion.processing_steps)
Review comment from @damonmcc (Member, Oct 15, 2024) on the validate_processing_steps call:

    is this redundant since transform.validate_processing_steps is called during transform.preprocess?
    and if we do wanna validate early, would it make sense to do it during configure.get_config or maybe read_template?

Reply (Member):

    talked about it and we'll keep it here. it's nicer to assemble these pieces at a high level rather than have ingest modules importing each other
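For context, a hedged sketch of the kind of check a step validator might perform; the real transform.validate_processing_steps is not shown in this diff, so this is illustrative only:

```python
# Illustrative only; not the actual implementation in dcpy's transform module.
def validate_steps_sketch(dataset_id: str, steps: list) -> None:
    # Step names that appear in this PR's templates and configure.py.
    known = {
        "reproject",
        "rename_columns",
        "clean_column_names",
        "multi",
        "append_prev",
        "upsert_column_of_previous_version",
    }
    unknown = [s.name for s in steps if s.name not in known]
    if unknown:
        raise ValueError(f"'{dataset_id}' has unknown processing steps: {unknown}")
```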


     if not staging_dir:
-        staging_dir = TMP_DIR / dataset_id / config.archival_timestamp.isoformat()
+        staging_dir = (
+            TMP_DIR / dataset_id / config.archival.archival_timestamp.isoformat()
+        )
         staging_dir.mkdir(parents=True)
     else:
         staging_dir.mkdir(parents=True, exist_ok=True)

     # download dataset
     extract.download_file_from_source(
-        config.source, config.raw_filename, config.version, staging_dir
+        config.ingestion.source,
+        config.archival.raw_filename,
+        config.version,
+        staging_dir,
     )
-    file_path = staging_dir / config.raw_filename
+    file_path = staging_dir / config.archival.raw_filename

     if not skip_archival:
         # archive to edm-recipes/raw_datasets
-        recipes.archive_raw_dataset(config, staging_dir / config.raw_filename)
+        recipes.archive_raw_dataset(config, file_path)

     init_parquet = "init.parquet"
     transform.to_parquet(
-        config.file_format, file_path, dir=staging_dir, output_filename=init_parquet
+        config.ingestion.file_format,
+        file_path,
+        dir=staging_dir,
+        output_filename=init_parquet,
     )

     transform.preprocess(
         config.id,
-        config.processing_steps,
+        config.ingestion.processing_steps,
         config.columns,
         staging_dir / init_parquet,
         staging_dir / config.filename,
         output_csv=output_csv,
@@ -90,6 +103,8 @@ def run(
         config, staging_dir / config.filename, latest=latest
     )

+    with open(staging_dir / "config.json", "w") as f:
+        json.dump(config.model_dump(mode="json"), f, indent=4)
     return config
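run() now threads template_dir through to configure.get_config and persists the resolved config as config.json next to the staged data. A hedged example of a local dry run (flag values are illustrative):

```python
# Hypothetical local invocation; skip_archival avoids any writes to S3.
config = run(
    "dcp_commercialoverlay",
    version=None,  # resolved from the source when omitted
    skip_archival=True,
    output_csv=True,
)
# The staging directory now holds the raw download, init.parquet,
# the processed output, and the serialized config.json.
```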
32 changes: 32 additions & 0 deletions dcpy/lifecycle/ingest/templates/dcp_commercialoverlay.yml
@@ -0,0 +1,32 @@
id: dcp_commercialoverlay
acl: public-read

attributes:
  name: DCP NYC Commercial Overlay Districts
  description: |
    Polygon features representing the within-tax-block limits for commercial overlay districts,
    as shown on the DCP zoning maps. Commercial overlay district designations are indicated in the OVERLAY attribute.
  url: https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-gis-zoning.page#metadata

ingestion:
  target_crs: EPSG:4326
  source:
    type: edm_publishing_gis_dataset
    name: dcp_commercial_overlays
  file_format:
    type: shapefile
    crs: EPSG:2263
  processing_steps:
    - name: rename_columns
      args:
        map: {"geom": "wkb_geometry"}

columns:
  - id: overlay
    data_type: text
  - id: shape_leng
    data_type: decimal
  - id: shape_area
    data_type: decimal
  - id: wkb_geometry
    data_type: geometry
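As a worked example of the configure.py logic above, this is roughly the step list get_config would assemble for this template: reproject is prepended because ingestion.target_crs is set, the template's own rename_columns follows, and clean_column_names (plus multi, since the dataset has a geometry column) are appended. The args of the appended steps are assumptions:

```python
# Illustrative result for dcp_commercialoverlay; appended-step args are assumed.
steps = [
    PreprocessingStep(name="reproject", args={"target_crs": "EPSG:4326"}),
    PreprocessingStep(name="rename_columns", args={"map": {"geom": "wkb_geometry"}}),
    PreprocessingStep(name="clean_column_names"),  # appended by get_config
    PreprocessingStep(name="multi"),  # appended because the dataset has geometry
]
```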
72 changes: 24 additions & 48 deletions dcpy/lifecycle/ingest/templates/dcp_pop_acs2010_demographic.yml
@@ -1,52 +1,28 @@
 id: dcp_pop_acs2010_demographic
 acl: public-read
-source:
-  type: local_file
-  path: .library/upload/CCD2023_ACS0610Data_for1822Update.xlsx
-file_format:
-  type: xlsx
-  sheet_name: CCD2023_Dem0610
-  dtype:
-    GeoID: str
-processing_steps:
-  - name: clean_column_names
-    args:
-      lower: true
-  - name: append_prev
-    mode: append
-  - name: upsert_column_of_previous_version
-    args:
-      key: [geotype, geoid]
-      insert_behavior: error
-      missing_key_behavior: error
-    mode: update_column
-
-library_dataset:
-  name: dcp_pop_acs2010_demographic
-  version: ""
-  acl: public-read
-  source:
-    script:
-      name: excel
-      path: https://nyc3.digitaloceanspaces.com/edm-recipes/inbox/dcp_pop_acs2010/{{ version }}/dcp_pop_acs.xlsx
-      sheet_name: Dem0610
-    geometry:
-      SRS: null
-      type: NONE
-
-  destination:
-    geometry:
-      SRS: null
-      type: NONE
-    fields: []
-    sql: null
-
-  info:
-    description: |
-      ## 2010 ACS file from Population
-      This file is produced internally by the Population division. 2010 version is used as a reference dataset
-      for the latest ACS data, and occasionally is modified so these different subsections are archived as their
-      own recipe datasets so that they can easily be updated individually
-
-    url: null
-    dependents: []
+
+attributes:
+  name: DCP Population 2010 ACS Demographic Data
+  description: |
+    This file is produced internally by the Population division. 2010 version is used as a reference dataset
+    for the latest ACS data, and occasionally is modified so these different subsections are archived as their
+    own recipe datasets so that they can easily be updated individually
+
+ingestion:
+  source:
+    type: local_file
+    path: .library/upload/CCD2023_ACS0610Data_for1822Update.xlsx
+  file_format:
+    type: xlsx
+    sheet_name: CCD2023_Dem0610
+    dtype:
+      GeoID: str
+  processing_steps:
+    - name: append_prev
+      mode: append
+    - name: upsert_column_of_previous_version
+      args:
+        key: [geotype, geoid]
+        insert_behavior: error
+        missing_key_behavior: error
+      mode: update_column
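The append_prev and upsert_column_of_previous_version steps above are mode-gated: get_config keeps a step only if its mode is unset or matches the requested mode. A small sketch of that filter using this template's steps (the PreprocessingStep shape is assumed):

```python
# Assumed shape: PreprocessingStep exposes .name and .mode, as used in configure.get_config.
steps = [
    PreprocessingStep(name="append_prev", mode="append"),
    PreprocessingStep(name="upsert_column_of_previous_version", mode="update_column"),
]
mode = "append"
selected = [s for s in steps if s.mode is None or s.mode == mode]
# selected -> only append_prev; with mode="update_column", only the upsert step runs
```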
72 changes: 24 additions & 48 deletions dcpy/lifecycle/ingest/templates/dcp_pop_acs2010_economic.yml
@@ -1,52 +1,28 @@
 id: dcp_pop_acs2010_economic
 acl: public-read
-source:
-  type: local_file
-  path: .library/upload/CCD2023_ACS0610Data_for1822Update.xlsx
-file_format:
-  type: xlsx
-  sheet_name: CCD2023_Econ0610_NotInflated
-  dtype:
-    GeoID: str
-processing_steps:
-  - name: clean_column_names
-    args:
-      lower: true
-  - name: append_prev
-    mode: append
-  - name: upsert_column_of_previous_version
-    args:
-      key: [geotype, geoid]
-      insert_behavior: error
-      missing_key_behavior: error
-    mode: update_column
-
-library_dataset:
-  name: dcp_pop_acs2010_economic
-  version: ""
-  acl: public-read
-  source:
-    script:
-      name: excel
-      path: https://nyc3.digitaloceanspaces.com/edm-recipes/inbox/dcp_pop_acs2010/{{ version }}/dcp_pop_acs.xlsx
-      sheet_name: Econ0610
-    geometry:
-      SRS: null
-      type: NONE
-
-  destination:
-    geometry:
-      SRS: null
-      type: NONE
-    fields: []
-    sql: null
-
-  info:
-    description: |
-      ## 2010 ACS file from Population
-      This file is produced internally by the Population division. 2010 version is used as a reference dataset
-      for the latest ACS data, and occasionally is modified so these different subsections are archived as their
-      own recipe datasets so that they can easily be updated individually
-
-    url: null
-    dependents: []
+
+attributes:
+  name: DCP Population 2010 ACS Economic Data
+  description: |
+    This file is produced internally by the Population division. 2010 version is used as a reference dataset
+    for the latest ACS data, and occasionally is modified so these different subsections are archived as their
+    own recipe datasets so that they can easily be updated individually
+
+ingestion:
+  source:
+    type: local_file
+    path: .library/upload/CCD2023_ACS0610Data_for1822Update.xlsx
+  file_format:
+    type: xlsx
+    sheet_name: CCD2023_Econ0610_NotInflated
+    dtype:
+      GeoID: str
+  processing_steps:
+    - name: append_prev
+      mode: append
+    - name: upsert_column_of_previous_version
+      args:
+        key: [geotype, geoid]
+        insert_behavior: error
+        missing_key_behavior: error
+      mode: update_column