Restore delete_deposition method, update AEO doi, sort FERC taxonomies #419

Merged
merged 1 commit into from Sep 9, 2024

57 changes: 57 additions & 0 deletions .github/ISSUE_TEMPLATE/early_release_checker.md
@@ -0,0 +1,57 @@
---
name: Early release checker
about: During the months when early release data is published, run the archiver daily.
title: Publish {{ date | date('MMMM Do YYYY') }} early release archives
labels: automation, zenodo
assignees: aesharpe

---

# Summary of results:
See the job run logs and results [here]({{ env.RUN_URL }}).

# Review and publish archives

For each of the following archives, find the run status in the GitHub archiver run. If validation tests pass, manually review the archive and publish it. If no changes are detected, delete the draft. If changes are detected, manually review the archive following the guidelines in step 3 of `README.md`, then publish the new version. Then check the box here to confirm publication status, adding a note on the outcome (e.g., "v1 published", "no changes detected, draft deleted"):

```[tasklist]
- [ ] eia176
- [ ] eia191
- [ ] eia757a
- [ ] eia860
- [ ] eia860m
- [ ] eia861
- [ ] eia923
- [ ] eia930
- [ ] eiaaeo
- [ ] eiawater
- [ ] eia_bulk_elec
- [ ] epacamd_eia
- [ ] ferc1
- [ ] ferc2
- [ ] ferc6
- [ ] ferc60
- [ ] ferc714
- [ ] mshamines
- [ ] nrelatb
- [ ] phmsagas
- [ ] epacems
```

# Validation failures
For each run that failed because of validation test failures (seen in the GHA logs), add it to the tasklist. Download the run summary JSON by going into the "Upload run summaries" tab of the GHA run for each dataset and following the link. Investigate the validation failure.

If the validation failure is deemed ok after manual review (e.g., Q2 of 2024 data doubles the size of a file that only had Q1 data previously, but the new data looks as expected), go ahead and approve the archive and leave a note explaining your decision in the task list.

If the validation failure is blocking (e.g., file format incorrect, whole dataset changes size by 200%), make an issue to resolve it.

```[tasklist]
- [ ] dataset
```

# Other failures
For each run that failed because of another reason (e.g., underlying data changes, code failures), create an issue describing the failure and take necessary steps to resolve it.

```[tasklist]
- [ ] dataset
```
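The validation triage described in this template can also be scripted against the downloaded run summary JSON. Below is a minimal sketch of such a helper; the field names used (`validation_tests`, `success`, `name`, `notes`) are assumptions about the summary schema rather than something confirmed by this PR:

```python
import json
from pathlib import Path


def failed_validations(summary_path: Path) -> list[dict]:
    """Return the validation test entries that did not pass.

    The keys below ("validation_tests", "success", "name", "notes") are
    assumed field names, not confirmed by this PR; adjust them to match
    the actual run summary schema.
    """
    summary = json.loads(summary_path.read_text())
    return [
        test
        for test in summary.get("validation_tests", [])
        if not test.get("success", True)
    ]


if __name__ == "__main__":
    for test in failed_validations(Path("eia860_run_summary.json")):
        print(test.get("name"), "->", test.get("notes"))
```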
3 changes: 2 additions & 1 deletion .github/workflows/run-archiver.yml
@@ -64,7 +64,7 @@ jobs:
ZENODO_TOKEN_UPLOAD: ${{ secrets.ZENODO_TOKEN_UPLOAD }}
ZENODO_TOKEN_PUBLISH: ${{ secrets.ZENODO_TOKEN_PUBLISH }}
run: |
pudl_archiver --datasets ${{ matrix.dataset }} --summary-file ${{ matrix.dataset }}_run_summary.json
pudl_archiver --datasets ${{ matrix.dataset }} --summary-file ${{ matrix.dataset }}_run_summary.json --clobber-unchanged

- name: Upload run summaries
if: always()
@@ -117,6 +117,7 @@ jobs:
ZENODO_TOKEN_PUBLISH: ${{ secrets.ZENODO_TOKEN_PUBLISH }}
run: |
pudl_archiver --datasets ${{ matrix.dataset }} --summary-file ${{ matrix.dataset }}_run_summary.json
# Here we don't clobber the draft if it's unchanged, as it's far more labor intensive to recreate if needed!

- name: Upload run summaries
if: failure() || success()
2 changes: 1 addition & 1 deletion src/pudl_archiver/archivers/ferc/xbrl.py
@@ -249,7 +249,7 @@ async def archive_taxonomies(
taxonomy_versions = []
archive_path = output_dir / f"ferc{form.as_int()}-xbrl-taxonomies.zip"
with zipfile.ZipFile(archive_path, "w", compression=ZIP_DEFLATED) as archive:
for taxonomy_entry_point in taxonomies_referenced:
for taxonomy_entry_point in sorted(taxonomies_referenced):
logger.info(f"Archiving {taxonomy_entry_point}.")

# Use Arelle to parse taxonomy
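The only functional change in this hunk is iterating over `sorted(taxonomies_referenced)`, which makes the order of members written into the taxonomy zip deterministic across runs instead of depending on set iteration order. A small illustration (the entry-point URLs below are made up):

```python
# Illustrative only: these entry-point URLs are invented. The point is that
# sorted() turns an unordered set into a stable, lexicographic sequence, so
# repeated archiver runs write zip members in the same order.
taxonomies_referenced = {
    "https://example.com/form1/2023-01-01/form-1_2023-01-01.xsd",
    "https://example.com/form1/2022-01-01/form-1_2022-01-01.xsd",
}
for taxonomy_entry_point in sorted(taxonomies_referenced):
    print(taxonomy_entry_point)  # always prints the 2022 entry point first
```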
5 changes: 5 additions & 0 deletions src/pudl_archiver/cli.py
@@ -47,6 +47,11 @@ def parse_main(args=None):
action="store_true",
help="Initialize new deposition by preserving a DOI",
)
parser.add_argument(
"--clobber-unchanged",
action="store_true",
help="Delete draft deposition if unchanged.",
)
parser.add_argument(
"--summary-file",
type=Path,
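For reference, `action="store_true"` flags like the new `--clobber-unchanged` default to `False` and only flip to `True` when the flag is passed, which is what lets the monthly workflow simply omit it. A self-contained sketch (the parser below is a stand-in, not the archiver's real `parse_main`):

```python
# Stand-in parser to show the store_true behaviour; the real flag lives in
# pudl_archiver's parse_main(), shown in the diff above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--clobber-unchanged",
    action="store_true",
    help="Delete draft deposition if unchanged.",
)

assert parser.parse_args(["--clobber-unchanged"]).clobber_unchanged is True
assert parser.parse_args([]).clobber_unchanged is False
```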
23 changes: 18 additions & 5 deletions src/pudl_archiver/depositors/depositor.py
@@ -231,6 +231,11 @@ async def cleanup_after_error(self, e: Exception):
"""Cleanup draft after an error during an archive run."""
...

@abstractmethod
async def delete_deposition(self):
"""Delete deposition if no changes found."""
...

@abstractmethod
def generate_datapackage(
self, resource_info: dict[str, ResourceInfo]
@@ -246,7 +251,11 @@ async def add_resource(
return await self._apply_change(change)

async def publish_if_valid(
self, run_summary: RunSummary, datapackage_updated: bool, auto_publish: bool
self,
run_summary: RunSummary,
datapackage_updated: bool,
clobber_unchanged: bool,
auto_publish: bool,
) -> PublishedDeposition | None:
"""Check that deposition is valid and worth changing, then publish if so."""
if not run_summary.success:
@@ -256,10 +265,14 @@
)
return run_summary
if len(run_summary.file_changes) == 0 and not datapackage_updated:
logger.info(
"No changes detected, kept draft at "
f"{self.get_deposition_link()} for inspection."
)
if clobber_unchanged:
await self.delete_deposition()
logger.info("No changes detected, deleted draft.")
else:
logger.info(
"No changes detected, kept draft at "
f"{self.get_deposition_link()} for inspection."
)
return None
if not auto_publish:
logger.info(
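Because `delete_deposition` is declared on the abstract depositor interface, every registered backend now has to provide it, not just Zenodo. A hypothetical sketch of what that might look like for some other backend; the class name, attribute, and cleanup call are all invented for illustration:

```python
# Hypothetical, illustration only: "LocalStagingDraft" and "_draft_dir" are
# invented names; a real backend would subclass the draft depositor
# interface shown above.
import shutil
from pathlib import Path


class LocalStagingDraft:
    _draft_dir: Path

    async def delete_deposition(self) -> None:
        """Delete deposition if no changes found."""
        # For a filesystem-style backend, dropping the draft just means
        # removing the staging directory that holds the unpublished files.
        shutil.rmtree(self._draft_dir, ignore_errors=True)
```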
31 changes: 31 additions & 0 deletions src/pudl_archiver/depositors/zenodo/depositor.py
@@ -234,6 +234,33 @@ async def delete_file(

return await self.get_deposition_by_id(deposition.id_)

async def delete_deposition(self, deposition: Deposition) -> None:
"""Delete an un-submitted deposition.

As of 2023-11-22, Zenodo API times out on first few deletion attempts,
occasionally 500s, and then 404s once the delete has actually gone
through.

Args:
deposition: the deposition you want to delete.
"""
try:
await self._request(
"DELETE",
deposition.links.self,
log_label="Deleting deposition",
headers=self.auth_write,
parse_json=False,
retry_count=5,
)
except ZenodoClientError as e:
if e.status != 404:
raise e
logger.info(
f"404 Not Found when deleting {deposition.links.self}, assume "
"earlier delete succeeded."
)

async def publish(self, deposition: Deposition) -> Deposition:
"""Publish draft deposition and return new depositor with updated deposition."""
url = deposition.links.publish
@@ -709,6 +736,10 @@ async def cleanup_after_error(self, e: Exception):
f"Failed while creating new deposition: {traceback.print_exception(e)}"
)

async def delete_deposition(self) -> None:
"""Delete an un-submitted deposition."""
return await self.api_client.delete_deposition(self.deposition)


register_depositor(
"zenodo", ZenodoAPIClient, ZenodoPublishedDeposition, ZenodoDraftDeposition
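The docstring on `delete_deposition` explains why the call is made with retries and why a 404 is swallowed: Zenodo tends to time out or return 500s on the first attempts and then 404 once the record is actually gone. A stand-alone sketch of that same pattern using plain `aiohttp`, not the project's `_request` helper, with arbitrary retry and backoff numbers:

```python
# Illustration of "retry the DELETE, treat a late 404 as success". This is
# not the archiver's _request helper; retry counts and backoff are arbitrary.
import asyncio

import aiohttp


async def delete_with_tolerance(url: str, token: str, retries: int = 5) -> None:
    headers = {"Authorization": f"Bearer {token}"}
    async with aiohttp.ClientSession() as session:
        for attempt in range(retries):
            try:
                async with session.delete(url, headers=headers) as resp:
                    if resp.status == 404:
                        # Record already gone: an earlier attempt succeeded.
                        return
                    resp.raise_for_status()
                    return
            except (aiohttp.ClientError, asyncio.TimeoutError):
                if attempt == retries - 1:
                    raise
                await asyncio.sleep(2**attempt)
```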
5 changes: 4 additions & 1 deletion src/pudl_archiver/orchestrator.py
@@ -49,6 +49,9 @@ async def orchestrate_run(
draft.get_deposition_link(),
)
published = await draft.publish_if_valid(
summary, datapackage_updated, run_settings.auto_publish
summary,
datapackage_updated,
run_settings.clobber_unchanged,
run_settings.auto_publish,
)
return summary, published
2 changes: 1 addition & 1 deletion src/pudl_archiver/package_data/zenodo_doi.yaml
@@ -26,7 +26,7 @@ eia930:
production_doi: 10.5281/zenodo.10840077
sandbox_doi: 10.5072/zenodo.38409
eiaaeo:
production_doi: 10.5281/zenodo.10838487
production_doi: 10.5281/zenodo.10838488
sandbox_doi: 10.5072/zenodo.37746
eia_bulk_elec:
production_doi: 10.5281/zenodo.7067366
1 change: 1 addition & 0 deletions src/pudl_archiver/utils.py
@@ -127,6 +127,7 @@ class RunSettings(BaseModel):
only_years: list[int] | None = []
summary_file: Path | None = None
download_dir: str | None = None
clobber_unchanged: bool = False
auto_publish: bool = False
refresh_metadata: bool = False
resume_run: bool = False
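With the new `RunSettings` field, `clobber_unchanged` defaults to `False`, so the monthly workflow keeps its current behavior and only the early-release job opts in. A minimal sketch using the fields visible in this diff; other `RunSettings` fields not shown here may be required in practice:

```python
# Sketch only: fields other than those visible in this diff may be required.
from pudl_archiver.utils import RunSettings

nightly = RunSettings(clobber_unchanged=True, auto_publish=False)
monthly = RunSettings()  # clobber_unchanged defaults to False

assert nightly.clobber_unchanged is True
assert monthly.clobber_unchanged is False
```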
Empty file.
8 changes: 7 additions & 1 deletion tests/integration/zenodo_test.py
@@ -146,6 +146,7 @@ async def test_zenodo_workflow(
deposition_metadata: DepositionMetadata,
datasource: DataSource,
mocker,
caplog,
):
"""Test the entire zenodo client workflow."""
# Mock settings path
@@ -157,6 +158,7 @@

settings = RunSettings(
sandbox=True,
clobber_unchanged=True,
auto_publish=False,
refresh_metadata=False,
initialize=True,
@@ -250,13 +252,14 @@ async def identity(x):
)

# Update files
settings.initialize = False

v2_resources = {
file_data["path"].name: ResourceInfo(
local_path=file_data["path"], partitions={}
)
for file_data in test_files["updated"]
}
settings.initialize = False

# Should fail due to deleted file
downloader = TestDownloader(v2_resources, session=session)
@@ -306,6 +309,7 @@ async def identity(x):
session=session,
)
assert len(v4_summary.file_changes) == 0
assert caplog.records[-1].msg == "No changes detected, deleted draft."

# legacy Zenodo API "get latest for concept DOI" endpoint is very slow to update,
# but requesting the DOI directly updates quickly.
@@ -314,3 +318,5 @@
timeout=10.0,
)
assert str(v3_refreshed.deposition.id_) in res.text
# Assert last draft actually deleted, getting DOI from end of record URL
assert str(v4_summary.record_url).split("/")[-1] not in res.text
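
The new log assertion relies on pytest's built-in `caplog` fixture, which captures every `logging` record emitted during the test; `record.msg` holds the unformatted message, which is why the exact string logged by `publish_if_valid` can be compared. A tiny stand-alone example of the same idiom:

```python
# Stand-alone illustration of the caplog idiom used in the test above.
import logging

logger = logging.getLogger(__name__)


def clean_up_draft(clobber: bool) -> None:
    if clobber:
        logger.info("No changes detected, deleted draft.")


def test_clean_up_logs_deletion(caplog):
    with caplog.at_level(logging.INFO):
        clean_up_draft(clobber=True)
    # .msg is the raw (unformatted) message string for the last record.
    assert caplog.records[-1].msg == "No changes detected, deleted draft."
```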