transform: files upload prototype

kpsherva · kpsherva · commit 4eba27b3efbb · 2024-07-25T16:30:54.000+02:00
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -50,6 +50,7 @@ recursive-include examples *.sh
 recursive-include cds_migrator_kit *.gitkeep
 recursive-include cds_migrator_kit *.md
 recursive-include cds_migrator_kit *.yaml
+recursive-include scripts *.py
 exclude scripts/migrator.sh
 
 
diff --git a/cds_migrator_kit/migration_config.py b/cds_migrator_kit/migration_config.py
@@ -355,3 +355,4 @@ def _(x):  # needed to avoid start time failure with lazy strings
 base_path = os.path.dirname(os.path.realpath(__file__))
 logs_dir = os.path.join(base_path, "tmp/logs/")
 CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
+INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/migration/streams.yaml"
diff --git a/cds_migrator_kit/rdm/migration/README.md b/cds_migrator_kit/rdm/migration/README.md
@@ -135,8 +135,32 @@ current_rdm_records_service.indexer.bulk_index((rec.id for rec in records))
 ```
 
 
-### To visualise the errors:
+### To visualise the errors (locally):
 
 ```shell
 gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
 ```
+
+
+
+
+### Full migration workflow of one collection
+
+#### Legacy
+
+```shell
+ssh cds-wn-31 # inveniomigrator tool installed here
+kinit cdsrdmeosdev
+cd /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/dump
+inveniomigrator dump records -q '980__:NOTE 037__:CERN-STUDENTS-Note-* -980:DELETED' --file-prefix summer-studends-notes --latest-only --chunk-size=1000
+python copy_collection_files.py --dump-folder /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/dump --files-destination /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/files
+```
+
+
+#### Openshift migration pod
+
+```shell
+invenio migration run
+```
+
+visit https://migration-cds-rdm-dev.app.cern.ch for report
diff --git a/cds_migrator_kit/rdm/migration/cli.py b/cds_migrator_kit/rdm/migration/cli.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 
 import click
+from flask import current_app
 from flask.cli import with_appcontext
 
 from cds_migrator_kit.rdm.migration.runner import Runner
@@ -26,8 +27,9 @@ def migration():
 @with_appcontext
 def run():
     """Run."""
+    stream_config = current_app.config["INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG"]
     runner = Runner(
         stream_definitions=[RecordStreamDefinition],
-        config_filepath=Path("cds_migrator_kit/rdm/migration/streams.yaml").absolute(),
+        config_filepath=Path(stream_config).absolute(),
     )
     runner.run()
diff --git a/cds_migrator_kit/rdm/migration/load.py b/cds_migrator_kit/rdm/migration/load.py
@@ -7,11 +7,19 @@
 
 """CDS-RDM migration load module."""
 
+import os
+
 from invenio_access.permissions import system_identity
 from invenio_rdm_migrator.load.base import Load
 from invenio_rdm_records.proxies import current_rdm_records_service
 
 
+def import_legacy_files(filepath):
+    """Download file from legacy."""
+    filestream = open(filepath, "rb")
+    return filestream
+
+
 class CDSRecordServiceLoad(Load):
     """CDSRecordServiceLoad."""
 
@@ -29,10 +37,31 @@ def _prepare(self, entry):
 
     def _load(self, entry):
         """Use the services to load the entries."""
-        identity = system_identity  # Should we create an idenity for the migration?
+        identity = system_identity  # Should we create an identity for the migration?
         draft = current_rdm_records_service.create(identity, entry["record"]["json"])
+        draft_files = entry["draft_files"]
+
+        for file in draft_files:
+            current_rdm_records_service.draft_files.init_files(
+                identity,
+                draft.id,
+                data=[
+                    {"key": file["key"], "metadata": file["metadata"],
+                     "access": {"hidden": False}}],
+            )
+            current_rdm_records_service.draft_files.set_file_content(
+                identity, draft.id, file["key"],
+                import_legacy_files(file["eos_tmp_path"])
+            )
+            result = current_rdm_records_service.draft_files.commit_file(identity,
+                                                                         draft.id,
+                                                                         file["key"])
+            legacy_checksum = f"md5:{file['checksum']}"
+            new_checksum = result.to_dict()["checksum"]
+            assert legacy_checksum == new_checksum
         current_rdm_records_service.publish(system_identity, draft["id"])
 
+
     def _cleanup(self, *args, **kwargs):
         """Cleanup the entries."""
         pass
diff --git a/cds_migrator_kit/rdm/migration/streams.yaml b/cds_migrator_kit/rdm/migration/streams.yaml
@@ -8,3 +8,5 @@ new_secret_key: CHANGE_ME
 records:
   extract:
     dirpath: cds_migrator_kit/rdm/migration/data/summer_student_reports
+  transform:
+    files_dump_dir: cds_migrator_kit/rdm/migration/data/files/
diff --git a/cds_migrator_kit/rdm/migration/transform/transform.py b/cds_migrator_kit/rdm/migration/transform/transform.py
@@ -9,6 +9,7 @@
 
 import datetime
 import logging
+from pathlib import Path
 
 from invenio_rdm_migrator.streams.records.transform import (
     RDMRecordEntry,
@@ -84,7 +85,8 @@ def _pids(self, json_entry):
 
     def _files(self, record_dump):
         """Transform the files of a record."""
-        files = record_dump.prepare_files()
+        record_dump.prepare_files()
+        files = record_dump.files
         return {"enabled": True if files else False}
 
     def _communities(self, json_entry):
@@ -158,6 +160,11 @@ def transform(self, entry):
 class CDSToRDMRecordTransform(RDMRecordTransform):
     """CDSToRDMRecordTransform."""
 
+    def __init__(self, workers=None, throw=False, files_dump_dir=None):
+        """Constructor."""
+        self.files_dump_dir = Path(files_dump_dir).absolute().as_posix()
+        super().__init__(workers, throw)
+
     def _community_id(self, entry, record):
         communities = record.get("communities")
         if communities:
@@ -201,26 +208,64 @@ def _transform(self, entry):
         }
 
     def _record(self, entry):
+        # could be in draft as well, depends on how we decide to publish
         return CDSToRDMRecordEntry().transform(entry)
 
     def _draft(self, entry):
         return None
 
     def _draft_files(self, entry):
-        return None
+        """Point to temporary eos storage to import files from."""
+        _files = entry["files"]
+        draft_files = []
+        legacy_path_root = Path("/opt/cdsweb/var/data/files/")
+        tmp_eos_root = Path(self.files_dump_dir)
+
+        for file in _files:
+            full_path = Path(file["full_path"])
+            draft_files.append({
+                "eos_tmp_path": tmp_eos_root / full_path.relative_to(legacy_path_root),
+                "key": file["full_name"],
+                "metadata": {},
+                "mimetype": file["mime"],
+                "checksum": file["checksum"]
+            })
+        return draft_files
 
     def _record_files(self, entry, record):
-        # files = entry["json"].get("_files", [])
-        # return [
-        #     {
-        #         "key": f["key"],
-        #         "object_version": {
-        #             "file": {
-        #                 "size": f["size"],
-        #                 "checksum": f["checksum"],
-        #             },
-        #         },
-        #     }
-        #     for f in files
-        # ]
+        """Record files entries transform."""
+        # TO implement if we decide not to go via draft publish
         return []
+
+    #
+    #
+    # "files": [
+    #   {
+    #     "comment": null,
+    #     "status": "firerole: allow group \"council-full [CERN]\"\ndeny until \"1996-02-01\"\nallow all",
+    #     "version": 1,
+    #     "encoding": null,
+    #     "creation_date": "2009-11-03T12:29:06+00:00",
+    #     "bibdocid": 502379,
+    #     "mime": "application/pdf",
+    #     "full_name": "CM-P00080632-e.pdf",
+    #     "superformat": ".pdf",
+    #     "recids_doctype": [[32097, "Main", "CM-P00080632-e.pdf"]],
+    #     "path": "/opt/cdsweb/var/data/files/g50/502379/CM-P00080632-e.pdf;1",
+    #     "size": 5033532,
+    #     "license": {},
+    #     "modification_date": "2009-11-03T12:29:06+00:00",
+    #     "copyright": {},
+    #     "url": "http://cds.cern.ch/record/32097/files/CM-P00080632-e.pdf",
+    #     "checksum": "ed797ce5d024dcff0040db79c3396da9",
+    #     "description": "English",
+    #     "format": ".pdf",
+    #     "name": "CM-P00080632-e",
+    #     "subformat": "",
+    #     "etag": "\"502379.pdf1\"",
+    #     "recid": 32097,
+    #     "flags": [],
+    #     "hidden": false,
+    #     "type": "Main",
+    #     "full_path": "/opt/cdsweb/var/data/files/g50/502379/CM-P00080632-e.pdf;1"
+    #   },]
diff --git a/scripts/copy_collection_files.py b/scripts/copy_collection_files.py
@@ -0,0 +1,52 @@
+import argparse
+import json
+import os
+import shutil
+
+
+def copy_collection_file(dump_files, destination_prefix, working_dir):
+    file_log = open(os.path.join(working_dir, "files.log"), "w")
+
+    for dump_file in dump_files:
+        with open(os.path.join(working_dir, dump_file), "r") as json_dump:
+            data = json.load(json_dump)
+            for record in data:
+                legacy_record_files = record["files"]
+                for legacy_record_file in legacy_record_files:
+                    full_path = legacy_record_file["full_path"]
+                    # important: last slash
+                    path_to_replace = "/opt/cdsweb/var/data/files/"
+
+                    rel_path = full_path.replace(path_to_replace, "")
+                    destination_path = os.path.join(destination_prefix, rel_path)
+                    parent_dest_path = os.path.dirname(destination_path)
+                    if not os.path.exists(parent_dest_path):
+                        os.makedirs(parent_dest_path)
+                    shutil.copy(full_path, destination_path)
+                    file_log.writelines(
+                        [f"RECID: {record['recid']},"
+                         f" bibdocid: {legacy_record_file['bibdocid']}"
+                         f" file: {legacy_record_file['full_name']},"
+                         f" destination: {destination_path}"])
+    file_log.close()
+
+
+def get_dump_files_paths(working_dir):
+    dump_files = []
+    # get all dump files in the folder
+    for (root, dirs, files) in os.walk(working_dir, topdown=True):
+        dump_files += [os.path.join(root, filename) for filename in files]
+    return dump_files
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Copy files over script')
+    parser.add_argument('--dump-folder', metavar='path', required=True,
+                        help='the path to dump folder')
+    parser.add_argument('--files-destination', metavar='path', required=True,
+                        help='path to destination folder on EOS')
+    args = parser.parse_args()
+
+    dump_folder = args.dump_folder
+
+    collection_dump_file_list = get_dump_files_paths(dump_folder)