Skip to content

Commit 4eba27b

Browse files
committed
transform: files upload prototype
1 parent 980c32e commit 4eba27b

File tree

8 files changed

+174
-18
lines changed

8 files changed

+174
-18
lines changed

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ recursive-include examples *.sh
5050
recursive-include cds_migrator_kit *.gitkeep
5151
recursive-include cds_migrator_kit *.md
5252
recursive-include cds_migrator_kit *.yaml
53+
recursive-include scripts *.py
5354
exclude scripts/migrator.sh
5455

5556

cds_migrator_kit/migration_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,3 +355,4 @@ def _(x): # needed to avoid start time failure with lazy strings
355355
base_path = os.path.dirname(os.path.realpath(__file__))
356356
logs_dir = os.path.join(base_path, "tmp/logs/")
357357
CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
358+
INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/migration/streams.yaml"

cds_migrator_kit/rdm/migration/README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,32 @@ current_rdm_records_service.indexer.bulk_index((rec.id for rec in records))
135135
```
136136

137137

138-
### To visualise the errors:
138+
### To visualise the errors (locally):
139139

140140
```shell
141141
gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
142142
```
143+
144+
145+
146+
147+
### Full migration workflow of one collection
148+
149+
#### Legacy
150+
151+
```shell
152+
ssh cds-wn-31 # inveniomigrator tool installed here
153+
kinit cdsrdmeosdev
154+
cd /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/dump
155+
inveniomigrator dump records -q '980__:NOTE 037__:CERN-STUDENTS-Note-* -980:DELETED' --file-prefix summer-studends-notes --latest-only --chunk-size=1000
156+
python copy_collection_files.py --dump-folder /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/dump --files-destination /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/files
157+
```
158+
159+
160+
#### Openshift migration pod
161+
162+
```shell
163+
invenio migration run
164+
```
165+
166+
visit https://migration-cds-rdm-dev.app.cern.ch for report

cds_migrator_kit/rdm/migration/cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pathlib import Path
1111

1212
import click
13+
from flask import current_app
1314
from flask.cli import with_appcontext
1415

1516
from cds_migrator_kit.rdm.migration.runner import Runner
@@ -26,8 +27,9 @@ def migration():
2627
@with_appcontext
2728
def run():
2829
"""Run."""
30+
stream_config = current_app.config["INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG"]
2931
runner = Runner(
3032
stream_definitions=[RecordStreamDefinition],
31-
config_filepath=Path("cds_migrator_kit/rdm/migration/streams.yaml").absolute(),
33+
config_filepath=Path(stream_config).absolute(),
3234
)
3335
runner.run()

cds_migrator_kit/rdm/migration/load.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,19 @@
77

88
"""CDS-RDM migration load module."""
99

10+
import os
11+
1012
from invenio_access.permissions import system_identity
1113
from invenio_rdm_migrator.load.base import Load
1214
from invenio_rdm_records.proxies import current_rdm_records_service
1315

1416

17+
def import_legacy_files(filepath):
18+
"""Download file from legacy."""
19+
filestream = open(filepath, "rb")
20+
return filestream
21+
22+
1523
class CDSRecordServiceLoad(Load):
1624
"""CDSRecordServiceLoad."""
1725

@@ -29,10 +37,31 @@ def _prepare(self, entry):
2937

3038
def _load(self, entry):
3139
"""Use the services to load the entries."""
32-
identity = system_identity # Should we create an idenity for the migration?
40+
identity = system_identity # Should we create an identity for the migration?
3341
draft = current_rdm_records_service.create(identity, entry["record"]["json"])
42+
draft_files = entry["draft_files"]
43+
44+
for file in draft_files:
45+
current_rdm_records_service.draft_files.init_files(
46+
identity,
47+
draft.id,
48+
data=[
49+
{"key": file["key"], "metadata": file["metadata"],
50+
"access": {"hidden": False}}],
51+
)
52+
current_rdm_records_service.draft_files.set_file_content(
53+
identity, draft.id, file["key"],
54+
import_legacy_files(file["eos_tmp_path"])
55+
)
56+
result = current_rdm_records_service.draft_files.commit_file(identity,
57+
draft.id,
58+
file["key"])
59+
legacy_checksum = f"md5:{file['checksum']}"
60+
new_checksum = result.to_dict()["checksum"]
61+
assert legacy_checksum == new_checksum
3462
current_rdm_records_service.publish(system_identity, draft["id"])
3563

64+
3665
def _cleanup(self, *args, **kwargs):
3766
"""Cleanup the entries."""
3867
pass

cds_migrator_kit/rdm/migration/streams.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@ new_secret_key: CHANGE_ME
88
records:
99
extract:
1010
dirpath: cds_migrator_kit/rdm/migration/data/summer_student_reports
11+
transform:
12+
files_dump_dir: cds_migrator_kit/rdm/migration/data/files/

cds_migrator_kit/rdm/migration/transform/transform.py

Lines changed: 60 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import datetime
1111
import logging
12+
from pathlib import Path
1213

1314
from invenio_rdm_migrator.streams.records.transform import (
1415
RDMRecordEntry,
@@ -84,7 +85,8 @@ def _pids(self, json_entry):
8485

8586
def _files(self, record_dump):
8687
"""Transform the files of a record."""
87-
files = record_dump.prepare_files()
88+
record_dump.prepare_files()
89+
files = record_dump.files
8890
return {"enabled": True if files else False}
8991

9092
def _communities(self, json_entry):
@@ -158,6 +160,11 @@ def transform(self, entry):
158160
class CDSToRDMRecordTransform(RDMRecordTransform):
159161
"""CDSToRDMRecordTransform."""
160162

163+
def __init__(self, workers=None, throw=False, files_dump_dir=None):
164+
"""Constructor."""
165+
self.files_dump_dir = Path(files_dump_dir).absolute().as_posix()
166+
super().__init__(workers, throw)
167+
161168
def _community_id(self, entry, record):
162169
communities = record.get("communities")
163170
if communities:
@@ -201,26 +208,64 @@ def _transform(self, entry):
201208
}
202209

203210
def _record(self, entry):
211+
# could be in draft as well, depends on how we decide to publish
204212
return CDSToRDMRecordEntry().transform(entry)
205213

206214
def _draft(self, entry):
207215
return None
208216

209217
def _draft_files(self, entry):
210-
return None
218+
"""Point to temporary eos storage to import files from."""
219+
_files = entry["files"]
220+
draft_files = []
221+
legacy_path_root = Path("/opt/cdsweb/var/data/files/")
222+
tmp_eos_root = Path(self.files_dump_dir)
223+
224+
for file in _files:
225+
full_path = Path(file["full_path"])
226+
draft_files.append({
227+
"eos_tmp_path": tmp_eos_root / full_path.relative_to(legacy_path_root),
228+
"key": file["full_name"],
229+
"metadata": {},
230+
"mimetype": file["mime"],
231+
"checksum": file["checksum"]
232+
})
233+
return draft_files
211234

212235
def _record_files(self, entry, record):
213-
# files = entry["json"].get("_files", [])
214-
# return [
215-
# {
216-
# "key": f["key"],
217-
# "object_version": {
218-
# "file": {
219-
# "size": f["size"],
220-
# "checksum": f["checksum"],
221-
# },
222-
# },
223-
# }
224-
# for f in files
225-
# ]
236+
"""Record files entries transform."""
237+
# TO implement if we decide not to go via draft publish
226238
return []
239+
240+
#
241+
#
242+
# "files": [
243+
# {
244+
# "comment": null,
245+
# "status": "firerole: allow group \"council-full [CERN]\"\ndeny until \"1996-02-01\"\nallow all",
246+
# "version": 1,
247+
# "encoding": null,
248+
# "creation_date": "2009-11-03T12:29:06+00:00",
249+
# "bibdocid": 502379,
250+
# "mime": "application/pdf",
251+
# "full_name": "CM-P00080632-e.pdf",
252+
# "superformat": ".pdf",
253+
# "recids_doctype": [[32097, "Main", "CM-P00080632-e.pdf"]],
254+
# "path": "/opt/cdsweb/var/data/files/g50/502379/CM-P00080632-e.pdf;1",
255+
# "size": 5033532,
256+
# "license": {},
257+
# "modification_date": "2009-11-03T12:29:06+00:00",
258+
# "copyright": {},
259+
# "url": "http://cds.cern.ch/record/32097/files/CM-P00080632-e.pdf",
260+
# "checksum": "ed797ce5d024dcff0040db79c3396da9",
261+
# "description": "English",
262+
# "format": ".pdf",
263+
# "name": "CM-P00080632-e",
264+
# "subformat": "",
265+
# "etag": "\"502379.pdf1\"",
266+
# "recid": 32097,
267+
# "flags": [],
268+
# "hidden": false,
269+
# "type": "Main",
270+
# "full_path": "/opt/cdsweb/var/data/files/g50/502379/CM-P00080632-e.pdf;1"
271+
# },]

scripts/copy_collection_files.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import argparse
2+
import json
3+
import os
4+
import shutil
5+
6+
7+
def copy_collection_file(dump_files, destination_prefix, working_dir):
8+
file_log = open(os.path.join(working_dir, "files.log"), "w")
9+
10+
for dump_file in dump_files:
11+
with open(os.path.join(working_dir, dump_file), "r") as json_dump:
12+
data = json.load(json_dump)
13+
for record in data:
14+
legacy_record_files = record["files"]
15+
for legacy_record_file in legacy_record_files:
16+
full_path = legacy_record_file["full_path"]
17+
# important: last slash
18+
path_to_replace = "/opt/cdsweb/var/data/files/"
19+
20+
rel_path = full_path.replace(path_to_replace, "")
21+
destination_path = os.path.join(destination_prefix, rel_path)
22+
parent_dest_path = os.path.dirname(destination_path)
23+
if not os.path.exists(parent_dest_path):
24+
os.makedirs(parent_dest_path)
25+
shutil.copy(full_path, destination_path)
26+
file_log.writelines(
27+
[f"RECID: {record['recid']},"
28+
f" bibdocid: {legacy_record_file['bibdocid']}"
29+
f" file: {legacy_record_file['full_name']},"
30+
f" destination: {destination_path}"])
31+
file_log.close()
32+
33+
34+
def get_dump_files_paths(working_dir):
35+
dump_files = []
36+
# get all dump files in the folder
37+
for (root, dirs, files) in os.walk(working_dir, topdown=True):
38+
dump_files += [os.path.join(root, filename) for filename in files]
39+
return dump_files
40+
41+
42+
if __name__ == "__main__":
43+
parser = argparse.ArgumentParser(description='Copy files over script')
44+
parser.add_argument('--dump-folder', metavar='path', required=True,
45+
help='the path to dump folder')
46+
parser.add_argument('--files-destination', metavar='path', required=True,
47+
help='path to destination folder on EOS')
48+
args = parser.parse_args()
49+
50+
dump_folder = args.dump_folder
51+
52+
collection_dump_file_list = get_dump_files_paths(dump_folder)

0 commit comments

Comments
 (0)