Skip to content

Commit ae29c5b

Browse files
committed
scripts: fix datacite export script
1 parent 7990447 commit ae29c5b

File tree

2 files changed

+68
-55
lines changed

2 files changed

+68
-55
lines changed

scripts/export-datacite.py

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import tarfile
2+
from datetime import datetime
3+
from io import BytesIO
4+
from pathlib import Path
5+
6+
from flask import current_app
7+
from flask_principal import identity_changed
8+
from invenio_access.permissions import any_user, authenticated_user
9+
from invenio_access.utils import get_identity
10+
from invenio_accounts.proxies import current_datastore
11+
from invenio_app.factory import create_api
12+
from invenio_rdm_records.oai import oai_datacite_etree
13+
from invenio_rdm_records.proxies import current_rdm_records_service as service
14+
from lxml import etree
15+
16+
# ID or email of the user whose identity is used to scan the records.
# Placeholder value: change it before running the script.
USER_ID = "CHANGEME"
17+
# Directory that receives the export archive and the failed-records list;
# it is created on demand by export_datacite().
EXPORT_DIR = Path("/tmp/export")
18+
19+
20+
def identity_for(id_or_email):
    """Return an identity for the user matching ``id_or_email``.

    The user is looked up through ``current_datastore``, an identity is
    built with ``get_identity`` and announced via ``identity_changed``
    inside a test request context. The ``authenticated_user`` and
    ``any_user`` needs are then added explicitly.

    :param id_or_email: User ID or email passed to the datastore lookup.
    :returns: The identity, including the two manually added needs.
    :raises LookupError: If the datastore lookup yields no user.
    """
    user = current_datastore.get_user(id_or_email)
    # NOTE(review): assumes get_user() returns None when nothing matches --
    # confirm against the datastore in use. Failing here gives a clear
    # message instead of an opaque error from inside get_identity().
    if user is None:
        raise LookupError(
            f"No user found for {id_or_email!r}; "
            "pass an existing user ID or email."
        )

    idty = get_identity(user)
    with current_app.test_request_context():
        identity_changed.send(current_app, identity=idty)
        # Needs to be added manually
        idty.provides.add(authenticated_user)
        idty.provides.add(any_user)
    return idty
28+
29+
30+
def export_datacite():
    """Export all records as DataCite XML into a gzipped tar archive.

    Scans the records service (with ``allversions`` enabled) using the
    identity of ``USER_ID``, serializes every hit with
    ``oai_datacite_etree`` and stores the result as ``<record id>.xml`` in
    ``EXPORT_DIR/zenodo-<ISO timestamp>.tar.gz``.

    Hits without an ``id`` are skipped. IDs of records whose serialization
    raises are written, one per line, to ``EXPORT_DIR/failed.txt`` (the file
    is truncated on every run) and the export continues with the next
    record. Errors while writing to the archive itself are not swallowed.
    """
    # Change USER_ID to the user ID or email you want to use
    idty = identity_for(USER_ID)

    started = datetime.today()
    EXPORT_DIR.mkdir(exist_ok=True, parents=True)
    # Kept as a plain str on purpose.
    # NOTE(review): tarfile's stream mode ("w|gz") may not accept Path
    # objects on every Python version -- confirm before switching.
    tar_path = f"{EXPORT_DIR}/zenodo-{started.isoformat()}.tar.gz"
    failed_path = EXPORT_DIR / "failed.txt"
    # A fresh TarInfo has mtime 0 (the epoch); stamp every member with the
    # export start time instead.
    member_mtime = int(started.timestamp())

    with tarfile.open(tar_path, "w|gz") as tar, open(
        failed_path, "w", encoding="utf-8"
    ) as failed:
        res = service.scan(idty, params={"allversions": True})
        for idx, record in enumerate(res.hits):
            # Progress output every 1000 hits.
            if idx % 1000 == 0:
                print(datetime.now().isoformat(), idx)
            record_id = record.get("id")
            if not record_id:
                continue

            # Only serialization is best-effort: a single bad record must
            # not abort the whole export.
            try:
                oai_etree = oai_datacite_etree(None, {"_source": record})
                xml_bytes = etree.tostring(
                    oai_etree,
                    xml_declaration=True,
                    encoding="UTF-8",
                )
            except Exception as e:
                print(f"Error serializing {record_id}: {e}")
                failed.write(f"{record_id}\n")
                continue

            # Archive writes stay outside the try block so that an I/O
            # failure is not misreported as a serialization error while the
            # loop keeps writing into a broken stream.
            tar_info = tarfile.TarInfo(f"{record_id}.xml")
            tar_info.size = len(xml_bytes)
            tar_info.mtime = member_mtime
            tar.addfile(tar_info, fileobj=BytesIO(xml_bytes))
64+
65+
66+
if __name__ == "__main__":
    # Build the API application and run the export inside its app context.
    api_app = create_api()
    with api_app.app_context():
        export_datacite()

scripts/export_datacite.py

-55
This file was deleted.

0 commit comments

Comments
 (0)