Skip to content

Commit 8da85eb

Browse files
committed
identifiers: drop proquest
subjects: differentiate euproject subjects, freetext and controlled vocab; identifiers: add ISSN; identifiers: handle different variations of HDL; internal_notes: process internal notes; affiliations: validate affiliations in 901 field; collection: validate collections, allow EU project thesis
1 parent 01a1f47 commit 8da85eb

File tree

7 files changed

+234
-67
lines changed

7 files changed

+234
-67
lines changed

cds_migrator_kit/rdm/data/thesis/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,21 @@ invenio rdm-records custom-fields init
1515
1. Run affiliations
1616
2. Run users
1717
3. Run duplicates (and 981__b) mergers
18+
3.1. add duplicated_pids.json file
1819
4. Identify UDC records
1920
5. Identify records with relations (2)
2021
6. Identify records with comments, migrate comments
22+
23+
24+
next deployment
25+
26+
change branch installed in migrator-kit from feature to master
27+
28+
1. on worker pod
29+
2.
30+
invenio rdm-records add-to-fixture programmes
31+
invenio rdm-records add-to-fixture awards
32+
33+
2. both on migration and worker pod
34+
invenio rdm-records custom-fields init
35+
invenio communities custom-fields init

cds_migrator_kit/rdm/records/load/load.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,10 +228,9 @@ def _pre_publish(self, identity, entry, version, draft):
228228
)
229229
if draft.errors:
230230
raise ManualImportRequired(
231-
message=str(draft.errors),
231+
message=f"{str(draft.errors)}: {str(entry['record']['json'])}",
232232
field="validation",
233233
stage="load",
234-
description="Draft has errors",
235234
recid=entry["record"]["recid"],
236235
priority="warning",
237236
value=draft._record.pid.pid_value,

cds_migrator_kit/rdm/records/transform/config.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
PIDS_SCHEMES_ALLOWED = ["DOI"]
1414

1515
# stores the identifiers found in PIDs field in the alternative identifiers instead
16-
PID_SCHEMES_TO_STORE_IN_IDENTIFIERS = ["ARXIV", "HDL", "HANDLE", "URN", "INIS"]
16+
PID_SCHEMES_TO_STORE_IN_IDENTIFIERS = ["ARXIV", "HDL", "HANDLE", "URN", "INIS", "CERCER"]
1717

18-
IDENTIFIERS_SCHEMES_TO_DROP = ["SPIRES", "HAL", "OSTI", "SLAC", ]
18+
IDENTIFIERS_SCHEMES_TO_DROP = ["SPIRES", "HAL", "OSTI", "SLAC", "PROQUEST"]
1919
IDENTIFIERS_VALUES_TO_DROP = "oai:arXiv.org"
2020

2121
CONTROLLED_SUBJECTS_SCHEMES = ["szgecern", "cern", "cds"]
2222

23-
RECOGNISED_KEYWORD_SCHEMES = ["author", "cms", "arxiv", "inspire", "spr", "inis", "lanl eds", "in2p3"]
24-
KEYWORD_SCHEMES_TO_DROP = ["proquest"]
23+
RECOGNISED_KEYWORD_SCHEMES = ["author", "cms", "arxiv", "inspire", "spr", "inis", "lanl eds", "in2p3", "eucard", "inspec"]
24+
KEYWORD_SCHEMES_TO_DROP = ["proquest", "disxa"]
2525

26-
ALLOWED_THESIS_COLLECTIONS = ["thesis", "publcms", "book"]
26+
ALLOWED_THESIS_COLLECTIONS = ["thesis", "publcms", "book", "aida", "eucard", "eucard2", "cern"]
2727
IGNORED_THESIS_COLLECTIONS = ["cern"]
2828

2929

cds_migrator_kit/rdm/records/transform/models/thesis.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class ThesisModel(CdsOverdo):
4646
"035__m", # oai harvest tag
4747
"035__t", # oai harvest tag
4848
"035__u", # oai harvest tag
49+
"035__z", # oai harvest tag
4950
"037__c", # arxiv subject
5051
# "084__2", # SORT ID, spreadsheet, also PACS and missing report numbers, requested curation
5152
# "084__a", # SORT ID, spreadsheet
@@ -69,6 +70,7 @@ class ThesisModel(CdsOverdo):
6970
"701__m", # supervisors's email <-- decided not to keep in RDM,
7071
"720__a", # author's duplicate
7172
"773__o", # spreadsheet
73+
"773__x", # spreadsheet
7274
"8564_8", # Files system field
7375
"8564_s", # Files system field
7476
"8564_x", # Files system field
@@ -82,7 +84,7 @@ class ThesisModel(CdsOverdo):
8284
"961__x", # CDS modification tag # TODO
8385
"964__a", # spreadsheet
8486
"970__b", # spreadsheet
85-
"980__a", # collection tag
87+
# "980__a", # collection tag
8688
"981__a", # duplicated record marker # TODO -> decide how to handle these
8789
"999C50", # https://cds.cern.ch/record/2284609/export/hm?ln=en CMS contributions
8890
"999C52", # https://cds.cern.ch/record/2640188/export/hm?ln=en

cds_migrator_kit/rdm/records/transform/transform.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import arrow
1616
from cds_rdm.legacy.models import CDSMigrationAffiliationMapping
1717
from idutils import normalize_ror
18+
from idutils.validators import is_doi
1819
from invenio_access.permissions import system_identity
1920
from invenio_accounts.models import User
2021
from invenio_db import db
@@ -51,6 +52,9 @@
5152
def search_vocabulary(term, vocab_type):
5253
"""Search vocabulary utility function."""
5354
service = current_service_registry.get("vocabularies")
55+
if "/" in term:
56+
# escape the slashes
57+
term = term.replace("/", "\\/")
5458
try:
5559
vocabulary_result = service.search(
5660
system_identity, type=vocab_type, q=f"{term}"
@@ -141,7 +145,7 @@ def _pids(self, json_entry):
141145
if key.upper() in PIDS_SCHEMES_TO_DROP:
142146
del output_pids[key]
143147

144-
elif key.upper() not in PIDS_SCHEMES_ALLOWED:
148+
elif key and key.upper() not in PIDS_SCHEMES_ALLOWED:
145149
raise UnexpectedValue(
146150
field=key,
147151
subfield="2",
@@ -150,9 +154,14 @@ def _pids(self, json_entry):
150154
stage="transform",
151155
value=identifier
152156
)
157+
elif not key and is_doi(identifier):
158+
# assume it is DOI
159+
key = "DOI"
153160
if key.upper() == "DOI":
154161
doi_identifier = deepcopy(identifier)
155162
if identifier["identifier"].startswith(DATACITE_PREFIX):
163+
if not json_entry.get("publisher"):
164+
json_entry["publisher"] = "CERN"
156165
doi_identifier["provider"] = "datacite"
157166
else:
158167
doi_identifier["provider"] = "external"

0 commit comments

Comments
 (0)