Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inspire harvester #204

Open
wants to merge 2 commits into
base: prod
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion modules/bibconvert/etc/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ kb_DATA = entdec-to-latin1.kb entdec-to-utf8.kb \

xsldir = $(sysconfdir)/bibconvert/config
xsl_DATA = oaidc2marcxml.xsl oaimarc2marcxml.xsl oaiarxiv2marcxml.xsl \
oaidmf2marcxml.xsl authorlist2marcxml.xsl crossref2marcxml.xsl bibtex2marcxml.cfg
oaidmf2marcxml.xsl oaiinspire2marcxml.xsl authorlist2marcxml.xsl \
crossref2marcxml.xsl bibtex2marcxml.cfg

EXTRA_DIST = $(kb_DATA) $(xsl_DATA)

Expand Down
134 changes: 134 additions & 0 deletions modules/bibconvert/etc/oaiinspire2marcxml.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- $Id$

This file is part of Invenio.
Copyright (C) 2016 CERN.

Invenio is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.

Invenio is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with Invenio; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
-->
<!-- This transformation keeps the same source file, with the following exceptions:
- OAI envelope is removed
- Records marked with status='deleted' are returned as deleted for Invenio
- subfield 980 $w is removed
- add 'Inspire' in 035__$9

This stylesheet is provided only as an example of transformation.
Please look for 'CUSTOMIZEME' labels in this stylesheet in order to find
key parts that you should customize to fit your installation needs.

Also note that this stylesheet expects the source file to correctly refer to the
http://www.loc.gov/MARC21/slim and
http://www.openarchives.org/OAI/2.0/ namespaces
-->
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:OAI-PMH="http://www.openarchives.org/OAI/2.0/"
                xmlns:marc="http://www.loc.gov/MARC21/slim"
                xmlns:OAI-provenance="http://www.openarchives.org/OAI/2.0/provenance"
                xmlns:fn="http://cdsweb.cern.ch/bibconvert/fn"
                exclude-result-prefixes="OAI-PMH marc OAI-provenance fn">
  <xsl:output method="xml" encoding="UTF-8" />

  <!-- Entry point: turns every OAI-PMH record found anywhere in the input
       into one <record> of the output <collection>.
       * A record whose header carries status='deleted' becomes a record
         holding the provenance field (when an identifier or setSpec is
         present) plus 980 $c DELETED.
       * Any other record gets its MARC datafields copied (980 $w dropped),
         followed by the provenance field when an identifier is present.
       Only marc:datafield children are copied; other children of
       marc:record are not selected by this template. -->
  <xsl:template match="/">
    <collection>
      <xsl:for-each select="//OAI-PMH:record">
        <xsl:choose>
          <xsl:when test="./OAI-PMH:header[@status='deleted']">
            <record>
              <xsl:if test="./OAI-PMH:header/OAI-PMH:identifier | ./OAI-PMH:header/OAI-PMH:setSpec">
                <xsl:call-template name="oai-provenance-field" />
              </xsl:if>
              <!-- Indicators are a single blank, like every other datafield
                   generated by this stylesheet. -->
              <datafield tag="980" ind1=" " ind2=" ">
                <subfield code="c">DELETED</subfield>
              </datafield>
            </record>
          </xsl:when>
          <xsl:otherwise>
            <record>
              <xsl:for-each select="./OAI-PMH:metadata/marc:record/marc:datafield">
                <!-- CUSTOMIZEME: Modify below in order to choose which
                     datafield/subfield will be kept
                     and which will be dropped

                     Sample below: a datafield is copied with all of its
                                   subfields unless it is a 980__ that has
                                   subfields; such a 980__ is kept only if
                                   some subfield with code != 'w' exists,
                                   and its 980__w subfields are removed.
                -->
                <xsl:choose>
                  <xsl:when test="not(@tag='980' and (@ind1='' or @ind1=' ') and (@ind2='' or @ind2=' ') and ./marc:subfield)">
                    <xsl:call-template name="copy-datafield">
                      <xsl:with-param name="subfields" select="./marc:subfield" />
                    </xsl:call-template>
                  </xsl:when>
                  <xsl:otherwise>
                    <xsl:if test="./marc:subfield[@code!='w']">
                      <xsl:call-template name="copy-datafield">
                        <xsl:with-param name="subfields" select="./marc:subfield[@code!='w']" />
                      </xsl:call-template>
                    </xsl:if>
                  </xsl:otherwise>
                </xsl:choose>
              </xsl:for-each>
              <xsl:if test="./OAI-PMH:header/OAI-PMH:identifier">
                <xsl:call-template name="oai-provenance-field" />
              </xsl:if>
            </record>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:for-each>
    </collection>
  </xsl:template>

  <!-- Emits the OAI provenance datafield for the current OAI-PMH:record.
       Must be called with an OAI-PMH:record as the context node.
       Shared by the deleted and the regular branch so that both always
       produce the same subfields. -->
  <xsl:template name="oai-provenance-field">
    <!-- CUSTOMIZEME: Modify the datafield below with tag and indicators used
         in your Invenio installation for the OAI Provenance Field -->
    <datafield tag="035" ind1=" " ind2=" ">
      <subfield code="a"><xsl:value-of select="./OAI-PMH:header/OAI-PMH:identifier" /></subfield>
      <subfield code="u"><xsl:value-of select="//OAI-PMH:request" /></subfield>
      <subfield code="9">Inspire</subfield>
      <subfield code="d"><xsl:value-of select="./OAI-PMH:header/OAI-PMH:datestamp" /></subfield>
      <subfield code="h"><xsl:value-of select="//OAI-PMH:responseDate" /></subfield>
      <subfield code="m"><xsl:value-of select="//OAI-PMH:request/@metadataPrefix" /></subfield>
      <xsl:if test="./OAI-PMH:about/OAI-provenance:provenance/OAI-provenance:originDescription">
        <xsl:variable name="origin" select="./OAI-PMH:about/OAI-provenance:provenance/OAI-provenance:originDescription" />
        <!-- fn:escape is an extension function bound to the bibconvert
             namespace declared on the stylesheet element; it has to be
             invoked through xsl:value-of, otherwise the instruction is
             copied to the output as a literal element. -->
        <subfield code="o"><xsl:value-of select="fn:escape($origin)" /></subfield>
      </xsl:if>
      <subfield code="t">false</subfield>
    </datafield>
  </xsl:template>

  <!-- Copies the current datafield (context node) into the output without
       its source namespace, keeping all of its attributes, and copies only
       the subfields passed in $subfields (attributes and text value). -->
  <xsl:template name="copy-datafield">
    <xsl:param name="subfields" select="./marc:subfield" />
    <xsl:element name="{local-name(.)}">
      <xsl:copy-of select="@*" />
      <xsl:for-each select="$subfields">
        <xsl:element name="{local-name(.)}">
          <xsl:copy-of select="@*" />
          <xsl:value-of select="." />
        </xsl:element>
      </xsl:for-each>
    </xsl:element>
  </xsl:template>
</xsl:stylesheet>
8 changes: 7 additions & 1 deletion modules/oaiharvest/lib/oai_harvest_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,14 @@ def OAI_Session(server, script, http_param_dict , method="POST", output="",

# FIXME We should NOT use regular expressions to parse XML. This works
# for the time being to escape namespaces.
# Running a regexp over big files (for example those from Inspire) is
# extremely slow, but we know that the resumption token should be located
# at the beginning of the file, so we search only the first 10 000
# characters.
# rt_obj = re.search('<.*resumptionToken.*>(.*)</.*resumptionToken.*>',
# harvested_data, re.DOTALL)
rt_obj = re.search('<.*resumptionToken.*>(.*)</.*resumptionToken.*>',
harvested_data, re.DOTALL)
harvested_data[0:10000], re.DOTALL)
if rt_obj is not None and rt_obj.group(1) != "":
http_param_dict = http_param_resume(http_param_dict, rt_obj.group(1))
i = i + 1
Expand Down