Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update ckanext-dcat including schemaorg-functionality #1

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 91 additions & 13 deletions ckanext/dcat/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ckan.model.license import LicenseRegister
from ckan.plugins import toolkit
from ckan.lib.munge import munge_tag
from ckan.lib.helpers import url_for

from ckanext.dcat.utils import resource_uri, publisher_uri_from_dataset_dict, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS

Expand Down Expand Up @@ -43,8 +44,11 @@
'locn': LOCN,
'gsp': GSP,
'owl': OWL,
'spdx': SPDX,
}

PREFIX_MAILTO = u'mailto:'


class RDFProfile(object):
'''Base class with helper methods for implementing RDF parsing profiles
Expand Down Expand Up @@ -255,7 +259,9 @@ def _contact_details(self, subject, predicate):

contact['name'] = self._object_value(agent, VCARD.fn)

contact['email'] = self._object_value(agent, VCARD.hasEmail)
contact['email'] = self._without_mailto(
self._object_value(agent, VCARD.hasEmail)
)

return contact

Expand Down Expand Up @@ -471,15 +477,19 @@ def _add_triple_from_dict(self, _dict, subject, predicate, key,
fallbacks=None,
list_value=False,
date_value=False,
_type=Literal):
_type=Literal,
value_modifier=None):
'''
Adds a new triple to the graph with the provided parameters

The subject and predicate of the triple are passed as the relevant
RDFLib objects (URIRef or BNode). The object is always a literal value,
which is extracted from the dict using the provided key (see
`_get_dict_value`). If the value for the key is not found, then
RDFLib objects (URIRef or BNode). As default, the object is a
literal value, which is extracted from the dict using the provided key
(see `_get_dict_value`). If the value for the key is not found, then
additional fallback keys are checked.
Using `value_modifier`, a function taking the extracted value and
returning a modified value can be passed.
If a value was found, the modifier is applied before adding the value.

If `list_value` or `date_value` are True, then the value is treated as
a list or a date respectively (see `_add_list_triple` and
Expand All @@ -492,6 +502,10 @@ def _add_triple_from_dict(self, _dict, subject, predicate, key,
if value:
break

# if a modifying function was given, apply it to the value
if value and callable(value_modifier):
value = value_modifier(value)

if value and list_value:
self._add_list_triple(subject, predicate, value, _type)
elif value and date_value:
Expand Down Expand Up @@ -571,6 +585,25 @@ def _last_catalog_modification(self):
return result['results'][0]['metadata_modified']
return None

def _add_mailto(self, mail_addr):
    '''
    Returns the mail address with a leading mailto: prefix, so that it
    can be used as the value of an URIRef.

    Any mailto: already contained in the address is removed first (see
    `_without_mailto`), which keeps the prefix from being duplicated.
    Falsy values (e.g. None or an empty string) are returned unchanged.

    Can be passed as `value_modifier` to `_add_triple_from_dict`.
    '''
    if not mail_addr:
        return mail_addr
    return PREFIX_MAILTO + self._without_mailto(mail_addr)

def _without_mailto(self, mail_addr):
    '''
    Returns the mail address as unicode string without a leading
    mailto: prefix.

    Only prefixes at the very start of the string are stripped (also
    repeated ones, e.g. `mailto:mailto:`); a `mailto:` appearing
    further inside the value is left alone, so the remainder of the
    address is never altered.

    Falsy values (e.g. None or an empty string) are returned unchanged.
    '''
    if not mail_addr:
        return mail_addr

    address = unicode(mail_addr)
    # Loop instead of a single check so that an accidentally duplicated
    # prefix is removed completely.
    while address.startswith(PREFIX_MAILTO):
        address = address[len(PREFIX_MAILTO):]
    return address

def _get_source_catalog(self, dataset_ref):
'''
Returns Catalog reference that is source for this dataset.
Expand Down Expand Up @@ -957,13 +990,17 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
g.add((contact_details, RDF.type, VCARD.Organization))
g.add((dataset_ref, DCAT.contactPoint, contact_details))

items = [
('contact_name', VCARD.fn, ['maintainer', 'author'], Literal),
('contact_email', VCARD.hasEmail, ['maintainer_email',
'author_email'], Literal),
]

self._add_triples_from_dict(dataset_dict, contact_details, items)
self._add_triple_from_dict(
dataset_dict, contact_details,
VCARD.fn, 'contact_name', ['maintainer', 'author']
)
# Add mail address as URIRef, and ensure it has a mailto: prefix
self._add_triple_from_dict(
dataset_dict, contact_details,
VCARD.hasEmail, 'contact_email', ['maintainer_email',
'author_email'],
_type=URIRef, value_modifier=self._add_mailto
)

# Publisher
if any([
Expand Down Expand Up @@ -1116,6 +1153,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
# Checksum
if resource_dict.get('hash'):
checksum = BNode()
g.add((checksum, RDF.type, SPDX.Checksum))
g.add((checksum, SPDX.checksumValue,
Literal(resource_dict['hash'],
datatype=XSD.hexBinary)))
Expand Down Expand Up @@ -1184,6 +1222,12 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
# Basic fields
self._basic_fields_graph(dataset_ref, dataset_dict)

# Catalog
self._catalog_graph(dataset_ref, dataset_dict)

# Groups
self._groups_graph(dataset_ref, dataset_dict)

# Tags
self._tags_graph(dataset_ref, dataset_dict)

Expand All @@ -1201,6 +1245,18 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):

# Resources
self._resources_graph(dataset_ref, dataset_dict)

# Additional fields
self.additional_fields(dataset_ref, dataset_dict)

def additional_fields(self, dataset_ref, dataset_dict):
    '''
    Extension hook for fields that are not covered by this profile.

    The default implementation does nothing. Profiles for a custom
    schema should subclass this class and override this method to add
    their own triples for `dataset_ref` based on `dataset_dict`.
    '''

def _add_date_triple(self, subject, predicate, value, _type=Literal):
'''
Expand Down Expand Up @@ -1232,6 +1288,7 @@ def _basic_fields_graph(self, dataset_ref, dataset_dict):
('version', SCHEMA.version, ['dcat_version'], Literal),
('issued', SCHEMA.datePublished, ['metadata_created'], Literal),
('modified', SCHEMA.dateModified, ['metadata_modified'], Literal),
('license', SCHEMA.license, ['license_url', 'license_title'], Literal),
]
self._add_triples_from_dict(dataset_dict, dataset_ref, items)

Expand All @@ -1242,14 +1299,35 @@ def _basic_fields_graph(self, dataset_ref, dataset_dict):

self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

# Dataset URL
dataset_url = url_for('dataset_read',
id=dataset_dict['name'],
qualified=True)
self.g.add((dataset_ref, SCHEMA.url, Literal(dataset_url)))

def _catalog_graph(self, dataset_ref, dataset_dict):
    '''
    Links the dataset to a schema:DataCatalog blank node describing
    this CKAN site.

    Name, description and URL of the catalog are read from the
    `ckan.site_title`, `ckan.site_description` and `ckan.site_url`
    config options and added as literals.
    '''
    graph = self.g
    catalog_node = BNode()

    graph.add((dataset_ref, SCHEMA.includedInDataCatalog, catalog_node))
    graph.add((catalog_node, RDF.type, SCHEMA.DataCatalog))

    # (predicate, config option) pairs describing the catalog itself
    catalog_properties = (
        (SCHEMA.name, 'ckan.site_title'),
        (SCHEMA.description, 'ckan.site_description'),
        (SCHEMA.url, 'ckan.site_url'),
    )
    for predicate, option in catalog_properties:
        graph.add((catalog_node, predicate, Literal(config.get(option))))

def _groups_graph(self, dataset_ref, dataset_dict):
    '''
    Adds one schema:genre triple per group of the dataset.

    The object of each triple is a literal holding the fully qualified
    URL of the group's read page. Datasets without a `groups` key add
    nothing.
    '''
    groups = dataset_dict.get('groups', [])
    for group in groups:
        url_params = {
            'controller': 'group',
            'action': 'read',
            'id': group.get('id'),
            'qualified': True,
        }
        genre = Literal(url_for(**url_params))
        self.g.add((dataset_ref, SCHEMA.genre, genre))

def _tags_graph(self, dataset_ref, dataset_dict):
    '''
    Adds one schema:keywords triple per tag of the dataset, using the
    tag's `name` as literal value. Datasets without a `tags` key add
    nothing.
    '''
    tag_names = (tag['name'] for tag in dataset_dict.get('tags', []))
    for tag_name in tag_names:
        keyword = Literal(tag_name)
        self.g.add((dataset_ref, SCHEMA.keywords, keyword))

def _list_fields_graph(self, dataset_ref, dataset_dict):
items = [
('language', SCHEMA.inLanguage, None, Literal),
('theme', SCHEMA.about, None, URIRef),
]
self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

Expand Down
3 changes: 2 additions & 1 deletion ckanext/dcat/tests/test_base_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,4 +300,5 @@ def test_contact_details(self):
contact = p._contact_details(URIRef('http://example.org'), ADMS.contactPoint)

eq_(contact['name'], 'Point of Contact')
eq_(contact['email'], 'mailto:[email protected]')
# mailto gets removed for storage and is added again on output
eq_(contact['email'], '[email protected]')
6 changes: 4 additions & 2 deletions ckanext/dcat/tests/test_euro_dcatap_profile_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ def _get_extra_value_as_list(key):
eq_(_get_extra_value('publisher_url'), 'http://some.org')
eq_(_get_extra_value('publisher_type'), 'http://purl.org/adms/publishertype/NonProfitOrganisation')
eq_(_get_extra_value('contact_name'), 'Point of Contact')
eq_(_get_extra_value('contact_email'), 'mailto:[email protected]')
# mailto gets removed for storage and is added again on output
eq_(_get_extra_value('contact_email'), '[email protected]')
eq_(_get_extra_value('access_rights'), 'public')
eq_(_get_extra_value('provenance'), 'Some statement about provenance')
eq_(_get_extra_value('dcat_type'), 'test-type')
Expand Down Expand Up @@ -555,7 +556,8 @@ def test_dataset_json_ld_1(self):
eq_(dataset['title'], 'U.S. Widget Manufacturing Statistics')

eq_(extras['contact_name'], 'Jane Doe')
eq_(extras['contact_email'], 'mailto:[email protected]')
# mailto gets removed for storage and is added again on output
eq_(extras['contact_email'], '[email protected]')
eq_(extras['publisher_name'], 'Widget Services')
eq_(extras['publisher_email'], '[email protected]')

Expand Down
28 changes: 25 additions & 3 deletions ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def test_contact_details_extras(self):
assert contact_details
eq_(unicode(contact_details), extras['contact_uri'])
assert self._triple(g, contact_details, VCARD.fn, extras['contact_name'])
assert self._triple(g, contact_details, VCARD.hasEmail, extras['contact_email'])
assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + extras['contact_email']))

def test_contact_details_maintainer(self):
dataset = {
Expand All @@ -240,7 +240,7 @@ def test_contact_details_maintainer(self):
assert contact_details
assert_true(isinstance(contact_details, BNode))
assert self._triple(g, contact_details, VCARD.fn, dataset['maintainer'])
assert self._triple(g, contact_details, VCARD.hasEmail, dataset['maintainer_email'])
assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + dataset['maintainer_email']))

def test_contact_details_author(self):
dataset = {
Expand All @@ -259,7 +259,27 @@ def test_contact_details_author(self):
assert contact_details
assert_true(isinstance(contact_details, BNode))
assert self._triple(g, contact_details, VCARD.fn, dataset['author'])
assert self._triple(g, contact_details, VCARD.hasEmail, dataset['author_email'])
assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + dataset['author_email']))

def test_contact_details_no_duplicate_mailto(self):
    # Tests that mailto: isn't added a second time if the stored mail
    # address already carries the prefix.
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'author': 'Example Author',
        # A syntactically valid address on a reserved example domain, so
        # the serialized value is a well-formed mailto: URI.
        'author_email': 'mailto:author@example.org',
    }

    s = RDFSerializer()
    g = s.g

    dataset_ref = s.graph_from_dataset(dataset)

    contact_details = self._triple(g, dataset_ref, DCAT.contactPoint, None)[2]
    assert contact_details
    assert_true(isinstance(contact_details, BNode))
    assert self._triple(g, contact_details, VCARD.fn, dataset['author'])
    # The prefix was already present, so the URIRef must equal the stored
    # value exactly (no 'mailto:mailto:').
    assert self._triple(g, contact_details, VCARD.hasEmail, URIRef(dataset['author_email']))

def test_publisher_extras(self):
dataset = {
Expand Down Expand Up @@ -559,6 +579,7 @@ def test_distribution_fields(self):
# Checksum
checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
assert checksum
assert self._triple(g, checksum, RDF.type, SPDX.Checksum)
assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'], data_type='http://www.w3.org/2001/XMLSchema#hexBinary')
assert self._triple(g, checksum, SPDX.algorithm, URIRef(resource['hash_algorithm']))

Expand Down Expand Up @@ -936,6 +957,7 @@ def test_hash_algorithm_not_uri(self):

checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
assert checksum
assert self._triple(g, checksum, RDF.type, SPDX.Checksum)
assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'], data_type='http://www.w3.org/2001/XMLSchema#hexBinary')
assert self._triple(g, checksum, SPDX.algorithm, resource['hash_algorithm'])

Expand Down
53 changes: 52 additions & 1 deletion ckanext/dcat/tests/test_schemaorg_profile_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def test_graph_from_dataset(self):
'version': '1.0b',
'metadata_created': '2015-06-26T15:21:09.034694',
'metadata_modified': '2015-06-26T15:21:09.075774',
'license_title': 'CC-BY 3.0',
'license_url': 'http://creativecommons.org/licenses/by/3.0/',
'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
'extras': [
{'key': 'alternate_identifier', 'value': '[\"xyz\", \"abc\"]'},
Expand Down Expand Up @@ -67,7 +69,11 @@ def test_graph_from_dataset(self):
assert self._triple(g, dataset_ref, SCHEMA.name, dataset['title'])
assert self._triple(g, dataset_ref, SCHEMA.description, dataset['notes'])
assert self._triple(g, dataset_ref, SCHEMA.version, dataset['version'])
assert self._triple(g, dataset_ref, SCHEMA.license, dataset['license_url'])
assert self._triple(g, dataset_ref, SCHEMA.identifier, extras['identifier'])
url = self._triple(g, dataset_ref, SCHEMA.url, None)[2]
assert url
eq_(url, Literal('http://test.ckan.net/dataset/%s' % dataset['name']))

# Dates
assert self._triple(g, dataset_ref, SCHEMA.datePublished, dataset['metadata_created'])
Expand All @@ -81,7 +87,6 @@ def test_graph_from_dataset(self):
# List
for item in [
('language', SCHEMA.inLanguage, Literal),
('theme', SCHEMA.about, URIRef),
]:
values = json.loads(extras[item[0]])
eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values))
Expand Down Expand Up @@ -149,6 +154,52 @@ def test_publisher_org(self):
assert self._triple(g, publisher, RDF.type, SCHEMA.Organization)
assert self._triple(g, publisher, SCHEMA.name, dataset['organization']['title'])

def test_groups(self):
    # Every group of the dataset must be serialized as a schema:genre
    # triple pointing to the group's URL.
    geography = {
        'id': 'geography',
        'name': 'geography',
        'display_name': 'Geography',
    }
    statistics = {
        'id': 'statistics',
        'name': 'statistics',
        'display_name': 'Statistics',
    }
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'groups': [geography, statistics],
    }

    serializer = RDFSerializer(profiles=['schemaorg'])
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(dataset)

    genres = self._triples(graph, dataset_ref, SCHEMA.genre, None)
    assert len(genres) == 2, 'There are not exactly 2 groups'
    assert self._triple(graph, dataset_ref, SCHEMA.genre,
                        'http://test.ckan.net/group/statistics')

@helpers.change_config('ckan.site_url', 'http://ckan.example.org')
@helpers.change_config('ckan.site_description', 'CKAN Portal')
@helpers.change_config('ckan.site_title', 'ckan.example.org')
def test_catalog(self):
    # The dataset must be linked to a schema:DataCatalog node whose
    # properties mirror the site config values set by the decorators.
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
    }
    serializer = RDFSerializer(profiles=['schemaorg'])
    graph = serializer.g

    dataset_ref = serializer.graph_from_dataset(dataset)
    catalog_triple = self._triple(graph, dataset_ref, SCHEMA.includedInDataCatalog, None)
    data_catalog = catalog_triple[2]
    assert data_catalog
    assert self._triple(graph, data_catalog, RDF.type, SCHEMA.DataCatalog)

    expected_literals = (
        (SCHEMA.url, 'http://ckan.example.org'),
        (SCHEMA.name, 'ckan.example.org'),
        (SCHEMA.description, 'CKAN Portal'),
    )
    for predicate, expected in expected_literals:
        assert self._triple(graph, data_catalog, predicate, expected)

def test_temporal_start_and_end(self):
dataset = {
'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
Expand Down