opendata-swiss · stefina · Aug 29, 2018 · Aug 29, 2018 · Aug 29, 2018 · Sep 12, 2018
diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py
@@ -14,6 +14,7 @@
 from ckan.model.license import LicenseRegister
 from ckan.plugins import toolkit
 from ckan.lib.munge import munge_tag
+from ckan.lib.helpers import url_for
 
 from ckanext.dcat.utils import resource_uri, publisher_uri_from_dataset_dict, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS
 
@@ -43,8 +44,11 @@
     'locn': LOCN,
     'gsp': GSP,
     'owl': OWL,
+    'spdx': SPDX,
 }
 
+PREFIX_MAILTO = u'mailto:'
+
 
 class RDFProfile(object):
     '''Base class with helper methods for implementing RDF parsing profiles
@@ -255,7 +259,9 @@ def _contact_details(self, subject, predicate):
 
             contact['name'] = self._object_value(agent, VCARD.fn)
 
-            contact['email'] = self._object_value(agent, VCARD.hasEmail)
+            contact['email'] = self._without_mailto(
+                self._object_value(agent, VCARD.hasEmail)
+            )
 
         return contact
 
@@ -471,15 +477,19 @@ def _add_triple_from_dict(self, _dict, subject, predicate, key,
                               fallbacks=None,
                               list_value=False,
                               date_value=False,
-                              _type=Literal):
+                              _type=Literal,
+                              value_modifier=None):
         '''
         Adds a new triple to the graph with the provided parameters
 
         The subject and predicate of the triple are passed as the relevant
-        RDFLib objects (URIRef or BNode). The object is always a literal value,
-        which is extracted from the dict using the provided key (see
-        `_get_dict_value`). If the value for the key is not found, then
+        RDFLib objects (URIRef or BNode). By default, the object is a
+        literal value, which is extracted from the dict using the provided key
+        (see `_get_dict_value`). If the value for the key is not found, then
         additional fallback keys are checked.
+        `value_modifier` may be a function that takes the extracted value
+        and returns a modified one. It is applied only if a value was found,
+        before that value is added to the graph.
 
         If `list_value` or `date_value` are True, then the value is treated as
         a list or a date respectively (see `_add_list_triple` and
@@ -492,6 +502,10 @@ def _add_triple_from_dict(self, _dict, subject, predicate, key,
                 if value:
                     break
 
+        # if a modifying function was given, apply it to the value
+        if value and callable(value_modifier):
+            value = value_modifier(value)
+
         if value and list_value:
             self._add_list_triple(subject, predicate, value, _type)
         elif value and date_value:
@@ -571,6 +585,25 @@ def _last_catalog_modification(self):
             return result['results'][0]['metadata_modified']
         return None
 
+    def _add_mailto(self, mail_addr):
+        '''
+        Ensures that the mail address has a URIRef-compatible mailto: prefix.
+        Can be used as modifier function for `_add_triple_from_dict`.
+        '''
+        if mail_addr:
+            return PREFIX_MAILTO + self._without_mailto(mail_addr)
+        else:
+            return mail_addr
+
+    def _without_mailto(self, mail_addr):
+        '''
+        Removes every `mailto:` occurrence from the mail address string.
+        '''
+        if mail_addr:
+            return unicode(mail_addr).replace(PREFIX_MAILTO, u'')
+        else:
+            return mail_addr
+
     def _get_source_catalog(self, dataset_ref):
         '''
         Returns Catalog reference that is source for this dataset. 
@@ -957,13 +990,17 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
             g.add((contact_details, RDF.type, VCARD.Organization))
             g.add((dataset_ref, DCAT.contactPoint, contact_details))
 
-            items = [
-                ('contact_name', VCARD.fn, ['maintainer', 'author'], Literal),
-                ('contact_email', VCARD.hasEmail, ['maintainer_email',
-                                                   'author_email'], Literal),
-            ]
-
-            self._add_triples_from_dict(dataset_dict, contact_details, items)
+            self._add_triple_from_dict(
+                dataset_dict, contact_details,
+                VCARD.fn, 'contact_name', ['maintainer', 'author']
+            )
+            # Add mail address as URIRef, and ensure it has a mailto: prefix
+            self._add_triple_from_dict(
+                dataset_dict, contact_details,
+                VCARD.hasEmail, 'contact_email', ['maintainer_email',
+                                                  'author_email'],
+                _type=URIRef, value_modifier=self._add_mailto
+            )
 
         # Publisher
         if any([
@@ -1116,6 +1153,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
             # Checksum
             if resource_dict.get('hash'):
                 checksum = BNode()
+                g.add((checksum, RDF.type, SPDX.Checksum))
                 g.add((checksum, SPDX.checksumValue,
                        Literal(resource_dict['hash'],
                                datatype=XSD.hexBinary)))
@@ -1184,6 +1222,12 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
         # Basic fields
         self._basic_fields_graph(dataset_ref, dataset_dict)
 
+        # Catalog
+        self._catalog_graph(dataset_ref, dataset_dict)
+
+        # Groups
+        self._groups_graph(dataset_ref, dataset_dict)
+
         # Tags
         self._tags_graph(dataset_ref, dataset_dict)
 
@@ -1201,6 +1245,18 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
 
         # Resources
         self._resources_graph(dataset_ref, dataset_dict)
+
+        # Additional fields
+        self.additional_fields(dataset_ref, dataset_dict)
+
+    def additional_fields(self, dataset_ref, dataset_dict):
+        '''
+        Hook for adding any additional fields (does nothing by default).
+
+        For a custom schema you should subclass this class and
+        override this method.
+        '''
+        pass
 
     def _add_date_triple(self, subject, predicate, value, _type=Literal):
         '''
@@ -1232,6 +1288,7 @@ def _basic_fields_graph(self, dataset_ref, dataset_dict):
             ('version', SCHEMA.version, ['dcat_version'], Literal),
             ('issued', SCHEMA.datePublished, ['metadata_created'], Literal),
             ('modified', SCHEMA.dateModified, ['metadata_modified'], Literal),
+            ('license', SCHEMA.license, ['license_url', 'license_title'], Literal),
         ]
         self._add_triples_from_dict(dataset_dict, dataset_ref, items)
 
@@ -1242,14 +1299,35 @@ def _basic_fields_graph(self, dataset_ref, dataset_dict):
 
         self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)
 
+        # Dataset URL
+        dataset_url = url_for('dataset_read',
+                              id=dataset_dict['name'],
+                              qualified=True)
+        self.g.add((dataset_ref, SCHEMA.url, Literal(dataset_url)))
+
+    def _catalog_graph(self, dataset_ref, dataset_dict):
+        data_catalog = BNode()
+        self.g.add((dataset_ref, SCHEMA.includedInDataCatalog, data_catalog))
+        self.g.add((data_catalog, RDF.type, SCHEMA.DataCatalog))
+        self.g.add((data_catalog, SCHEMA.name, Literal(config.get('ckan.site_title'))))
+        self.g.add((data_catalog, SCHEMA.description, Literal(config.get('ckan.site_description'))))
+        self.g.add((data_catalog, SCHEMA.url, Literal(config.get('ckan.site_url'))))
+
+    def _groups_graph(self, dataset_ref, dataset_dict):
+        for group in dataset_dict.get('groups', []):
+            group_url = url_for(controller='group',
+                                action='read',
+                                id=group.get('id'),
+                                qualified=True)
+            self.g.add((dataset_ref, SCHEMA.genre, Literal(group_url)))
+
     def _tags_graph(self, dataset_ref, dataset_dict):
         for tag in dataset_dict.get('tags', []):
             self.g.add((dataset_ref, SCHEMA.keywords, Literal(tag['name'])))
 
     def _list_fields_graph(self, dataset_ref, dataset_dict):
         items = [
             ('language', SCHEMA.inLanguage, None, Literal),
-            ('theme', SCHEMA.about, None, URIRef),
         ]
         self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)
 

diff --git a/ckanext/dcat/tests/test_base_profile.py b/ckanext/dcat/tests/test_base_profile.py
@@ -300,4 +300,5 @@ def test_contact_details(self):
         contact = p._contact_details(URIRef('http://example.org'), ADMS.contactPoint)
 
         eq_(contact['name'], 'Point of Contact')
-        eq_(contact['email'], 'mailto:[email protected]')
+        # mailto gets removed for storage and is added again on output
+        eq_(contact['email'], '[email protected]')
diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py b/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py
@@ -91,7 +91,8 @@ def _get_extra_value_as_list(key):
         eq_(_get_extra_value('publisher_url'), 'http://some.org')
         eq_(_get_extra_value('publisher_type'), 'http://purl.org/adms/publishertype/NonProfitOrganisation')
         eq_(_get_extra_value('contact_name'), 'Point of Contact')
-        eq_(_get_extra_value('contact_email'), 'mailto:[email protected]')
+        # mailto gets removed for storage and is added again on output
+        eq_(_get_extra_value('contact_email'), '[email protected]')
         eq_(_get_extra_value('access_rights'), 'public')
         eq_(_get_extra_value('provenance'), 'Some statement about provenance')
         eq_(_get_extra_value('dcat_type'), 'test-type')
@@ -555,7 +556,8 @@ def test_dataset_json_ld_1(self):
         eq_(dataset['title'], 'U.S. Widget Manufacturing Statistics')
 
         eq_(extras['contact_name'], 'Jane Doe')
-        eq_(extras['contact_email'], 'mailto:[email protected]')
+        # mailto gets removed for storage and is added again on output
+        eq_(extras['contact_email'], '[email protected]')
         eq_(extras['publisher_name'], 'Widget Services')
         eq_(extras['publisher_email'], '[email protected]')
 

diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py
@@ -219,7 +219,7 @@ def test_contact_details_extras(self):
         assert contact_details
         eq_(unicode(contact_details), extras['contact_uri'])
         assert self._triple(g, contact_details, VCARD.fn, extras['contact_name'])
-        assert self._triple(g, contact_details, VCARD.hasEmail, extras['contact_email'])
+        assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + extras['contact_email']))
 
     def test_contact_details_maintainer(self):
         dataset = {
@@ -240,7 +240,7 @@ def test_contact_details_maintainer(self):
         assert contact_details
         assert_true(isinstance(contact_details, BNode))
         assert self._triple(g, contact_details, VCARD.fn, dataset['maintainer'])
-        assert self._triple(g, contact_details, VCARD.hasEmail, dataset['maintainer_email'])
+        assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + dataset['maintainer_email']))
 
     def test_contact_details_author(self):
         dataset = {
@@ -259,7 +259,27 @@ def test_contact_details_author(self):
         assert contact_details
         assert_true(isinstance(contact_details, BNode))
         assert self._triple(g, contact_details, VCARD.fn, dataset['author'])
-        assert self._triple(g, contact_details, VCARD.hasEmail, dataset['author_email'])
+        assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + dataset['author_email']))
+
+    def test_contact_details_no_duplicate_mailto(self):
+        # tests that mailto: isn't added again if it is stored in the dataset
+        dataset = {
+            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
+            'name': 'test-dataset',
+            'author': 'Example Author',
+            'author_email': 'mailto:[email protected]',
+        }
+
+        s = RDFSerializer()
+        g = s.g
+
+        dataset_ref = s.graph_from_dataset(dataset)
+
+        contact_details = self._triple(g, dataset_ref, DCAT.contactPoint, None)[2]
+        assert contact_details
+        assert_true(isinstance(contact_details, BNode))
+        assert self._triple(g, contact_details, VCARD.fn, dataset['author'])
+        assert self._triple(g, contact_details, VCARD.hasEmail, URIRef(dataset['author_email']))
 
     def test_publisher_extras(self):
         dataset = {
@@ -559,6 +579,7 @@ def test_distribution_fields(self):
         # Checksum
         checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
         assert checksum
+        assert self._triple(g, checksum, RDF.type, SPDX.Checksum)
         assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'], data_type='http://www.w3.org/2001/XMLSchema#hexBinary')
         assert self._triple(g, checksum, SPDX.algorithm, URIRef(resource['hash_algorithm']))
 
@@ -936,6 +957,7 @@ def test_hash_algorithm_not_uri(self):
 
         checksum = self._triple(g, distribution, SPDX.checksum, None)[2]
         assert checksum
+        assert self._triple(g, checksum, RDF.type, SPDX.Checksum)
         assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'], data_type='http://www.w3.org/2001/XMLSchema#hexBinary')
         assert self._triple(g, checksum, SPDX.algorithm, resource['hash_algorithm'])
 

diff --git a/ckanext/dcat/tests/test_schemaorg_profile_serialize.py b/ckanext/dcat/tests/test_schemaorg_profile_serialize.py
@@ -33,6 +33,8 @@ def test_graph_from_dataset(self):
             'version': '1.0b',
             'metadata_created': '2015-06-26T15:21:09.034694',
             'metadata_modified': '2015-06-26T15:21:09.075774',
+            'license_title': 'CC-BY 3.0',
+            'license_url': 'http://creativecommons.org/licenses/by/3.0/',
             'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
             'extras': [
                 {'key': 'alternate_identifier', 'value': '[\"xyz\", \"abc\"]'},
@@ -67,7 +69,11 @@ def test_graph_from_dataset(self):
         assert self._triple(g, dataset_ref, SCHEMA.name, dataset['title'])
         assert self._triple(g, dataset_ref, SCHEMA.description, dataset['notes'])
         assert self._triple(g, dataset_ref, SCHEMA.version, dataset['version'])
+        assert self._triple(g, dataset_ref, SCHEMA.license, dataset['license_url'])
         assert self._triple(g, dataset_ref, SCHEMA.identifier, extras['identifier'])
+        url = self._triple(g, dataset_ref, SCHEMA.url, None)[2]
+        assert url
+        eq_(url, Literal('http://test.ckan.net/dataset/%s' % dataset['name']))
 
         # Dates
         assert self._triple(g, dataset_ref, SCHEMA.datePublished, dataset['metadata_created'])
@@ -81,7 +87,6 @@ def test_graph_from_dataset(self):
         # List
         for item in [
             ('language', SCHEMA.inLanguage, Literal),
-            ('theme', SCHEMA.about, URIRef),
         ]:
             values = json.loads(extras[item[0]])
             eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values))
@@ -149,6 +154,52 @@ def test_publisher_org(self):
         assert self._triple(g, publisher, RDF.type, SCHEMA.Organization)
         assert self._triple(g, publisher, SCHEMA.name, dataset['organization']['title'])
 
+    def test_groups(self):
+        dataset = {
+            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
+            'name': 'test-dataset',
+            'groups': [
+                {
+                    'id': 'geography',
+                    'name': 'geography',
+                    'display_name': 'Geography',
+                },
+                {
+                    'id': 'statistics',
+                    'name': 'statistics',
+                    'display_name': 'Statistics',
+                },
+            ]
+        }
+
+        s = RDFSerializer(profiles=['schemaorg'])
+        g = s.g
+
+        dataset_ref = s.graph_from_dataset(dataset)
+
+        genres = self._triples(g, dataset_ref, SCHEMA.genre, None)
+        assert len(genres) == 2, 'There are not exactly 2 groups'
+        assert self._triple(g, dataset_ref, SCHEMA.genre, 'http://test.ckan.net/group/statistics')
+
+    @helpers.change_config('ckan.site_url', 'http://ckan.example.org')
+    @helpers.change_config('ckan.site_description', 'CKAN Portal')
+    @helpers.change_config('ckan.site_title', 'ckan.example.org')
+    def test_catalog(self):
+        dataset = {
+            'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
+            'name': 'test-dataset',
+        }
+        s = RDFSerializer(profiles=['schemaorg'])
+        g = s.g
+
+        dataset_ref = s.graph_from_dataset(dataset)
+        data_catalog = self._triple(g, dataset_ref, SCHEMA.includedInDataCatalog, None)[2]
+        assert data_catalog
+        assert self._triple(g, data_catalog, RDF.type, SCHEMA.DataCatalog)
+        assert self._triple(g, data_catalog, SCHEMA.url, 'http://ckan.example.org')
+        assert self._triple(g, data_catalog, SCHEMA.name, 'ckan.example.org')
+        assert self._triple(g, data_catalog, SCHEMA.description, 'CKAN Portal')
+
     def test_temporal_start_and_end(self):
         dataset = {
             'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',