Merge pull request #2 from uc-cdis/chore/json
feat(json): encode utf-8
philloooo authored Jan 9, 2018
2 parents 35899a8 + 50143b4 commit 2a4c8a3
Showing 9 changed files with 337 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -99,3 +99,6 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+# artifacts
+artifacts/
19 changes: 19 additions & 0 deletions bin/dump_schema.py
@@ -0,0 +1,19 @@
"""
This script dumps all schema files in currently installed gdcdictionary
to one json schema to ./artifacts folder.
"""
from exceptions import OSError
import json
import os

from gdcdictionary import SCHEMA_DIR
from dictionaryutils import dump_schemas_from_dir
try:
os.mkdir('artifacts')
except OSError:
pass

with open(os.path.join('artifacts', 'schema.json'), 'w') as f:
json.dump(
dump_schemas_from_dir(SCHEMA_DIR), f)
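For orientation, a minimal sketch of consuming the dumped artifact — assuming, as the loader in dictionaryutils/__init__.py below suggests, that dump_schemas_from_dir returns a mapping of schema file names to parsed schema dicts:

    # Illustrative only: inspect the bundle written by bin/dump_schema.py.
    import json

    with open('artifacts/schema.json', 'r') as f:
        all_schemas = json.load(f)

    for name in sorted(all_schemas):
        print(name)  # assumed: one entry per schema yaml in SCHEMA_DIR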
3 changes: 3 additions & 0 deletions dev-requirements.txt
@@ -0,0 +1,3 @@
-e git+https://[email protected]/NCI-GDC/cdisutils.git@8a8e599fdab5ade9bd8c586132d974a102e5d72d#egg=cdisutils
-e git+https://[email protected]/NCI-GDC/psqlgraph.git@7b5de7d56aa3159a9526940eb273579ddbf084ca#egg=psqlgraph
-e git+https://[email protected]/NCI-GDC/gdcdatamodel.git@755c6d7c380b69dc36dced55700bc9e24a084db1#egg=gdcdatamodel
6 changes: 4 additions & 2 deletions dictionaryutils/__init__.py
@@ -1,3 +1,5 @@
+from .json_load import json_loads_byteified
+
 from copy import deepcopy
 from collections import namedtuple
 from contextlib import contextmanager
@@ -45,8 +47,8 @@ def load_schemas_from_url(url, logger):
         logger.error('Fail to get schema from {}: {}'.format(url, r.text))
         raise
     schemas, resolvers = {}, {}
-
-    for key, schema in r.json().iteritems():
+    response = json_loads_byteified(r.text)
+    for key, schema in response.iteritems():
         schemas[key] = schema
         resolver = RefResolver('{}#'.format(key), schema)
         resolvers[key] = ResolverPair(resolver, schema)
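The point of swapping r.json() for json_loads_byteified shows up in a quick Python 2 sketch (illustrative values):

    import json
    from dictionaryutils.json_load import json_loads_byteified

    text = '{"key": "value"}'
    print(type(json.loads(text)['key']))            # <type 'unicode'>
    print(type(json_loads_byteified(text)['key']))  # <type 'str'>, UTF-8 encoded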
65 changes: 65 additions & 0 deletions dictionaryutils/dictionary.py
@@ -0,0 +1,65 @@
"""
This modules provide the same interface as gdcdictionary.gdcdictionary
It can be 'reinstialized' after it's called init() with another dictionary
For example, using
``gdcdictionary.gdcdictionary`` as the dictionary:
.. code-block:: python
dictionary.init(gdcdictionary.gdcdictionary)
"""

import sys


# Get this module as a variable so its attributes can be set later.
this_module = sys.modules[__name__]

#: The data dictionary must implement these attributes.
required_attrs = [
'resolvers',
'schema',
]

optional_attrs = [
'settings',
]

resolvers = None
schema = None
settings = None


def init(dictionary):
"""
Initialize this file with the same attributes as ``dictionary``
Args:
dictionary (DataDictionary): a dictionary instance
Return:
None
"""
for required_attr in required_attrs:
try:
# Basically do: this_module.required_attr = models.required_attr
setattr(
this_module, required_attr, getattr(dictionary, required_attr)
)
except AttributeError:
raise ValueError('given dictionary does not define ' + required_attr)

for optional_attr in optional_attrs:
try:
# Basically do: this_module.required_attr = models.required_attr
setattr(
this_module, optional_attr, getattr(dictionary, optional_attr)
)
except AttributeError:
pass

try:
from gdcdictionary import gdcdictionary
init(gdcdictionary)
except:
pass
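A short usage sketch of the init() hook, mirroring the fallback at the bottom of the module (assumes gdcdictionary is installed and that its schema attribute is a dict keyed by node type):

    from dictionaryutils import dictionary
    from gdcdictionary import gdcdictionary

    # Point the module-level attributes at a concrete dictionary.
    dictionary.init(gdcdictionary)
    print(sorted(dictionary.schema.keys()))  # schema keys provided by that dictionary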
35 changes: 35 additions & 0 deletions dictionaryutils/json_load.py
@@ -0,0 +1,35 @@
"""
this module serves as alternative json load that encode unicode to utf-8
from https://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-from-json/19826039
"""

import json

def json_load_byteified(file_handle):
return _byteify(
json.load(file_handle, object_hook=_byteify),
ignore_dicts=True
)

def json_loads_byteified(json_text):
return _byteify(
json.loads(json_text, object_hook=_byteify),
ignore_dicts=True
)

def _byteify(data, ignore_dicts = False):
# if this is a unicode string, return its string representation
if isinstance(data, unicode):
return data.encode('utf-8')
# if this is a list of values, return list of byteified values
if isinstance(data, list):
return [ _byteify(item, ignore_dicts=True) for item in data ]
# if this is a dictionary, return dictionary of byteified keys and values
# but only if we haven't already byteified it
if isinstance(data, dict) and not ignore_dicts:
return {
_byteify(key, ignore_dicts=True): _byteify(value, ignore_dicts=True)
for key, value in data.iteritems()
}
# if it's anything else, return it in its original form
return data
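The recursion in _byteify also converts dict keys and list elements; a small Python 2 illustration with made-up data:

    from dictionaryutils.json_load import json_loads_byteified

    doc = json_loads_byteified('{"outer": {"names": ["a", "b"]}, "count": 2}')
    # All keys and string values come back as UTF-8 encoded str objects;
    # non-string values such as the integer are passed through unchanged.
    print(doc)  # e.g. {'outer': {'names': ['a', 'b']}, 'count': 2} (key order may vary)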
202 changes: 202 additions & 0 deletions dictionaryutils/schema_test.py
@@ -0,0 +1,202 @@
"""This is an example of json schema for the GDC using schemas defined
in local yaml files.
Included are a few functions to augment jsonschema and the python
validator.
Examples are at the end.
"""


from jsonschema import validate, ValidationError
import copy
import yaml
import glob
import os
import argparse
import json
import unittest
from gdcdictionary import gdcdictionary, SCHEMA_DIR



def load_yaml_schema(path):
    with open(path, 'r') as f:
        return yaml.load(f)

CUR_DIR = os.path.dirname(SCHEMA_DIR)

DATA_DIR = os.path.join(CUR_DIR, 'examples')
project1 = load_yaml_schema(os.path.join(CUR_DIR, 'schemas/projects/project1.yaml'))
projects = {'project1': project1}

def merge_schemas(a, b, path=None):
    """Recursively zip schemas together."""
    path = path if path is not None else []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge_schemas(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass
            else:
                print("Overriding '{}':\n\t- {}\n\t+ {}".format(
                    '.'.join(path + [str(key)]), a[key], b[key]))
                a[key] = b[key]
        else:
            print("Adding '{}':\n\t+ {}".format(
                '.'.join(path + [str(key)]), b[key]))
            a[key] = b[key]
    return a


def get_project_specific_schema(projects, project, schema, entity_type):
    """Look up the core schema for its type and override it with any
    project-level overrides.
    """
    root = copy.deepcopy(schema)
    project_overrides = projects.get(project)
    if project_overrides:
        overrides = project_overrides.get(entity_type)
        if overrides:
            merge_schemas(root, overrides, [entity_type])
    return root


def validate_entity(entity, schemata, project=None, name=''):
    """Validate an entity by looking up the core schema for its type and
    overriding it with any project-level overrides.
    """
    local_schema = get_project_specific_schema(
        projects, project, schemata[entity['type']], entity['type'])
    result = validate(entity, local_schema)
    return result


def validate_schemata(schemata, metaschema):
    # validate each schema against the metaschema
    print('Validating schemas against metaschema... '),
    for s in schemata.values():
        validate(s, metaschema)

        def assert_link_is_also_prop(link):
            assert link in s['properties'], \
                "Entity '{}' has '{}' as a link but not as a property".format(
                    s['id'], link)

        for link in [l['name'] for l in s['links'] if 'name' in l]:
            assert_link_is_also_prop(link)
        for subgroup in [l['subgroup'] for l in s['links'] if 'name' not in l]:
            for link in [l['name'] for l in subgroup if 'name' in l]:
                assert_link_is_also_prop(link)


class SchemaTest(unittest.TestCase):
    def setUp(self):
        self.dictionary = gdcdictionary
        self.definitions = yaml.load(
            open(os.path.join(CUR_DIR, 'schemas', '_definitions.yaml'), 'r'))

    def test_schemas(self):
        validate_schemata(self.dictionary.schema, self.dictionary.metaschema)

    def test_valid_files(self):
        for path in glob.glob(os.path.join(DATA_DIR, 'valid', '*.json')):
            print("Validating {}".format(path))
            doc = json.load(open(path, 'r'))
            print(doc)
            if type(doc) == dict:
                self.add_system_props(doc)
                validate_entity(doc, self.dictionary.schema)
            elif type(doc) == list:
                for entity in doc:
                    self.add_system_props(entity)
                    validate_entity(entity, self.dictionary.schema)
            else:
                raise Exception("Invalid json")

    def test_invalid_files(self):
        for path in glob.glob(os.path.join(DATA_DIR, 'invalid', '*.json')):
            print("Validating {}".format(path))
            doc = json.load(open(path, 'r'))
            if type(doc) == dict:
                self.add_system_props(doc)
                with self.assertRaises(ValidationError):
                    validate_entity(doc, self.dictionary.schema)
            elif type(doc) == list:
                for entity in doc:
                    self.add_system_props(entity)
                    with self.assertRaises(ValidationError):
                        validate_entity(entity, self.dictionary.schema)
            else:
                raise Exception("Invalid json")

    def add_system_props(self, doc):
        schema = self.dictionary.schema[doc['type']]
        for key in schema['systemProperties']:
            use_def_default = (
                '$ref' in schema['properties'][key] and
                key in self.definitions and
                'default' in self.definitions[key]
            )
            if use_def_default:
                doc[key] = self.definitions[key]['default']

if __name__ == '__main__':

    ####################
    # Setup
    ####################

    parser = argparse.ArgumentParser(description='Validate JSON')
    parser.add_argument('jsonfiles', metavar='file',
                        type=argparse.FileType('r'), nargs='*',
                        help='json files to test if (in)valid')

    parser.add_argument('--invalid', action='store_true', default=False,
                        help='expect the files to be invalid instead of valid')

    args = parser.parse_args()

    ####################
    # Example validation
    ####################

    # Load schemata
    dictionary = gdcdictionary

    for f in args.jsonfiles:
        doc = json.load(f)
        if args.invalid:
            try:
                print("CHECK if {0} is invalid:".format(f.name)),
                print(type(doc))
                if type(doc) == dict:
                    validate_entity(doc, dictionary.schema)
                elif type(doc) == list:
                    for entity in doc:
                        validate_entity(entity, dictionary.schema)
                else:
                    raise ValidationError("Invalid json")
            except ValidationError as e:
                print("Invalid as expected.")
                pass
            else:
                raise Exception("Expected invalid, but validated.")
        else:
            print("CHECK if {0} is valid:".format(f.name)),
            if type(doc) == dict:
                validate_entity(doc, dictionary.schema)
            elif type(doc) == list:
                for entity in doc:
                    validate_entity(entity, dictionary.schema)
            else:
                print("Invalid json")

            print("Valid as expected")
    print('ok.')
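A small, self-contained illustration of the merge_schemas helper above (values are made up; assumes merge_schemas is in scope, e.g. imported from this module):

    base = {'properties': {'size': {'type': 'integer'}, 'name': {'type': 'string'}}}
    override = {'properties': {'size': {'type': 'number'}}}

    merged = merge_schemas(base, override)
    # merge_schemas mutates and returns its first argument; the conflicting leaf
    # prints an "Overriding 'properties.size.type': ..." message before being replaced.
    assert merged['properties']['size'] == {'type': 'number'}
    assert merged is base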
4 changes: 4 additions & 0 deletions run_tests.sh
@@ -0,0 +1,4 @@
#!/bin/bash
pip install -r dev-requirements.txt
nosetests -v
python bin/dump_schema.py
2 changes: 2 additions & 0 deletions tests/datamodel_test.py
@@ -0,0 +1,2 @@
def test_datamodel():
    from gdcdatamodel.models import *
