Merge pull request #2 from uc-cdis/chore/json
feat(json): encode utf-8
philloooo authored Jan 9, 2018
2 parents 35899a8 + 50143b4 commit 2a4c8a3
Showing 9 changed files with 337 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -99,3 +99,6 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+# artifacts
+artifacts/
19 changes: 19 additions & 0 deletions bin/dump_schema.py
@@ -0,0 +1,19 @@
"""
This script dumps all schema files in currently installed gdcdictionary
to one json schema to ./artifacts folder.
"""
from exceptions import OSError
import json
import os

from gdcdictionary import SCHEMA_DIR
from dictionaryutils import dump_schemas_from_dir
try:
os.mkdir('artifacts')
except OSError:
pass

with open(os.path.join('artifacts', 'schema.json'), 'w') as f:
json.dump(
dump_schemas_from_dir(SCHEMA_DIR), f)
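For orientation, a minimal sketch of consuming the dumped artifact — assuming, as the loader in dictionaryutils/__init__.py below suggests, that dump_schemas_from_dir returns a mapping of schema file names to parsed schema dicts:

    # Illustrative only: inspect the bundle written by bin/dump_schema.py.
    import json

    with open('artifacts/schema.json', 'r') as f:
        all_schemas = json.load(f)

    for name in sorted(all_schemas):
        print(name)  # assumed: one entry per schema yaml in SCHEMA_DIR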
3 changes: 3 additions & 0 deletions dev-requirements.txt
@@ -0,0 +1,3 @@
-e git+https://[email protected]/NCI-GDC/cdisutils.git@8a8e599fdab5ade9bd8c586132d974a102e5d72d#egg=cdisutils
-e git+https://[email protected]/NCI-GDC/psqlgraph.git@7b5de7d56aa3159a9526940eb273579ddbf084ca#egg=psqlgraph
-e git+https://[email protected]/NCI-GDC/gdcdatamodel.git@755c6d7c380b69dc36dced55700bc9e24a084db1#egg=gdcdatamodel
6 changes: 4 additions & 2 deletions dictionaryutils/__init__.py
@@ -1,3 +1,5 @@
+from .json_load import json_loads_byteified
+
 from copy import deepcopy
 from collections import namedtuple
 from contextlib import contextmanager
@@ -45,8 +47,8 @@ def load_schemas_from_url(url, logger):
         logger.error('Fail to get schema from {}: {}'.format(url, r.text))
         raise
     schemas, resolvers = {}, {}
-
-    for key, schema in r.json().iteritems():
+    response = json_loads_byteified(r.text)
+    for key, schema in response.iteritems():
         schemas[key] = schema
         resolver = RefResolver('{}#'.format(key), schema)
         resolvers[key] = ResolverPair(resolver, schema)
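The point of swapping r.json() for json_loads_byteified shows up in a quick Python 2 sketch (illustrative values):

    import json
    from dictionaryutils.json_load import json_loads_byteified

    text = '{"key": "value"}'
    print(type(json.loads(text)['key']))            # <type 'unicode'>
    print(type(json_loads_byteified(text)['key']))  # <type 'str'>, UTF-8 encoded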
65 changes: 65 additions & 0 deletions dictionaryutils/dictionary.py
@@ -0,0 +1,65 @@
"""
This modules provide the same interface as gdcdictionary.gdcdictionary
It can be 'reinstialized' after it's called init() with another dictionary
For example, using
``gdcdictionary.gdcdictionary`` as the dictionary:
.. code-block:: python
dictionary.init(gdcdictionary.gdcdictionary)
"""

import sys


# Get this module as a variable so its attributes can be set later.
this_module = sys.modules[__name__]

#: The data dictionary must implement these attributes.
required_attrs = [
'resolvers',
'schema',
]

optional_attrs = [
'settings',
]

resolvers = None
schema = None
settings = None


def init(dictionary):
"""
Initialize this file with the same attributes as ``dictionary``
Args:
dictionary (DataDictionary): a dictionary instance
Return:
None
"""
for required_attr in required_attrs:
try:
# Basically do: this_module.required_attr = models.required_attr
setattr(
this_module, required_attr, getattr(dictionary, required_attr)
)
except AttributeError:
raise ValueError('given dictionary does not define ' + required_attr)

for optional_attr in optional_attrs:
try:
# Basically do: this_module.required_attr = models.required_attr
setattr(
this_module, optional_attr, getattr(dictionary, optional_attr)
)
except AttributeError:
pass

try:
from gdcdictionary import gdcdictionary
init(gdcdictionary)
except:
pass
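A short usage sketch of the init() hook, mirroring the fallback at the bottom of the module (assumes gdcdictionary is installed and that its schema attribute is a dict keyed by node type):

    from dictionaryutils import dictionary
    from gdcdictionary import gdcdictionary

    # Point the module-level attributes at a concrete dictionary.
    dictionary.init(gdcdictionary)
    print(sorted(dictionary.schema.keys()))  # schema keys provided by that dictionary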
35 changes: 35 additions & 0 deletions dictionaryutils/json_load.py
@@ -0,0 +1,35 @@
"""
this module serves as alternative json load that encode unicode to utf-8
from https://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-from-json/19826039
"""

import json

def json_load_byteified(file_handle):
return _byteify(
json.load(file_handle, object_hook=_byteify),
ignore_dicts=True
)

def json_loads_byteified(json_text):
return _byteify(
json.loads(json_text, object_hook=_byteify),
ignore_dicts=True
)

def _byteify(data, ignore_dicts = False):
# if this is a unicode string, return its string representation
if isinstance(data, unicode):
return data.encode('utf-8')
# if this is a list of values, return list of byteified values
if isinstance(data, list):
return [ _byteify(item, ignore_dicts=True) for item in data ]
# if this is a dictionary, return dictionary of byteified keys and values
# but only if we haven't already byteified it
if isinstance(data, dict) and not ignore_dicts:
return {
_byteify(key, ignore_dicts=True): _byteify(value, ignore_dicts=True)
for key, value in data.iteritems()
}
# if it's anything else, return it in its original form
return data
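The recursion in _byteify also converts dict keys and list elements; a small Python 2 illustration with made-up data:

    from dictionaryutils.json_load import json_loads_byteified

    doc = json_loads_byteified('{"outer": {"names": ["a", "b"]}, "count": 2}')
    # All keys and string values come back as UTF-8 encoded str objects;
    # non-string values such as the integer are passed through unchanged.
    print(doc)  # e.g. {'outer': {'names': ['a', 'b']}, 'count': 2} (key order may vary)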
202 changes: 202 additions & 0 deletions dictionaryutils/schema_test.py
@@ -0,0 +1,202 @@
"""This is an example of json schema for the GDC using schemas defined
in local yaml files.
Included are a few functions to augment jsonschema and the python
validator.
Examples are at the end.
"""


from jsonschema import validate, ValidationError
import copy
import yaml
import glob
import os
import argparse
import json
import unittest
from gdcdictionary import gdcdictionary, SCHEMA_DIR



def load_yaml_schema(path):
    with open(path, 'r') as f:
        return yaml.load(f)

CUR_DIR = os.path.dirname(SCHEMA_DIR)

DATA_DIR = os.path.join(CUR_DIR, 'examples')
project1 = load_yaml_schema(os.path.join(CUR_DIR, 'schemas/projects/project1.yaml'))
projects = {'project1': project1}

def merge_schemas(a, b, path=None):
    """Recursively zip schemas together."""
    path = path if path is not None else []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge_schemas(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass
            else:
                print("Overriding '{}':\n\t- {}\n\t+ {}".format(
                    '.'.join(path + [str(key)]), a[key], b[key]))
                a[key] = b[key]
        else:
            print("Adding '{}':\n\t+ {}".format(
                '.'.join(path + [str(key)]), b[key]))
            a[key] = b[key]
    return a


def get_project_specific_schema(projects, project, schema, entity_type):
    """Look up the core schema for its type and override it with any
    project-level overrides.
    """
    root = copy.deepcopy(schema)
    project_overrides = projects.get(project)
    if project_overrides:
        overrides = project_overrides.get(entity_type)
        if overrides:
            merge_schemas(root, overrides, [entity_type])
    return root


def validate_entity(entity, schemata, project=None, name=''):
    """Validate an entity by looking up the core schema for its type and
    overriding it with any project-level overrides.
    """
    local_schema = get_project_specific_schema(
        projects, project, schemata[entity['type']], entity['type'])
    result = validate(entity, local_schema)
    return result


def validate_schemata(schemata, metaschema):
    # validate each schema against the metaschema
    print('Validating schemas against metaschema... '),
    for s in schemata.values():
        validate(s, metaschema)

        def assert_link_is_also_prop(link):
            assert link in s['properties'], \
                "Entity '{}' has '{}' as a link but not as a property".format(
                    s['id'], link)

        for link in [l['name'] for l in s['links'] if 'name' in l]:
            assert_link_is_also_prop(link)
        for subgroup in [l['subgroup'] for l in s['links'] if 'name' not in l]:
            for link in [l['name'] for l in subgroup if 'name' in l]:
                assert_link_is_also_prop(link)


class SchemaTest(unittest.TestCase):
    def setUp(self):
        self.dictionary = gdcdictionary
        self.definitions = yaml.load(
            open(os.path.join(CUR_DIR, 'schemas', '_definitions.yaml'), 'r'))

    def test_schemas(self):
        validate_schemata(self.dictionary.schema, self.dictionary.metaschema)

    def test_valid_files(self):
        for path in glob.glob(os.path.join(DATA_DIR, 'valid', '*.json')):
            print("Validating {}".format(path))
            doc = json.load(open(path, 'r'))
            print(doc)
            if type(doc) == dict:
                self.add_system_props(doc)
                validate_entity(doc, self.dictionary.schema)
            elif type(doc) == list:
                for entity in doc:
                    self.add_system_props(entity)
                    validate_entity(entity, self.dictionary.schema)
            else:
                raise Exception("Invalid json")

    def test_invalid_files(self):
        for path in glob.glob(os.path.join(DATA_DIR, 'invalid', '*.json')):
            print("Validating {}".format(path))
            doc = json.load(open(path, 'r'))
            if type(doc) == dict:
                self.add_system_props(doc)
                with self.assertRaises(ValidationError):
                    validate_entity(doc, self.dictionary.schema)
            elif type(doc) == list:
                for entity in doc:
                    self.add_system_props(entity)
                    with self.assertRaises(ValidationError):
                        validate_entity(entity, self.dictionary.schema)
            else:
                raise Exception("Invalid json")

    def add_system_props(self, doc):
        schema = self.dictionary.schema[doc['type']]
        for key in schema['systemProperties']:
            use_def_default = (
                '$ref' in schema['properties'][key] and
                key in self.definitions and
                'default' in self.definitions[key]
            )
            if use_def_default:
                doc[key] = self.definitions[key]['default']

if __name__ == '__main__':

    ####################
    # Setup
    ####################

    parser = argparse.ArgumentParser(description='Validate JSON')
    parser.add_argument('jsonfiles', metavar='file',
                        type=argparse.FileType('r'), nargs='*',
                        help='json files to test if (in)valid')

    parser.add_argument('--invalid', action='store_true', default=False,
                        help='expect the files to be invalid instead of valid')

    args = parser.parse_args()

    ####################
    # Example validation
    ####################

    # Load schemata
    dictionary = gdcdictionary

    for f in args.jsonfiles:
        doc = json.load(f)
        if args.invalid:
            try:
                print("CHECK if {0} is invalid:".format(f.name)),
                print(type(doc))
                if type(doc) == dict:
                    validate_entity(doc, dictionary.schema)
                elif type(doc) == list:
                    for entity in doc:
                        validate_entity(entity, dictionary.schema)
                else:
                    raise ValidationError("Invalid json")
            except ValidationError as e:
                print("Invalid as expected.")
                pass
            else:
                raise Exception("Expected invalid, but validated.")
        else:
            print("CHECK if {0} is valid:".format(f.name)),
            if type(doc) == dict:
                validate_entity(doc, dictionary.schema)
            elif type(doc) == list:
                for entity in doc:
                    validate_entity(entity, dictionary.schema)
            else:
                print("Invalid json")

            print("Valid as expected")
    print('ok.')
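A small, self-contained illustration of the merge_schemas helper above (values are made up; assumes merge_schemas is in scope, e.g. imported from this module):

    base = {'properties': {'size': {'type': 'integer'}, 'name': {'type': 'string'}}}
    override = {'properties': {'size': {'type': 'number'}}}

    merged = merge_schemas(base, override)
    # merge_schemas mutates and returns its first argument; the conflicting leaf
    # prints an "Overriding 'properties.size.type': ..." message before being replaced.
    assert merged['properties']['size'] == {'type': 'number'}
    assert merged is base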
4 changes: 4 additions & 0 deletions run_tests.sh
@@ -0,0 +1,4 @@
#!/bin/bash
pip install -r dev-requirements.txt
nosetests -v
python bin/dump_schema.py
2 changes: 2 additions & 0 deletions tests/datamodel_test.py
@@ -0,0 +1,2 @@
def test_datamodel():
    from gdcdatamodel.models import *
