From 2101e809144d4150b64b1e680e73c05788dd241d Mon Sep 17 00:00:00 2001
From: Michael Terry <michael.terry@childrens.harvard.edu>
Date: Wed, 22 Nov 2023 08:53:54 -0500
Subject: [PATCH] Initial skeleton project

---
 .github/workflows/ci.yaml        |  69 +++++
 .github/workflows/pypi.yaml      |  27 ++
 .gitignore                       |   2 +
 .pre-commit-config.yaml          |   7 +
 .pylintrc                        | 437 +++++++++++++++++++++++++++++++
 CONTRIBUTING.md                  |  12 +
 README.md                        |  41 +++
 cumulus_fhir_support/__init__.py |   5 +
 cumulus_fhir_support/schemas.py  | 193 ++++++++++++++
 pyproject.toml                   |  49 ++++
 tests/__init__.py                |   0
 tests/test_schemas.py            | 183 +++++++++++++
 12 files changed, 1025 insertions(+)
 create mode 100644 .github/workflows/ci.yaml
 create mode 100644 .github/workflows/pypi.yaml
 create mode 100644 .gitignore
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 .pylintrc
 create mode 100644 CONTRIBUTING.md
 create mode 100644 README.md
 create mode 100644 cumulus_fhir_support/__init__.py
 create mode 100644 cumulus_fhir_support/schemas.py
 create mode 100644 pyproject.toml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_schemas.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..170bd10
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,69 @@
+name: CI
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+# The goal here is to cancel older workflows when a PR is updated (because it's pointless work)
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
+  cancel-in-progress: true
+
+jobs:
+  unittest:
+    name: unit tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install ".[tests]"
+
+      - name: Test with pytest
+        run: |
+          python -m pytest
+
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install linters
+        # black is synced with the version pinned in .pre-commit-config.yaml
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install .[dev] bandit[toml] pycodestyle pylint
+
+      - name: Run pycodestyle
+        # E203: pycodestyle is a little too rigid about slices & whitespace
+        # See https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#slices
+        # W503: a default ignore that we are restoring
+        run: |
+          pycodestyle --max-line-length=100 --ignore=E203,W503 .
+
+      - name: Run pylint
+        if: success() || failure()  # still run pylint if above checks fail
+        run: |
+          pylint cumulus_fhir_support tests
+
+      - name: Run bandit
+        if: success() || failure()  # still run bandit if above checks fail
+        run: |
+          bandit -c pyproject.toml -r .
+
+      - name: Run black
+        if: success() || failure()  # still run black if above checks fail
+        run: |
+          black --check --verbose .
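A quick illustration of the E203 exemption above (a sketch, not from this patch itself): black formats slices whose bounds are compound expressions with whitespace around the colon, which pycodestyle reports as E203 even though that is exactly the style black enforces.

```python
# After black formatting, pycodestyle would flag "E203 whitespace before ':'"
# on the slice below -- hence the --ignore=E203 in the workflow above.
ham = list(range(10))
lower, offset = 2, 3
chunk = ham[lower + offset : lower + offset + 2]  # E203 fires here, harmlessly
```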
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
new file mode 100644
index 0000000..42dcac1
--- /dev/null
+++ b/.github/workflows/pypi.yaml
@@ -0,0 +1,27 @@
+name: PyPI
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      id-token: write  # this permission is required for PyPI "trusted publishing"
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install build
+
+      - name: Build
+        run: python -m build
+
+      - name: Publish
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5affc21
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/.idea/
+__pycache__/
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..f3d1e69
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/psf/black
+    # this version is synced with the black version mentioned in .github/workflows/ci.yaml
+    rev: 23.10.0
+    hooks:
+      - id: black
+        entry: bash -c 'black "$@"; git add -u' --
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..98c6c6b
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,437 @@
+# Below is a copy of Google's pylintrc, with the following modifications:
+#  - indent-string changed to 4 spaces (the shipped version of this config
+#    file has 2 because that's what Google uses internally, despite the
+#    public description of their style guide using 4)
+#  - max-line-length changed to 120 because 80 was driving us crazy
+#  - wrong-import-order re-enabled, because MT likes order among chaos
+#
+# BELOW THIS LINE IS A COPY OF https://google.github.io/styleguide/pylintrc
+
+# This Pylint rcfile contains a best-effort configuration to uphold the
+# best-practices and style described in the Google Python style guide:
+#   https://google.github.io/styleguide/pyguide.html
+#
+# Its canonical open-source location is:
+#   https://google.github.io/styleguide/pylintrc
+
+[MASTER]
+
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=third_party
+
+# Files or directories matching the regex patterns are skipped. The regex
+# matches against base names, not paths.
+ignore-patterns=
+
+# Pickle collected data for later comparisons.
+persistent=no
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Use multiple processes to speed up Pylint.
+jobs=4
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+#enable=
+
+# Disable the message, report, category or checker with the given id(s).
You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" +disable=abstract-method, + apply-builtin, + arguments-differ, + attribute-defined-outside-init, + backtick, + bad-option-value, + basestring-builtin, + buffer-builtin, + c-extension-no-member, + consider-using-enumerate, + cmp-builtin, + cmp-method, + coerce-builtin, + coerce-method, + delslice-method, + div-method, + duplicate-code, + eq-without-hash, + execfile-builtin, + file-builtin, + filter-builtin-not-iterating, + fixme, + getslice-method, + global-statement, + hex-method, + idiv-method, + implicit-str-concat, + import-error, + import-self, + import-star-module-level, + inconsistent-return-statements, + input-builtin, + intern-builtin, + invalid-str-codec, + locally-disabled, + long-builtin, + long-suffix, + map-builtin-not-iterating, + misplaced-comparison-constant, + missing-function-docstring, + metaclass-assignment, + next-method-called, + next-method-defined, + no-absolute-import, + no-else-break, + no-else-continue, + no-else-raise, + no-else-return, + no-init, # added + no-member, + no-name-in-module, + no-self-use, + nonzero-method, + oct-method, + old-division, + old-ne-operator, + old-octal-literal, + old-raise-syntax, + parameter-unpacking, + print-statement, + raising-string, + range-builtin-not-iterating, + raw_input-builtin, + rdiv-method, + reduce-builtin, + relative-import, + reload-builtin, + round-builtin, + setslice-method, + signature-differs, + standarderror-builtin, + suppressed-message, + sys-max-int, + too-few-public-methods, + too-many-ancestors, + too-many-arguments, + too-many-boolean-expressions, + too-many-branches, + too-many-instance-attributes, + too-many-locals, + too-many-nested-blocks, + too-many-public-methods, + too-many-return-statements, + too-many-statements, + trailing-newlines, + unichr-builtin, + unicode-builtin, + unnecessary-pass, + unpacking-in-except, + useless-else-on-loop, + useless-object-inheritance, + useless-suppression, + using-cmp-argument, + xrange-builtin, + zip-builtin-not-iterating, + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages +reports=no + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. 
See doc for all details
+#msg-template=
+
+
+[BASIC]
+
+# Good variable names which should always be accepted, separated by a comma
+good-names=main,_
+
+# Bad variable names which should always be refused, separated by a comma
+bad-names=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Include a hint for the correct naming format with invalid-name
+include-naming-hint=no
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
+
+# Regular expression matching correct function names
+function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
+
+# Regular expression matching correct variable names
+variable-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct constant names
+const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
+
+# Regular expression matching correct attribute names
+attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
+
+# Regular expression matching correct argument names
+argument-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct class attribute names
+class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
+
+# Regular expression matching correct inline iteration names
+inlinevar-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct class names
+class-rgx=^_?[A-Z][a-zA-Z0-9]*$
+
+# Regular expression matching correct module names
+module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
+
+# Regular expression matching correct method names
+method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=10
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis. It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+
+[FORMAT]
+
+# Maximum number of characters on a single line.
+max-line-length=120
+
+# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
+# lines made too long by directives to pytype.
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=(?x)(
+  ^\s*(\#\ )?<?https?://\S+>?$|
+  ^\s*(from\s+\S+\s+)?import\s+.+$)
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=yes
+
+# Maximum number of lines in a module
+max-module-lines=99999
+
+# String used as indentation unit. The internal Google style guide mandates 2
+# spaces. Google's externaly-published style guide says 4, consistent with
+# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
+# projects (like TensorFlow).
+indent-string='    '
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=TODO
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=yes
+
+
+[VARIABLES]
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid to define new builtins when possible.
+additional-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
+
+
+[LOGGING]
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging,absl.logging,tensorflow.io.logging
+
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+
+[SPELLING]
+
+# Spelling dictionary name. Available dictionaries: none. To make it working
+# install python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[IMPORTS]
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,
+                   TERMIOS,
+                   Bastion,
+                   rexec,
+                   sets
+
+# Create a graph of every (i.e.
internal and external) dependencies in the +# given file (report RP0402 must not be disabled) +import-graph= + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled) +ext-import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled) +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant, absl + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls, + class_ + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "Exception" +overgeneral-exceptions=builtins.StandardError, + builtins.Exception, + builtins.BaseException diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..058e488 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,12 @@ +# Contributing to Cumulus FHIR Support + +## Set up your dev environment + +To use the same dev environment as us, you'll want to run these commands: +```sh +pip install .[dev] +pre-commit install +``` + +This will install dependencies & build tools, +as well as set up a `black` auto-formatter commit hook. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6bcfc58 --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +# Cumulus FHIR Support + +This library holds FHIR support code for the Cumulus project as a whole. + +## Installing + +```shell +pip install cumulus-fhir-support +``` + +## Examples + +### pyarrow_schema_from_rows + +```python3 +import cumulus_fhir_support + +rows = [ + { + "resourceType": "Patient", + "id": "1", + "extension": [{ + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "extension": [{ + "url": "ombCategory", + "valueCoding": { + "code": "2135-2", + "display": "Hispanic or Latino", + "system": "urn:oid:2.16.840.1.113883.6.238", + } + }], + }] + }, +] + +# The resulting schema will be both wide (every toplevel column) +# and deep enough for every field in `rows`. +# That is, both the non-present toplevel field "telecom" and the deeper +# field "extension.extension.valueCoding.system" will be in the schema. 
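+# (For illustration: the `rows` argument is optional. Calling
+# pyarrow_schema_from_rows("Patient") with no rows still returns the wide,
+# one-level-deep schema straight from the FHIR spec.)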
+schema = cumulus_fhir_support.pyarrow_schema_from_rows("Patient", rows)
+```
diff --git a/cumulus_fhir_support/__init__.py b/cumulus_fhir_support/__init__.py
new file mode 100644
index 0000000..ad9c09c
--- /dev/null
+++ b/cumulus_fhir_support/__init__.py
@@ -0,0 +1,5 @@
+"""FHIR support code for the Cumulus project"""
+
+__version__ = "1.0.0"
+
+from .schemas import pyarrow_schema_from_rows
diff --git a/cumulus_fhir_support/schemas.py b/cumulus_fhir_support/schemas.py
new file mode 100644
index 0000000..3173f21
--- /dev/null
+++ b/cumulus_fhir_support/schemas.py
@@ -0,0 +1,193 @@
+"""Detect FHIR resource schemas"""
+
+from collections import namedtuple
+from functools import partial
+from typing import Any, Iterable, Optional
+
+import pyarrow
+from fhirclient.models import (
+    codeableconcept,
+    coding,
+    extension,
+    fhirabstractbase,
+    fhirdate,
+    fhirelementfactory,
+)
+
+
+FhirProperty = namedtuple(
+    "FhirProperty", ["name", "json_name", "pytype", "is_list", "of_many", "required"]
+)
+
+# We include one level of the FHIR spec in our schema, regardless of what's in the source data.
+# This is to help downstream SQL by at least making sure each column is in the schema.
+LEVEL_INCLUSION = 1
+
+
+def pyarrow_schema_from_rows(
+    resource_type: str, rows: Optional[Iterable[dict]] = None
+) -> pyarrow.Schema:
+    """
+    Creates a PyArrow schema based on the named resource (like 'Observation') and row contents.
+
+    Note that this schema will not be deep (fully nested all the way down),
+    it will simply be wide (covering each toplevel field, each likely nullable).
+    But it *will* at least include every field contained in the batch.
+
+    Non-FHIR-spec fields will not be present in the final schema.
+    All fields will be marked nullable.
+
+    :param resource_type: the FHIR resource name to create a schema for
+    :param rows: optionally a set of JSON FHIR resources to ensure are covered by the schema
+    :returns: a PyArrow schema that covers the unified shape of all provided rows
+    """
+    # Examine batch to see the full shape of it, in order to detect any deeply nested fields
+    # that we want to make sure to include in the final schema (normally, we go wide but only as
+    # deep as we need to)
+    batch_shape = _get_shape_of_dicts(None, rows and list(rows))
+
+    return _create_pyarrow_schema_for_resource(resource_type, batch_shape)
+
+
+def _get_shape_of_dicts(total_shape: Optional[dict], item: Any) -> dict:
+    """
+    Examines `item` and gives a description of its "shape".
+
+    Shape here means a dictionary tree of fields, like {"id": {}, "code": {"text": {}}}
+    where empty dictionaries indicate no further children.
+
+    This is not a generic concept at all - it's purely to aid with creating a schema for a batch
+    of input rows. This shape will tell us which FHIR fields to include in our schema.
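+    Shapes from multiple rows are merged field by field, so the final shape is
+    the union of every row's fields.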
+
+    Example Input:
+    {"address": [{"street": "123 Main St", "city": "Springfield"}], "name": "Jane Smith"}
+
+    Example output:
+    {"address": {"street": {}, "city": {}}, "name": {}}
+
+    :param total_shape: a pre-existing shape that we will merge fields into
+    :param item: the current item being examined
+    :returns: a shape for this item and its descendants (will be same dict as total_shape)
+    """
+    total_shape = total_shape or {}
+
+    if isinstance(item, list):
+        for x in item:
+            total_shape = _get_shape_of_dicts(total_shape, x)
+    elif isinstance(item, dict):
+        for key, val in item.items():
+            total_shape[key] = _get_shape_of_dicts(total_shape.get(key), val)
+
+    return total_shape
+
+
+def _create_pyarrow_schema_for_resource(resource_type: str, batch_shape: dict) -> pyarrow.Schema:
+    """
+    Creates a PyArrow schema based on the named resource (like 'Observation').
+
+    This schema will be as wide as the spec is and as deep as the batch_shape is.
+
+    batch_shape is a dictionary tree of fields to include, like {"id": {}, "code": {"text": {}}}
+    where empty dictionaries indicate no children (but the parent should still be included).
+    """
+    instance = fhirelementfactory.FHIRElementFactory.instantiate(resource_type, None)
+
+    # fhirclient doesn't include `resourceType` in the list of properties. So do that manually.
+    type_field = pyarrow.field("resourceType", pyarrow.string())
+
+    return pyarrow.schema(
+        [type_field, *_fhir_obj_to_pyarrow_fields(instance, batch_shape, level=0)]
+    )
+
+
+def _fhir_obj_to_pyarrow_fields(
+    base_obj: fhirabstractbase.FHIRAbstractBase, batch_shape: dict, *, level: int
+) -> list[pyarrow.Field]:
+    """Convert a FHIR instance to a PyArrow Field schema list"""
+    properties = map(FhirProperty._make, base_obj.elementProperties())
+    return list(
+        filter(
+            None,
+            map(
+                partial(
+                    _fhir_to_pyarrow_property,
+                    base_obj=base_obj,
+                    batch_shape=batch_shape,
+                    level=level,
+                ),
+                properties,
+            ),
+        )
+    )
+
+
+def _fhir_to_pyarrow_property(
+    prop: FhirProperty,
+    *,
+    base_obj: fhirabstractbase.FHIRAbstractBase,
+    batch_shape: Optional[dict] = None,
+    level: int,
+) -> Optional[pyarrow.Field]:
+    """Converts a single FhirProperty to a PyArrow Field, or None if this field should be skipped"""
+    if batch_shape is not None:
+        batch_shape = batch_shape.get(prop.json_name)
+
+    # If we see a piece of a Concept or Coding, we like to grab the full schema for it.
+    # This helps downstream SQL avoid dealing with incomplete Coding fields - which do appear.
+    full_schema_types = (codeableconcept.CodeableConcept, coding.Coding)
+    is_inside_full_schema_type = isinstance(base_obj, full_schema_types)
+    is_extension_type = issubclass(prop.pytype, extension.Extension)
+    force_inclusion = is_inside_full_schema_type and not is_extension_type
+
+    # OK how do we handle this field? Include or exclude - descend or not?
+    present_in_shape = batch_shape is not None
+    include_in_schema = present_in_shape or force_inclusion
+    is_struct = issubclass(prop.pytype, fhirabstractbase.FHIRAbstractBase)
+
+    if is_struct:
+        if level >= LEVEL_INCLUSION and not include_in_schema:
+            # Skip this element entirely and do not descend, to avoid infinite recursion.
+            # Note that in theory this might leave a struct with no child fields
+            # (if a struct's only children were also structs),
+            # which parquet/spark would have an issue with because they won't allow empty structs.
+            # But in practice with FHIR, all BackboneElements have at least an id (string) field,
+            # so we dodge that bullet.
+            return None
+        # Recurse!
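+        # (Descend one level: children deeper than LEVEL_INCLUSION survive only
+        # if the batch shape, or the Coding/CodeableConcept carve-out above,
+        # asks for them.)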
+        pyarrow_type = pyarrow.struct(
+            _fhir_obj_to_pyarrow_fields(prop.pytype(), batch_shape, level=level + 1)
+        )
+    else:
+        if level > LEVEL_INCLUSION and not include_in_schema:
+            # If we're deeper than our inclusion level,
+            # bail if we don't actually see the field in the data
+            return None
+        pyarrow_type = _basic_fhir_to_pyarrow_type(prop.pytype)
+
+    # Wrap lists in a ListType
+    if prop.is_list:
+        pyarrow_type = pyarrow.list_(pyarrow_type)
+
+    # Mark all types as nullable; don't worry about the prop.required field.
+    # We don't need to be in the business of validation, we just want to provide a schema.
+    return pyarrow.field(prop.json_name, pyarrow_type, nullable=True)
+
+
+def _basic_fhir_to_pyarrow_type(pytype: type) -> pyarrow.DataType:
+    """Converts a basic Python type to a PyArrow type"""
+    if pytype is int:
+        return pyarrow.int32()
+    elif pytype is float:
+        # TODO: the FHIR spec suggests that float64 might not even be enough:
+        # From https://www.hl7.org/fhir/R4/datatypes.html:
+        # "In object code, implementations that might meet this constraint are GMP implementations
+        # or equivalents to Java BigDecimal that implement arbitrary precision, or a combination
+        # of a (64 bit) floating point value with a precision field"
+        # But for now, we are matching the inferred types from before we used a pre-calculated
+        # schema. We can presumably up-scale this at some point if we find limitations.
+        return pyarrow.float64()
+    elif pytype is str:
+        return pyarrow.string()
+    elif pytype is bool:
+        return pyarrow.bool_()
+    elif pytype is fhirdate.FHIRDate:
+        return pyarrow.string()  # just leave it as a string, like it appears in the JSON
+    raise ValueError(f"Unexpected type: {pytype}")
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..01ffc36
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,49 @@
+[project]
+name = "cumulus-fhir-support"
+requires-python = ">= 3.9"
+dependencies = [
+    "fhirclient >= 4.1",
+    "pyarrow >= 12",
+]
+authors = [
+    { name="Michael Terry", email="michael.terry@childrens.harvard.edu" },
+]
+description = "FHIR schema support code for the Cumulus project"
+readme = "README.md"
+license = { text="Apache License 2.0" }
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dynamic = ["version"]
+
+[project.urls]
+"Homepage" = "https://github.com/smart-on-fhir/cumulus-fhir-support"
+
+[build-system]
+requires = ["flit_core >=3.4,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.sdist]
+include = [
+    "tests/",
+    "LICENSE",
+    "*.md",
+]
+
+[tool.bandit]
+exclude_dirs = ["tests"]
+
+[tool.black]
+line-length = 100
+
+[project.optional-dependencies]
+tests = [
+    "pytest",
+]
+dev = [
+    "black >= 23, < 24",
+    "pre-commit",
+]
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_schemas.py b/tests/test_schemas.py
new file mode 100644
index 0000000..c78afba
--- /dev/null
+++ b/tests/test_schemas.py
@@ -0,0 +1,183 @@
+"""Tests for schemas.py"""
+
+import unittest
+
+import pyarrow
+
+import cumulus_fhir_support as support
+
+
+class SchemaDetectionTests(unittest.TestCase):
+    """Test case for schema detection"""
+
+    def test_makes_wide_schema(self):
+        """Verify we write out a wide schema even when presented with nothing"""
+        schema = support.pyarrow_schema_from_rows("Patient")
+
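+        # The expected names below mirror fhirclient's elementProperties()
+        # ordering for Patient, plus "resourceType", which schemas.py prepends.
+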
self.assertListEqual( + [ + "resourceType", + "id", + "implicitRules", + "language", + "meta", + "contained", + "extension", + "modifierExtension", + "text", + "active", + "address", + "birthDate", + "communication", + "contact", + "deceasedBoolean", + "deceasedDateTime", + "gender", + "generalPractitioner", + "identifier", + "link", + "managingOrganization", + "maritalStatus", + "multipleBirthBoolean", + "multipleBirthInteger", + "name", + "photo", + "telecom", + ], + schema.names, + ) + + # Spot check a few of the types + self.assertEqual(pyarrow.string(), schema.field("id").type) + self.assertEqual(pyarrow.bool_(), schema.field("deceasedBoolean").type) + self.assertEqual(pyarrow.int32(), schema.field("multipleBirthInteger").type) + # Note how struct types only have basic types inside of them - this is intentional, + # no recursion of structs is done + self.assertEqual( + pyarrow.struct( + {"id": pyarrow.string(), "div": pyarrow.string(), "status": pyarrow.string()} + ), + schema.field("text").type, + ) + self.assertEqual( + pyarrow.list_(pyarrow.struct({"id": pyarrow.string(), "preferred": pyarrow.bool_()})), + schema.field("communication").type, + ) + + def test_detected_fields_are_included_and_expanded(self): + """Verify that deep (detected) fields are also included, with Coding expansion""" + # Make sure that we include different deep fields for each - final schema should be a union + rows = [ + {"stage": [{"type": {"coding": [{"version": "1.0"}]}}]}, + {"onsetRange": {"low": {"value": 1.0}}}, + ] + schema = support.pyarrow_schema_from_rows("Condition", rows) + + # Start with simple, non-present CodeableConcept at level zero. + # This should be fully described. + self.assertEqual( + pyarrow.struct( + { + "id": pyarrow.string(), + "coding": pyarrow.list_( + pyarrow.struct( + { + "id": pyarrow.string(), + "code": pyarrow.string(), + "display": pyarrow.string(), + "system": pyarrow.string(), + "userSelected": pyarrow.bool_(), + "version": pyarrow.string(), + } + ) + ), + "text": pyarrow.string(), + } + ), + schema.field("code").type, # CodeableConcept type + ) + # While a deeper non-present CodeableConcept should be ignored + self.assertEqual( + pyarrow.list_( + pyarrow.struct( + { + "id": pyarrow.string(), + # "code" field is missing (CodeableConcept type) + # "detail" field is missing (Reference type) + } + ) + ), + schema.field("evidence").type, # BackboneElement type + ) + # But if any piece of a deep CodeableConcept is present, it gets fully expanded. + self.assertEqual( + pyarrow.list_( + pyarrow.struct( + { + "id": pyarrow.string(), + # "assessment" field is missing (Reference type) + # "summary" field is missing (CodeableConcept type) + # But the "type" is here in full because a piece of it was in the input + "type": pyarrow.struct( + { + "id": pyarrow.string(), + "coding": pyarrow.list_( + pyarrow.struct( + { + "id": pyarrow.string(), + "code": pyarrow.string(), + "display": pyarrow.string(), + "system": pyarrow.string(), + "userSelected": pyarrow.bool_(), + "version": pyarrow.string(), + } + ) + ), + "text": pyarrow.string(), + } + ), + } + ) + ), + schema.field("stage").type, # BackboneElement type + ) + # Other deep-and-partial elements do not get the same expansion treatment. + # Here is a deep Quantity element. + # The parts present in the input are also in the schema, but only those parts. 
+        self.assertEqual(
+            pyarrow.struct(
+                {
+                    "id": pyarrow.string(),
+                    "low": pyarrow.struct(
+                        {
+                            "value": pyarrow.float64(),
+                        }
+                    ),
+                }
+            ),
+            schema.field("onsetRange").type,
+        )
+
+    def test_schema_types_are_coerced(self):
+        """Verify that fields with "wrong" input types (like int instead of float) are corrected"""
+        # Make sure that we include both wide and deep fields.
+        # Both should be coerced into floats.
+        rows = [
+            {"quantityQuantity": {"value": 1}},
+            {"quantityRange": {"low": {"value": 2}}},
+        ]
+        schema = support.pyarrow_schema_from_rows("ServiceRequest", rows)
+
+        self.assertEqual(
+            pyarrow.float64(), schema.field("quantityQuantity").type.field("value").type
+        )
+        self.assertEqual(
+            pyarrow.float64(),
+            schema.field("quantityRange").type.field("low").type.field("value").type,
+        )
+
+    def test_non_spec_fields_are_ignored(self):
+        """Verify that a field not in the FHIR spec is handled gracefully"""
+        rows = [{"invalid_field": "nope"}]
+        schema = support.pyarrow_schema_from_rows("Observation", rows)
+
+        self.assertNotIn("invalid_field", schema.names)
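A closing usage sketch (illustrative, not part of the patch itself): the detected schema can be handed straight to PyArrow, which is where the coercion checked by `test_schema_types_are_coerced` pays off. `pyarrow.Table.from_pylist` is standard PyArrow API; everything else here comes from this patch.

```python
import pyarrow
import cumulus_fhir_support

# An int where the FHIR spec says decimal (Quantity.value):
rows = [{"resourceType": "ServiceRequest", "id": "1", "quantityQuantity": {"value": 1}}]
schema = cumulus_fhir_support.pyarrow_schema_from_rows("ServiceRequest", rows)

# The schema marks "value" as float64, so PyArrow upcasts the int while loading;
# fields in the schema but missing from the rows simply come out as nulls.
table = pyarrow.Table.from_pylist(rows, schema=schema)
print(table.column("quantityQuantity"))  # "value" comes out as 1.0, not 1
```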