From 2101e809144d4150b64b1e680e73c05788dd241d Mon Sep 17 00:00:00 2001
From: Michael Terry <michael.terry@childrens.harvard.edu>
Date: Wed, 22 Nov 2023 08:53:54 -0500
Subject: [PATCH] Initial skeleton project

---
 .github/workflows/ci.yaml        |  69 +++++
 .github/workflows/pypi.yaml      |  27 ++
 .gitignore                       |   2 +
 .pre-commit-config.yaml          |   7 +
 .pylintrc                        | 437 +++++++++++++++++++++++++++++++
 CONTRIBUTING.md                  |  12 +
 README.md                        |  41 +++
 cumulus_fhir_support/__init__.py |   5 +
 cumulus_fhir_support/schemas.py  | 193 ++++++++++++++
 pyproject.toml                   |  49 ++++
 tests/__init__.py                |   0
 tests/test_schemas.py            | 183 +++++++++++++
 12 files changed, 1025 insertions(+)
 create mode 100644 .github/workflows/ci.yaml
 create mode 100644 .github/workflows/pypi.yaml
 create mode 100644 .gitignore
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 .pylintrc
 create mode 100644 CONTRIBUTING.md
 create mode 100644 README.md
 create mode 100644 cumulus_fhir_support/__init__.py
 create mode 100644 cumulus_fhir_support/schemas.py
 create mode 100644 pyproject.toml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_schemas.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..170bd10
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,69 @@
+name: CI
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+# The goal here is to cancel older workflows when a PR is updated (because it's pointless work)
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
+  cancel-in-progress: true
+
+jobs:
+  unittest:
+    name: unit tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install ".[tests]"
+
+      - name: Test with pytest
+        run: |
+          python -m pytest
+
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install linters
+        # black is synced with the version pinned in .pre-commit-config.yaml
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install .[dev] bandit[toml] pycodestyle pylint
+
+      - name: Run pycodestyle
+        # E203: pycodestyle is a little too rigid about slices & whitespace
+        # See https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#slices
+        # W503: a default ignore that we are restoring
+        run: |
+          pycodestyle --max-line-length=100 --ignore=E203,W503 .
+
+      - name: Run pylint
+        if: success() || failure()  # still run pylint if above checks fail
+        run: |
+          pylint cumulus_fhir_support tests
+
+      - name: Run bandit
+        if: success() || failure()  # still run bandit if above checks fail
+        run: |
+          bandit -c pyproject.toml -r .
+
+      - name: Run black
+        if: success() || failure()  # still run black if above checks fail
+        run: |
+          black --check --verbose .
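A quick illustration of the E203 exemption above (a sketch, not from this patch itself): black formats slices whose bounds are compound expressions with whitespace around the colon, which pycodestyle reports as E203 even though that is exactly the style black enforces.

```python
# After black formatting, pycodestyle would flag "E203 whitespace before ':'"
# on the slice below -- hence the --ignore=E203 in the workflow above.
ham = list(range(10))
lower, offset = 2, 3
chunk = ham[lower + offset : lower + offset + 2]  # E203 fires here, harmlessly
```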
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
new file mode 100644
index 0000000..42dcac1
--- /dev/null
+++ b/.github/workflows/pypi.yaml
@@ -0,0 +1,27 @@
+name: PyPI
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      id-token: write  # this permission is required for PyPI "trusted publishing"
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install build
+
+      - name: Build
+        run: python -m build
+
+      - name: Publish
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5affc21
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/.idea/
+__pycache__/
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..f3d1e69
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/psf/black
+    # this version is synced with the black version mentioned in .github/workflows/ci.yaml
+    rev: 23.10.0
+    hooks:
+      - id: black
+        entry: bash -c 'black "$@"; git add -u' --
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..98c6c6b
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,437 @@
+# Below is a copy of Google's pylintrc, with the following modifications:
+#  - indent-string changed to 4 spaces (the shipped version of this config
+#    file has 2 because that's what Google uses internally, despite the
+#    public description of their style guide using 4)
+#  - max-line-length changed to 120 because 80 was driving us crazy
+#  - wrong-import-order re-enabled, because MT likes order among chaos
+#
+# BELOW THIS LINE IS A COPY OF https://google.github.io/styleguide/pylintrc
+
+# This Pylint rcfile contains a best-effort configuration to uphold the
+# best-practices and style described in the Google Python style guide:
+#   https://google.github.io/styleguide/pyguide.html
+#
+# Its canonical open-source location is:
+#   https://google.github.io/styleguide/pylintrc
+
+[MASTER]
+
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=third_party
+
+# Files or directories matching the regex patterns are skipped. The regex
+# matches against base names, not paths.
+ignore-patterns=
+
+# Pickle collected data for later comparisons.
+persistent=no
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Use multiple processes to speed up Pylint.
+jobs=4
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+#enable=
+
+# Disable the message, report, category or checker with the given id(s).
You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" +disable=abstract-method, + apply-builtin, + arguments-differ, + attribute-defined-outside-init, + backtick, + bad-option-value, + basestring-builtin, + buffer-builtin, + c-extension-no-member, + consider-using-enumerate, + cmp-builtin, + cmp-method, + coerce-builtin, + coerce-method, + delslice-method, + div-method, + duplicate-code, + eq-without-hash, + execfile-builtin, + file-builtin, + filter-builtin-not-iterating, + fixme, + getslice-method, + global-statement, + hex-method, + idiv-method, + implicit-str-concat, + import-error, + import-self, + import-star-module-level, + inconsistent-return-statements, + input-builtin, + intern-builtin, + invalid-str-codec, + locally-disabled, + long-builtin, + long-suffix, + map-builtin-not-iterating, + misplaced-comparison-constant, + missing-function-docstring, + metaclass-assignment, + next-method-called, + next-method-defined, + no-absolute-import, + no-else-break, + no-else-continue, + no-else-raise, + no-else-return, + no-init, # added + no-member, + no-name-in-module, + no-self-use, + nonzero-method, + oct-method, + old-division, + old-ne-operator, + old-octal-literal, + old-raise-syntax, + parameter-unpacking, + print-statement, + raising-string, + range-builtin-not-iterating, + raw_input-builtin, + rdiv-method, + reduce-builtin, + relative-import, + reload-builtin, + round-builtin, + setslice-method, + signature-differs, + standarderror-builtin, + suppressed-message, + sys-max-int, + too-few-public-methods, + too-many-ancestors, + too-many-arguments, + too-many-boolean-expressions, + too-many-branches, + too-many-instance-attributes, + too-many-locals, + too-many-nested-blocks, + too-many-public-methods, + too-many-return-statements, + too-many-statements, + trailing-newlines, + unichr-builtin, + unicode-builtin, + unnecessary-pass, + unpacking-in-except, + useless-else-on-loop, + useless-object-inheritance, + useless-suppression, + using-cmp-argument, + xrange-builtin, + zip-builtin-not-iterating, + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages +reports=no + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. 
See doc for all details
+#msg-template=
+
+
+[BASIC]
+
+# Good variable names which should always be accepted, separated by a comma
+good-names=main,_
+
+# Bad variable names which should always be refused, separated by a comma
+bad-names=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Include a hint for the correct naming format with invalid-name
+include-naming-hint=no
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
+
+# Regular expression matching correct function names
+function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
+
+# Regular expression matching correct variable names
+variable-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct constant names
+const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
+
+# Regular expression matching correct attribute names
+attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
+
+# Regular expression matching correct argument names
+argument-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct class attribute names
+class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
+
+# Regular expression matching correct inline iteration names
+inlinevar-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct class names
+class-rgx=^_?[A-Z][a-zA-Z0-9]*$
+
+# Regular expression matching correct module names
+module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
+
+# Regular expression matching correct method names
+method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=10
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis. It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+
+[FORMAT]
+
+# Maximum number of characters on a single line.
+max-line-length=120
+
+# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
+# lines made too long by directives to pytype.
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=(?x)(
+  ^\s*(\#\ )?<?https?://\S+>?$|
+  ^\s*(from\s+\S+\s+)?import\s+.+$)
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=yes
+
+# Maximum number of lines in a module
+max-module-lines=99999
+
+# String used as indentation unit. The internal Google style guide mandates 2
+# spaces. Google's externaly-published style guide says 4, consistent with
+# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
+# projects (like TensorFlow).
+indent-string='    '
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=TODO
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=yes
+
+
+[VARIABLES]
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid to define new builtins when possible.
+additional-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
+
+
+[LOGGING]
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging,absl.logging,tensorflow.io.logging
+
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+
+[SPELLING]
+
+# Spelling dictionary name. Available dictionaries: none. To make it working
+# install python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[IMPORTS]
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,
+                   TERMIOS,
+                   Bastion,
+                   rexec,
+                   sets
+
+# Create a graph of every (i.e.
internal and external) dependencies in the +# given file (report RP0402 must not be disabled) +import-graph= + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled) +ext-import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled) +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant, absl + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls, + class_ + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "Exception" +overgeneral-exceptions=builtins.StandardError, + builtins.Exception, + builtins.BaseException diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..058e488 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,12 @@ +# Contributing to Cumulus FHIR Support + +## Set up your dev environment + +To use the same dev environment as us, you'll want to run these commands: +```sh +pip install .[dev] +pre-commit install +``` + +This will install dependencies & build tools, +as well as set up a `black` auto-formatter commit hook. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6bcfc58 --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +# Cumulus FHIR Support + +This library holds FHIR support code for the Cumulus project as a whole. + +## Installing + +```shell +pip install cumulus-fhir-support +``` + +## Examples + +### pyarrow_schema_from_rows + +```python3 +import cumulus_fhir_support + +rows = [ + { + "resourceType": "Patient", + "id": "1", + "extension": [{ + "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", + "extension": [{ + "url": "ombCategory", + "valueCoding": { + "code": "2135-2", + "display": "Hispanic or Latino", + "system": "urn:oid:2.16.840.1.113883.6.238", + } + }], + }] + }, +] + +# The resulting schema will be both wide (every toplevel column) +# and deep enough for every field in `rows`. +# That is, both the non-present toplevel field "telecom" and the deeper +# field "extension.extension.valueCoding.system" will be in the schema. 
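+# (For illustration: the `rows` argument is optional. Calling
+# pyarrow_schema_from_rows("Patient") with no rows still returns the wide,
+# one-level-deep schema straight from the FHIR spec.)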
+schema = cumulus_fhir_support.pyarrow_schema_from_rows("Patient", rows)
+```
diff --git a/cumulus_fhir_support/__init__.py b/cumulus_fhir_support/__init__.py
new file mode 100644
index 0000000..ad9c09c
--- /dev/null
+++ b/cumulus_fhir_support/__init__.py
@@ -0,0 +1,5 @@
+"""FHIR support code for the Cumulus project"""
+
+__version__ = "1.0.0"
+
+from .schemas import pyarrow_schema_from_rows
diff --git a/cumulus_fhir_support/schemas.py b/cumulus_fhir_support/schemas.py
new file mode 100644
index 0000000..3173f21
--- /dev/null
+++ b/cumulus_fhir_support/schemas.py
@@ -0,0 +1,193 @@
+"""Detect FHIR resource schemas"""
+
+from collections import namedtuple
+from functools import partial
+from typing import Any, Iterable, Optional
+
+import pyarrow
+from fhirclient.models import (
+    codeableconcept,
+    coding,
+    extension,
+    fhirabstractbase,
+    fhirdate,
+    fhirelementfactory,
+)
+
+
+FhirProperty = namedtuple(
+    "FhirProperty", ["name", "json_name", "pytype", "is_list", "of_many", "required"]
+)
+
+# We include one level of the FHIR spec in our schema, regardless of what's in the source data.
+# This is to help downstream SQL by at least making sure each column is in the schema.
+LEVEL_INCLUSION = 1
+
+
+def pyarrow_schema_from_rows(
+    resource_type: str, rows: Optional[Iterable[dict]] = None
+) -> pyarrow.Schema:
+    """
+    Creates a PyArrow schema based on the named resource (like 'Observation') and row contents.
+
+    Note that this schema will not be deep (fully nested all the way down),
+    it will simply be wide (covering each toplevel field, each likely nullable).
+    But it *will* at least include every field contained in the batch.
+
+    Non-FHIR-spec fields will not be present in the final schema.
+    All fields will be marked nullable.
+
+    :param resource_type: the FHIR resource name to create a schema for
+    :param rows: optionally a set of JSON FHIR resources to ensure are covered by the schema
+    :returns: a PyArrow schema that covers the unified shape of all provided rows
+    """
+    # Examine batch to see the full shape of it, in order to detect any deeply nested fields
+    # that we want to make sure to include in the final schema (normally, we go wide but only as
+    # deep as we need to)
+    batch_shape = _get_shape_of_dicts(None, rows and list(rows))
+
+    return _create_pyarrow_schema_for_resource(resource_type, batch_shape)
+
+
+def _get_shape_of_dicts(total_shape: Optional[dict], item: Any) -> dict:
+    """
+    Examines `item` and gives a description of its "shape".
+
+    Shape here means a dictionary tree of fields, like {"id": {}, "code": {"text": {}}}
+    where empty dictionaries indicate no further children.
+
+    This is not a generic concept at all - it's purely to aid with creating a schema for a batch
+    of input rows. This shape will tell us which FHIR fields to include in our schema.
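+    Shapes from multiple rows are merged field by field, so the final shape is
+    the union of every row's fields.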
+
+    Example Input:
+    {"address": [{"street": "123 Main St", "city": "Springfield"}], "name": "Jane Smith"}
+
+    Example output:
+    {"address": {"street": {}, "city": {}}, "name": {}}
+
+    :param total_shape: a pre-existing shape that we will merge fields into
+    :param item: the current item being examined
+    :returns: a shape for this item and its descendants (will be same dict as total_shape)
+    """
+    total_shape = total_shape or {}
+
+    if isinstance(item, list):
+        for x in item:
+            total_shape = _get_shape_of_dicts(total_shape, x)
+    elif isinstance(item, dict):
+        for key, val in item.items():
+            total_shape[key] = _get_shape_of_dicts(total_shape.get(key), val)
+
+    return total_shape
+
+
+def _create_pyarrow_schema_for_resource(resource_type: str, batch_shape: dict) -> pyarrow.Schema:
+    """
+    Creates a PyArrow schema based on the named resource (like 'Observation').
+
+    This schema will be as wide as the spec is and as deep as the batch_shape is.
+
+    batch_shape is a dictionary tree of fields to include, like {"id": {}, "code": {"text": {}}}
+    where empty dictionaries indicate no children (but the parent should still be included).
+    """
+    instance = fhirelementfactory.FHIRElementFactory.instantiate(resource_type, None)
+
+    # fhirclient doesn't include `resourceType` in the list of properties. So do that manually.
+    type_field = pyarrow.field("resourceType", pyarrow.string())
+
+    return pyarrow.schema(
+        [type_field, *_fhir_obj_to_pyarrow_fields(instance, batch_shape, level=0)]
+    )
+
+
+def _fhir_obj_to_pyarrow_fields(
+    base_obj: fhirabstractbase.FHIRAbstractBase, batch_shape: dict, *, level: int
+) -> list[pyarrow.Field]:
+    """Convert a FHIR instance to a PyArrow Field schema list"""
+    properties = map(FhirProperty._make, base_obj.elementProperties())
+    return list(
+        filter(
+            None,
+            map(
+                partial(
+                    _fhir_to_pyarrow_property,
+                    base_obj=base_obj,
+                    batch_shape=batch_shape,
+                    level=level,
+                ),
+                properties,
+            ),
+        )
+    )
+
+
+def _fhir_to_pyarrow_property(
+    prop: FhirProperty,
+    *,
+    base_obj: fhirabstractbase.FHIRAbstractBase,
+    batch_shape: Optional[dict] = None,
+    level: int,
+) -> Optional[pyarrow.Field]:
+    """Converts a single FhirProperty to a PyArrow Field, or None if this field should be skipped"""
+    if batch_shape is not None:
+        batch_shape = batch_shape.get(prop.json_name)
+
+    # If we see a piece of a Concept or Coding, we like to grab the full schema for it.
+    # This helps downstream SQL avoid dealing with incomplete Coding fields - which do appear.
+    full_schema_types = (codeableconcept.CodeableConcept, coding.Coding)
+    is_inside_full_schema_type = isinstance(base_obj, full_schema_types)
+    is_extension_type = issubclass(prop.pytype, extension.Extension)
+    force_inclusion = is_inside_full_schema_type and not is_extension_type
+
+    # OK how do we handle this field? Include or exclude - descend or not?
+    present_in_shape = batch_shape is not None
+    include_in_schema = present_in_shape or force_inclusion
+    is_struct = issubclass(prop.pytype, fhirabstractbase.FHIRAbstractBase)
+
+    if is_struct:
+        if level >= LEVEL_INCLUSION and not include_in_schema:
+            # Skip this element entirely and do not descend, to avoid infinite recursion.
+            # Note that in theory this might leave a struct with no child fields
+            # (if a struct's only children were also structs),
+            # which parquet/spark would have an issue with because they won't allow empty structs.
+            # But in practice with FHIR, all BackboneElements have at least an id (string) field,
+            # so we dodge that bullet.
+            return None
+        # Recurse!
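+        # (Descend one level: children deeper than LEVEL_INCLUSION survive only
+        # if the batch shape, or the Coding/CodeableConcept carve-out above,
+        # asks for them.)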
+        pyarrow_type = pyarrow.struct(
+            _fhir_obj_to_pyarrow_fields(prop.pytype(), batch_shape, level=level + 1)
+        )
+    else:
+        if level > LEVEL_INCLUSION and not include_in_schema:
+            # If we're deeper than our inclusion level,
+            # bail if we don't actually see the field in the data
+            return None
+        pyarrow_type = _basic_fhir_to_pyarrow_type(prop.pytype)
+
+    # Wrap lists in a ListType
+    if prop.is_list:
+        pyarrow_type = pyarrow.list_(pyarrow_type)
+
+    # Mark all types as nullable; don't worry about the prop.required field.
+    # We don't need to be in the business of validation, we just want to provide a schema.
+    return pyarrow.field(prop.json_name, pyarrow_type, nullable=True)
+
+
+def _basic_fhir_to_pyarrow_type(pytype: type) -> pyarrow.DataType:
+    """Converts a basic Python type to a PyArrow type"""
+    if pytype is int:
+        return pyarrow.int32()
+    elif pytype is float:
+        # TODO: the FHIR spec suggests that float64 might not even be enough:
+        # From https://www.hl7.org/fhir/R4/datatypes.html:
+        # "In object code, implementations that might meet this constraint are GMP implementations
+        # or equivalents to Java BigDecimal that implement arbitrary precision, or a combination
+        # of a (64 bit) floating point value with a precision field"
+        # But for now, we are matching the inferred types from before we used a pre-calculated
+        # schema. We can presumably up-scale this at some point if we find limitations.
+        return pyarrow.float64()
+    elif pytype is str:
+        return pyarrow.string()
+    elif pytype is bool:
+        return pyarrow.bool_()
+    elif pytype is fhirdate.FHIRDate:
+        return pyarrow.string()  # just leave it as a string, like it appears in the JSON
+    raise ValueError(f"Unexpected type: {pytype}")
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..01ffc36
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,49 @@
+[project]
+name = "cumulus-fhir-support"
+requires-python = ">= 3.9"
+dependencies = [
+    "fhirclient >= 4.1",
+    "pyarrow >= 12",
+]
+authors = [
+    { name="Michael Terry", email="michael.terry@childrens.harvard.edu" },
+]
+description = "FHIR schema support code for the Cumulus project"
+readme = "README.md"
+license = { text="Apache License 2.0" }
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dynamic = ["version"]
+
+[project.urls]
+"Homepage" = "https://github.com/smart-on-fhir/cumulus-fhir-support"
+
+[build-system]
+requires = ["flit_core >=3.4,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.sdist]
+include = [
+    "tests/",
+    "LICENSE",
+    "*.md",
+]
+
+[tool.bandit]
+exclude_dirs = ["tests"]
+
+[tool.black]
+line-length = 100
+
+[project.optional-dependencies]
+tests = [
+    "pytest",
+]
+dev = [
+    "black >= 23, < 24",
+    "pre-commit",
+]
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_schemas.py b/tests/test_schemas.py
new file mode 100644
index 0000000..c78afba
--- /dev/null
+++ b/tests/test_schemas.py
@@ -0,0 +1,183 @@
+"""Tests for schemas.py"""
+
+import unittest
+
+import pyarrow
+
+import cumulus_fhir_support as support
+
+
+class SchemaDetectionTests(unittest.TestCase):
+    """Test case for schema detection"""
+
+    def test_makes_wide_schema(self):
+        """Verify we write out a wide schema even when presented with nothing"""
+        schema = support.pyarrow_schema_from_rows("Patient")
+
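+        # The expected names below mirror fhirclient's elementProperties()
+        # ordering for Patient, plus "resourceType", which schemas.py prepends.
+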
self.assertListEqual( + [ + "resourceType", + "id", + "implicitRules", + "language", + "meta", + "contained", + "extension", + "modifierExtension", + "text", + "active", + "address", + "birthDate", + "communication", + "contact", + "deceasedBoolean", + "deceasedDateTime", + "gender", + "generalPractitioner", + "identifier", + "link", + "managingOrganization", + "maritalStatus", + "multipleBirthBoolean", + "multipleBirthInteger", + "name", + "photo", + "telecom", + ], + schema.names, + ) + + # Spot check a few of the types + self.assertEqual(pyarrow.string(), schema.field("id").type) + self.assertEqual(pyarrow.bool_(), schema.field("deceasedBoolean").type) + self.assertEqual(pyarrow.int32(), schema.field("multipleBirthInteger").type) + # Note how struct types only have basic types inside of them - this is intentional, + # no recursion of structs is done + self.assertEqual( + pyarrow.struct( + {"id": pyarrow.string(), "div": pyarrow.string(), "status": pyarrow.string()} + ), + schema.field("text").type, + ) + self.assertEqual( + pyarrow.list_(pyarrow.struct({"id": pyarrow.string(), "preferred": pyarrow.bool_()})), + schema.field("communication").type, + ) + + def test_detected_fields_are_included_and_expanded(self): + """Verify that deep (detected) fields are also included, with Coding expansion""" + # Make sure that we include different deep fields for each - final schema should be a union + rows = [ + {"stage": [{"type": {"coding": [{"version": "1.0"}]}}]}, + {"onsetRange": {"low": {"value": 1.0}}}, + ] + schema = support.pyarrow_schema_from_rows("Condition", rows) + + # Start with simple, non-present CodeableConcept at level zero. + # This should be fully described. + self.assertEqual( + pyarrow.struct( + { + "id": pyarrow.string(), + "coding": pyarrow.list_( + pyarrow.struct( + { + "id": pyarrow.string(), + "code": pyarrow.string(), + "display": pyarrow.string(), + "system": pyarrow.string(), + "userSelected": pyarrow.bool_(), + "version": pyarrow.string(), + } + ) + ), + "text": pyarrow.string(), + } + ), + schema.field("code").type, # CodeableConcept type + ) + # While a deeper non-present CodeableConcept should be ignored + self.assertEqual( + pyarrow.list_( + pyarrow.struct( + { + "id": pyarrow.string(), + # "code" field is missing (CodeableConcept type) + # "detail" field is missing (Reference type) + } + ) + ), + schema.field("evidence").type, # BackboneElement type + ) + # But if any piece of a deep CodeableConcept is present, it gets fully expanded. + self.assertEqual( + pyarrow.list_( + pyarrow.struct( + { + "id": pyarrow.string(), + # "assessment" field is missing (Reference type) + # "summary" field is missing (CodeableConcept type) + # But the "type" is here in full because a piece of it was in the input + "type": pyarrow.struct( + { + "id": pyarrow.string(), + "coding": pyarrow.list_( + pyarrow.struct( + { + "id": pyarrow.string(), + "code": pyarrow.string(), + "display": pyarrow.string(), + "system": pyarrow.string(), + "userSelected": pyarrow.bool_(), + "version": pyarrow.string(), + } + ) + ), + "text": pyarrow.string(), + } + ), + } + ) + ), + schema.field("stage").type, # BackboneElement type + ) + # Other deep-and-partial elements do not get the same expansion treatment. + # Here is a deep Quantity element. + # The parts present in the input are also in the schema, but only those parts. 
+        self.assertEqual(
+            pyarrow.struct(
+                {
+                    "id": pyarrow.string(),
+                    "low": pyarrow.struct(
+                        {
+                            "value": pyarrow.float64(),
+                        }
+                    ),
+                }
+            ),
+            schema.field("onsetRange").type,
+        )
+
+    def test_schema_types_are_coerced(self):
+        """Verify that fields with "wrong" input types (like int instead of float) are corrected"""
+        # Make sure that we include both wide and deep fields.
+        # Both should be coerced into floats.
+        rows = [
+            {"quantityQuantity": {"value": 1}},
+            {"quantityRange": {"low": {"value": 2}}},
+        ]
+        schema = support.pyarrow_schema_from_rows("ServiceRequest", rows)
+
+        self.assertEqual(
+            pyarrow.float64(), schema.field("quantityQuantity").type.field("value").type
+        )
+        self.assertEqual(
+            pyarrow.float64(),
+            schema.field("quantityRange").type.field("low").type.field("value").type,
+        )
+
+    def test_non_spec_fields_are_ignored(self):
+        """Verify that a field not in the FHIR spec is handled gracefully"""
+        rows = [{"invalid_field": "nope"}]
+        schema = support.pyarrow_schema_from_rows("Observation", rows)
+
+        self.assertNotIn("invalid_field", schema.names)
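A closing usage sketch (illustrative, not part of the patch itself): the detected schema can be handed straight to PyArrow, which is where the coercion checked by `test_schema_types_are_coerced` pays off. `pyarrow.Table.from_pylist` is standard PyArrow API; everything else here comes from this patch.

```python
import pyarrow
import cumulus_fhir_support

# An int where the FHIR spec says decimal (Quantity.value):
rows = [{"resourceType": "ServiceRequest", "id": "1", "quantityQuantity": {"value": 1}}]
schema = cumulus_fhir_support.pyarrow_schema_from_rows("ServiceRequest", rows)

# The schema marks "value" as float64, so PyArrow upcasts the int while loading;
# fields in the schema but missing from the rows simply come out as nulls.
table = pyarrow.Table.from_pylist(rows, schema=schema)
print(table.column("quantityQuantity"))  # "value" comes out as 1.0, not 1
```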