Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,15 @@ The h5t module
--------------

Partial implementation of some of the lower level h5py API, needed
to support enumerations and variable length strings.
to support enumerations, variable length strings, and opaque datatypes.

.. autofunction:: pyfive.h5t.check_enum_dtype

.. autofunction:: pyfive.h5t.check_string_dtype

.. autofunction:: pyfive.h5t.check_dtype

.. autofunction:: pyfive.h5t.check_opaque_dtype

.. autoclass:: pyfive.h5t.TypeEnumID

3 changes: 3 additions & 0 deletions doc/quickstart/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@ Getting started
Installation <installation>
Usage <usage>
Enumerations <enums>
Opaque Datasets <opaque>



25 changes: 25 additions & 0 deletions doc/quickstart/opaque.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Opaque Datasets
---------------

It is possible to create datasets with opaque datatypes in HDF5. These are
datasets where the data is stored as a sequence of bytes, with no
interpretation of those bytes. This is not a commonly used feature of HDF5,
but it is used in some applications. The `h5py` package supports reading
and writing opaque datatypes, and so `pyfive` also supports reading them.

This implementation has only been tested for opaque datatypes that
were created using `h5py`.

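Opaque data that `h5py` wrote from a numpy array (for example `datetime64`
values) is tagged with the original numpy dtype, and is transparently read
back into an array of that same dtype. Opaque data that carries no such tag
is returned as fixed-size void elements holding the raw bytes. Users should
not need to do anything special to read the data, but may need to do
something special with the data to interpret it once read.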









2 changes: 1 addition & 1 deletion pyfive/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

from pyfive.high_level import File, Group, Dataset
from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype
from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype, opaque_dtype, check_opaque_dtype
from pyfive.h5py import Datatype, Empty
from importlib.metadata import version

Expand Down
20 changes: 19 additions & 1 deletion pyfive/datatype_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def determine_dtype(self):
elif datatype_class == DATATYPE_BITFIELD:
raise NotImplementedError("Bitfield datatype class not supported.")
elif datatype_class == DATATYPE_OPAQUE:
raise NotImplementedError("Opaque datatype class not supported.")
return self._determine_dtype_opaque(datatype_msg)
elif datatype_class == DATATYPE_COMPOUND:
return self._determine_dtype_compound(datatype_msg)
elif datatype_class == DATATYPE_REFERENCE:
Expand Down Expand Up @@ -161,6 +161,24 @@ def _determine_dtype_compound(self, datatype_msg):

raise NotImplementedError("Compound dtype not supported.")

def _determine_dtype_opaque(self, datatype_msg):
    """
    Return the dtype information for an opaque class.

    pyfive does not interpret opaque types itself, so the result is a
    tuple ``('OPAQUE', tag, size)`` where ``size`` is
    ``datatype_msg['size']`` and ``tag`` is the ASCII tag read from the
    buffer at the current offset, or ``None`` when that tag is empty.
    As a side effect ``self.offset`` is advanced past the tag bytes.
    """
    size = datatype_msg['size']
    start = self.offset

    # The tag is an ASCII string, NUL terminated and padded to an 8 byte
    # boundary: find the first NUL from the current offset and round the
    # length (terminator included) with _padded_size.
    # NOTE(review): the HDF5 format spec also records the tag length in
    # the datatype class bit field; scanning for a NUL assumes a
    # terminator is always present inside the tag area - confirm this
    # holds for empty tags.
    first_nul = self.buf.index(b'\x00', start)
    padded_len = _padded_size(first_nul - start + 1, 8)

    # Decode before moving the offset so a decode failure leaves the
    # read position untouched.
    label = self.buf[start:start + padded_len].strip(b'\x00').decode('ascii')
    self.offset = start + padded_len

    # An empty tag is reported as None.
    return ('OPAQUE', label or None, size)

@staticmethod
def _determine_dtype_vlen(datatype_msg):
""" Return the dtype information for a variable length class. """
Expand Down
16 changes: 11 additions & 5 deletions pyfive/h5d.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,21 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4):
self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params()
self._unique = (self._filename, self.shape, self._msg_offset)

if isinstance(dataobject.dtype, tuple):
if dataobject.dtype[0] == 'ENUMERATION':
dtype = dataobject.dtype
if isinstance(dtype, tuple):
if dtype[0] == 'ENUMERATION':
self._dtype = np.dtype(dataobject.dtype[1], metadata={'enum':dataobject.dtype[2]})
elif dataobject.dtype[0] == 'COMPOUND':
elif dtype[0] == 'COMPOUND':
self._dtype = np.dtype(dataobject.dtype[1])
elif dtype[0] == 'OPAQUE':
if dtype[1].startswith('NUMPY:'):
self._dtype = np.dtype(dtype[1][6:], metadata={'h5py_opaque': True})
else:
self._dtype = np.dtype('V'+str(dtype[2]), metadata={'h5py_opaque': True})
else:
self._dtype = dataobject.dtype
self._dtype = dtype
else:
self._dtype = np.dtype(dataobject.dtype)
self._dtype = np.dtype(dtype)

self._meta = DatasetMeta(dataobject)

Expand Down
34 changes: 33 additions & 1 deletion pyfive/h5t.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,32 @@
string_info = namedtuple('string_info', ['encoding', 'length'])


def opaque_dtype(dt):
    """
    Return ``dt`` unchanged.

    This exists only for compatibility with the h5py API, whose
    documentation implies the function is needed to read opaque data.
    It is not: in h5py it marks a dtype with metadata for *writing*,
    whereas pyfive only reads, and opaque dtypes are fully handled in
    h5d.py without any help from the caller.
    """
    return dt

def check_opaque_dtype(dt):
    """
    If the dtype represents an HDF5 opaque type, returns True.
    Returns False if the dtype does not represent an HDF5 opaque type.

    A dtype counts as opaque when its ``metadata`` mapping contains the
    ``'h5py_opaque'`` key. Objects that have no ``metadata`` attribute at
    all (for example ``None`` or a plain Python type) are not opaque
    dtypes, so they yield False rather than raising AttributeError.
    """
    metadata = getattr(dt, 'metadata', None)
    # Missing or empty metadata means the dtype carries no opaque marker.
    if not metadata:
        return False
    return 'h5py_opaque' in metadata



def check_enum_dtype(dt):
"""
If the dtype represents an HDF5 enumerated type, returns the dictionary
Expand Down Expand Up @@ -53,6 +79,10 @@ def check_dtype(**kwds):
mapping string names to integer values. Returns None if the dtype does
not represent an HDF5 enumerated type.

opaque = dtype
If the dtype represents an HDF5 opaque type, returns True. Returns False if the
dtype does not represent an HDF5 opaque type.

"""
#ref = dtype
# If the dtype represents an HDF5 reference type, returns the reference
Expand All @@ -69,8 +99,10 @@ def check_dtype(**kwds):
return check_string_dtype(dt)
elif name == 'enum':
return check_enum_dtype(dt)
elif name == 'opaque':
return check_opaque_dtype(dt)
elif name == 'ref':
return NotImplementedError
raise NotImplementedError
else:
return None

Expand Down
152 changes: 152 additions & 0 deletions tests/test_opaque.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import os

import numpy as np
import pytest
from numpy.testing import assert_array_equal

import pyfive
import h5py


def test_opaque_dataset1_hdf5(name, data):
    """
    Read a file holding a NUMPY-tagged opaque dataset with pyfive and
    exercise the dtype interrogation helpers on its datasets.
    """
    (ordinary_data, string_data, opdata) = data

    # Verify that h5py can read this file before we do
    # our own test. If this fails, pyfive cannot be
    # expected to get it right.
    with h5py.File(name, "r") as f:
        dset = f["opaque_datetimes"]
        assert_array_equal(dset[...], opdata.astype(h5py.opaque_dtype(opdata.dtype)))

    # Now see if pyfive can do the right thing
    with pyfive.File(name) as hfile:
        dset = hfile["opaque_datetimes"]
        # pyfive should return the same raw bytes that h5py wrote,
        # but when the data is tagged with NUMPY (as it should be for
        # this example) pyfive restores the original dtype itself.
        assert_array_equal(dset[...], opdata)

        # make sure the other things are fine
        assert_array_equal(hfile['string_data'][...], string_data)
        assert_array_equal(hfile['ordinary_data'][...], ordinary_data)

        # check the dtype interrogation functions
        assert pyfive.check_opaque_dtype(dset.dtype) is True
        assert pyfive.check_enum_dtype(dset.dtype) is None
        assert pyfive.check_opaque_dtype(hfile['ordinary_data'].dtype) is False
        assert pyfive.check_dtype(opaque=hfile['ordinary_data'].dtype) is False
        assert pyfive.check_dtype(opaque=hfile['opaque_datetimes'].dtype) is True
        assert pyfive.check_dtype(enum=hfile['string_data'].dtype) is None
        assert pyfive.check_dtype(vlen=hfile['string_data'].dtype) is not None
        assert pyfive.check_dtype(vlen=hfile['ordinary_data'].dtype) is None

        # reference dtypes are not supported and must say so loudly
        dt = hfile['ordinary_data'].dtype
        with pytest.raises(NotImplementedError):
            pyfive.check_dtype(ref=dt)

        # exactly one keyword argument is accepted
        with pytest.raises(TypeError):
            pyfive.check_dtype(fred=1, jane=2)

def test_opaque_dataset2_fixed(really_opaque):
    """
    Read a fixed-size (V64) opaque dataset with h5py and then pyfive and
    check that both expose the same shape, dtype and leading bytes.
    """
    name, original_data = really_opaque

    def _assert_matches_original(dataset):
        # Checks common to both readers: three V64 elements, each one
        # beginning with the blob that was written into it.
        assert dataset.shape == (3,)
        assert dataset.dtype == np.dtype('V64')
        for index, expected in enumerate(original_data):
            assert dataset[index].tobytes().startswith(expected)

    # Reference reader first.
    with h5py.File(name, "r") as f:
        _assert_matches_original(f["opaque_data"])

    # Then pyfive, which must additionally report the dtype as opaque.
    with pyfive.File(name) as hfile:
        dset = hfile['opaque_data']
        _assert_matches_original(dset)

        assert pyfive.check_opaque_dtype(dset.dtype) is True
        assert pyfive.check_dtype(opaque=dset.dtype) is True
        assert pyfive.check_enum_dtype(dset.dtype) is None




@pytest.fixture(scope='module')
def really_opaque(tmp_path_factory):
    """
    Create an HDF5 file with a fixed size opaque dataset.

    The file is written into a pytest-managed temporary directory rather
    than next to the test module, so running the tests neither needs a
    writable source tree nor leaves generated files behind in it.

    Returns a ``(name, data)`` tuple: the file path as a string and the
    list of byte strings that were written, before NUL padding.
    """
    name = str(tmp_path_factory.mktemp("opaque_fixed") / "opaque_fixed.hdf5")

    max_len = 64  # bytes per element
    data = [
        b"hello world",
        b"\x01\x02\x03\x04custombinarydata",
        bytes(range(10)),
    ]

    with h5py.File(name, "w") as f:
        # A NumPy void ('V') dtype is a fixed-size opaque dtype.
        dt = np.dtype(f"V{max_len}")
        dset = f.create_dataset("opaque_data", shape=(len(data),), dtype=dt)

        for i, blob in enumerate(data):
            # Truncate/pad every blob to exactly max_len bytes.
            buf = blob[:max_len].ljust(max_len, b'\x00')
            dset[i] = np.void(buf)

    return name, data


@pytest.fixture(scope='module')
def data():
    """
    Provide the arrays written to the test file.

    Returns a tuple ``(ordinary_data, string_data, opaque_data)``: an
    ``i4`` integer array, a variable length string array, and a
    datetime64 array that is to be stored as opaque data.
    """
    integers = np.array([1, 2, 3], dtype='i4')

    vlen_str = h5py.special_dtype(vlen=str)
    strings = np.array(['one', 'two', 'three'], dtype=vlen_str)

    timestamps = np.array([
        np.datetime64(stamp)
        for stamp in (
            "2019-09-22T17:38:30",
            "2020-01-01T00:00:00",
            "2025-10-04T12:00:00",
        )
    ])

    return (integers, strings, timestamps)


@pytest.fixture(scope='module')
def name(data, tmp_path_factory):
    """
    Create an HDF5 file with datetime64 data stored as opaque.

    The file is written into a pytest-managed temporary directory rather
    than next to the test module, so running the tests neither needs a
    writable source tree nor leaves generated files behind in it.

    Returns the path of the file as a string.
    """
    path = str(tmp_path_factory.mktemp("opaque_datetime") / "opaque_datetime.hdf5")

    (ordinary_data, string_data, opdata) = data

    # Convert dtype to an opaque version (as per h5py docs).
    # AFAIK this just adds {'h5py_opaque': True} to the dtype metadata,
    # without which h5py cannot write the data.
    opaque_data = opdata.astype(h5py.opaque_dtype(opdata.dtype))

    # Put some other things in the file too, so we can exercise
    # some of the other code paths.
    with h5py.File(path, "w") as f:
        f["opaque_datetimes"] = opaque_data
        f['string_data'] = string_data
        f['ordinary_data'] = ordinary_data

    return path


Loading