Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,15 @@ The h5t module
--------------

Partial implementation of some of the lower level h5py API, needed
to support enumerations and variable length strings.
to support enumerations, variable length strings, and opaque datatypes.

.. autofunction:: pyfive.h5t.check_enum_dtype

.. autofunction:: pyfive.h5t.check_string_dtype

.. autofunction:: pyfive.h5t.check_dtype

.. autofunction:: pyfive.h5t.check_opaque_dtype

.. autoclass:: pyfive.h5t.TypeEnumID

3 changes: 3 additions & 0 deletions doc/quickstart/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@ Getting started
Installation <installation>
Usage <usage>
Enumerations <enums>
Opaque Datasets <opaque>



25 changes: 25 additions & 0 deletions doc/quickstart/opaque.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Opaque Datasets
---------------

It is possible to create datasets with opaque datatypes in HDF5. These are
datasets where the data is stored as a sequence of bytes, with no
interpretation of those bytes. This is not a commonly used feature of HDF5,
but it is used in some applications. The `h5py` package supports reading
and writing opaque datatypes, and so `pyfive` also supports reading them.

This implementation has only been tested for opaque datatypes that
were created using `h5py`.

Such opaque datatypes will be transparently read into the same type of
numpy array as was used to write the data. Users should not need to do
anything special to read the data, but may need to do something special
to interpret the data once it has been read.









2 changes: 1 addition & 1 deletion pyfive/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

from pyfive.high_level import File, Group, Dataset
from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype
from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype, opaque_dtype, check_opaque_dtype
from pyfive.h5py import Datatype, Empty
from importlib.metadata import version

Expand Down
22 changes: 21 additions & 1 deletion pyfive/datatype_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def determine_dtype(self):
elif datatype_class == DATATYPE_BITFIELD:
raise NotImplementedError("Bitfield datatype class not supported.")
elif datatype_class == DATATYPE_OPAQUE:
raise NotImplementedError("Opaque datatype class not supported.")
return self._determine_dtype_opaque(datatype_msg)
elif datatype_class == DATATYPE_COMPOUND:
return self._determine_dtype_compound(datatype_msg)
elif datatype_class == DATATYPE_REFERENCE:
Expand Down Expand Up @@ -161,6 +161,26 @@ def _determine_dtype_compound(self, datatype_msg):

raise NotImplementedError("Compound dtype not supported.")

def _determine_dtype_opaque(self, datatype_msg):
    """
    Describe an opaque-class datatype as an ``('OPAQUE', tag)`` tuple.

    The ``size`` entry of ``datatype_msg`` is used as the number of 8-byte
    units occupied by the ASCII tag, which is read from ``self.buf``
    starting at ``self.offset``. The tag is the text preceding the first
    NUL byte, or the whole region when no NUL byte is present.

    When ``size`` is zero nothing is read, ``self.offset`` is left alone
    and the tag is ``None``. Otherwise ``self.offset`` is advanced past
    the whole tag region. Only the tag is returned alongside the
    ``'OPAQUE'`` marker; no size element is included in the tuple.
    """
    unit_count = datatype_msg['size']
    if not unit_count:
        # No tag region to consume; the read position stays where it is.
        return ('OPAQUE', None)

    # NOTE(review): confirm against the HDF5 datatype-message layout that
    # 'size' really counts 8-byte tag units rather than the element size.
    region_length = unit_count * 8
    region = self.buf[self.offset:self.offset + region_length]

    # Everything before the first NUL (or the whole region if none).
    raw_tag = region.partition(b'\x00')[0]
    tag = raw_tag.decode('ascii')

    # Advance only after a successful decode, over the padded region.
    self.offset += region_length
    return ('OPAQUE', tag)

@staticmethod
def _determine_dtype_vlen(datatype_msg):
""" Return the dtype information for a variable length class. """
Expand Down
7 changes: 7 additions & 0 deletions pyfive/h5d.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,18 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4):
self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params()
self._unique = (self._filename, self.shape, self._msg_offset)

#FIXME: Every one of these parses the message each time. Optimisation possible?
if isinstance(dataobject.dtype, tuple):
if dataobject.dtype[0] == 'ENUMERATION':
self._dtype = np.dtype(dataobject.dtype[1], metadata={'enum':dataobject.dtype[2]})
elif dataobject.dtype[0] == 'COMPOUND':
self._dtype = np.dtype(dataobject.dtype[1])
elif dataobject.dtype[0] == 'OPAQUE':
if dataobject.dtype[1].startswith('NUMPY:'):
self._dtype = np.dtype(dataobject.dtype[1][6:], metadata={'h5py_opaque': True})
else:
# Can't test this, since I don't know how to write a test dataset
self._dtype = np.dtype('V'+str(dataobject.dtype[1]), metadata={'h5py_opaque': True})
else:
self._dtype = dataobject.dtype
else:
Expand Down
32 changes: 32 additions & 0 deletions pyfive/h5t.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,32 @@
string_info = namedtuple('string_info', ['encoding', 'length'])


def opaque_dtype(dt):
    """
    Return ``dt`` unchanged.

    This exists only for compatibility with the h5py API, whose
    documentation implies the call is needed to read opaque data.
    It is not needed here.
    """
    # In h5py this call marks a dtype with metadata so that it can be
    # written as opaque. Here we only read, and the opaque handling is
    # done entirely in h5d.py, so there is nothing to add to the dtype.
    return dt

def check_opaque_dtype(dt):
    """
    Report whether ``dt`` is tagged as an HDF5 opaque type.

    Returns True when the dtype carries metadata containing the
    ``'h5py_opaque'`` key, and False otherwise (including when the
    dtype has no metadata at all).
    """
    # bool() keeps the result a strict True/False even when the
    # metadata attribute is None or an empty mapping.
    return bool(dt.metadata) and 'h5py_opaque' in dt.metadata



def check_enum_dtype(dt):
"""
If the dtype represents an HDF5 enumerated type, returns the dictionary
Expand Down Expand Up @@ -53,6 +79,10 @@ def check_dtype(**kwds):
mapping string names to integer values. Returns None if the dtype does
not represent an HDF5 enumerated type.

opaque = dtype
If the dtype represents an HDF5 opaque type, returns True. Returns False if the
dtype does not represent an HDF5 opaque type.

"""
#ref = dtype
# If the dtype represents an HDF5 reference type, returns the reference
Expand All @@ -69,6 +99,8 @@ def check_dtype(**kwds):
return check_string_dtype(dt)
elif name == 'enum':
return check_enum_dtype(dt)
elif name == 'opaque':
return check_opaque_dtype(dt)
elif name == 'ref':
return NotImplementedError
else:
Expand Down
84 changes: 84 additions & 0 deletions tests/test_opaque.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os

import numpy as np
import pytest
from numpy.testing import assert_array_equal

import pyfive
import h5py


def test_opaque_dataset_hdf5(name, data):
    """
    Read back an h5py-written opaque dataset with both h5py and pyfive.

    ``name`` is the path of the file produced by the ``name`` fixture and
    ``data`` is the ``(ordinary, string, opaque)`` tuple of arrays that
    was written into it.
    """
    ordinary_data, string_data, opdata = data

    # Sanity check with h5py first: if h5py cannot read the file back,
    # pyfive cannot be expected to get it right either.
    with h5py.File(name, "r") as reference:
        h5py_values = reference["opaque_datetimes"][...]
        expected = opdata.astype(h5py.opaque_dtype(opdata.dtype))
        assert_array_equal(h5py_values, expected)

    # Now the same file through pyfive.
    with pyfive.File(name) as hfile:
        dset = hfile["opaque_datetimes"]
        # The dataset is expected to carry a NUMPY tag in this example,
        # so pyfive should hand back the original datetime values
        # rather than raw bytes.
        assert_array_equal(dset[...], opdata)

        # The non-opaque datasets in the same file must be unaffected.
        assert_array_equal(hfile['string_data'][...], string_data)
        assert_array_equal(hfile['ordinary_data'][...], ordinary_data)

        # dtype introspection helpers, called directly and via check_dtype.
        assert pyfive.check_opaque_dtype(dset.dtype) is True
        assert pyfive.check_enum_dtype(dset.dtype) is None
        assert pyfive.check_opaque_dtype(hfile['ordinary_data'].dtype) is False
        assert pyfive.check_dtype(opaque=hfile['ordinary_data'].dtype) is False
        assert pyfive.check_dtype(opaque=hfile['opaque_datetimes'].dtype) is True






@pytest.fixture(scope='module')
def data():
    """Provide the (ordinary, string, datetime64) arrays used by the tests."""
    timestamps = (
        "2019-09-22T17:38:30",
        "2020-01-01T00:00:00",
        "2025-10-04T12:00:00",
    )
    # Built from datetime64 scalars; these are later stored as opaque.
    opaque_data = np.array([np.datetime64(stamp) for stamp in timestamps])

    ordinary_data = np.array([1, 2, 3], dtype='i4')
    string_data = np.array([b'one', b'two', b'three'], dtype='S5')

    return (ordinary_data, string_data, opaque_data)


@pytest.fixture(scope='module')
def name(data):
    """
    Write an HDF5 file holding datetime64 data stored as opaque.

    The file is created next to this test module and its path is
    returned. It also contains a string dataset and an ordinary integer
    dataset so that other code paths are exercised.
    """
    ordinary_data, string_data, opdata = data

    test_dir = os.path.dirname(__file__)
    path = os.path.join(test_dir, "opaque_datetime.hdf5")

    # Per the h5py docs the dtype has to be converted to its opaque
    # variant before h5py will write it. As far as we know this only
    # adds {'h5py_opaque': True} to the dtype metadata.
    opaque_version = h5py.opaque_dtype(opdata.dtype)
    opaque_data = opdata.astype(opaque_version)

    with h5py.File(path, "w") as f:
        f["opaque_datetimes"] = opaque_data
        f['string_data'] = string_data
        f['ordinary_data'] = ordinary_data

    return path
Loading