NCAS-CMS · valeriupredoi · Oct 7, 2025 · Oct 5, 2025 · Oct 5, 2025 · Oct 5, 2025
diff --git a/doc/api_reference.rst b/doc/api_reference.rst
@@ -29,13 +29,15 @@ The h5t module
 --------------
 
 Partial implementation of some of the lower level h5py API, needed
-to support enumerations and variable length strings.
+to support enumerations, variable length strings, and opaque datatypes.
 
 .. autofunction:: pyfive.h5t.check_enum_dtype
 
 .. autofunction:: pyfive.h5t.check_string_dtype
 
 .. autofunction:: pyfive.h5t.check_dtype
 
+.. autofunction:: pyfive.h5t.check_opaque_dtype
+
 .. autoclass:: pyfive.h5t.TypeEnumID
 
diff --git a/doc/quickstart/index.rst b/doc/quickstart/index.rst
@@ -7,4 +7,7 @@ Getting started
     Installation <installation>
     Usage <usage>
     Enumerations <enums>
+    Opaque Datasets <opaque>
+
+
 
diff --git a/doc/quickstart/opaque.rst b/doc/quickstart/opaque.rst
@@ -0,0 +1,25 @@
+Opaque Datasets 
+---------------
+
+It is possible to create datasets with opaque datatypes in HDF5.  These are
+datasets where the data is stored as a sequence of bytes, with no
+interpretation of those bytes.  This is not a commonly used feature of HDF5,
+but it is used in some applications.  The `h5py` package supports reading
+and writing opaque datatypes, and so `pyfive` also supports reading them.
+
+This implementation has only been tested for opaque datatypes that
+were created using `h5py`.
+
+Opaque data that `h5py` tagged with its numpy dtype is transparently read
+into the same type of numpy array as was used to write the data.  Any other
+opaque data is returned as a numpy void array of the stored element size.
+Users need nothing special to read the data, but may need to interpret it.
+
+
+
+
+
+
+
+
+
diff --git a/pyfive/__init__.py b/pyfive/__init__.py
@@ -5,7 +5,7 @@
 """
 
 from pyfive.high_level import File, Group, Dataset
-from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype
+from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype, opaque_dtype, check_opaque_dtype
 from pyfive.h5py import Datatype, Empty
 from importlib.metadata import version
 

diff --git a/pyfive/datatype_msg.py b/pyfive/datatype_msg.py
@@ -35,7 +35,7 @@ def determine_dtype(self):
         elif datatype_class == DATATYPE_BITFIELD:
             raise NotImplementedError("Bitfield datatype class not supported.")
         elif datatype_class == DATATYPE_OPAQUE:
-            raise NotImplementedError("Opaque datatype class not supported.")
+            return self._determine_dtype_opaque(datatype_msg)
         elif datatype_class == DATATYPE_COMPOUND:
             return self._determine_dtype_compound(datatype_msg)
         elif datatype_class == DATATYPE_REFERENCE:
@@ -161,6 +161,24 @@ def _determine_dtype_compound(self, datatype_msg):
 
         raise NotImplementedError("Compound dtype not supported.")
 
+    def _determine_dtype_opaque(self, datatype_msg):
+        """
+        Return ``('OPAQUE', tag, size)`` for an opaque datatype class.
+
+        ``tag`` is the ASCII tag read from the buffer ('' when absent) and
+        ``size`` is the size in bytes taken from the datatype message.
+        """
+        # pyfive does not interpret opaque data. The tag is read up to its
+        # first NUL, then the offset is moved past the NUL padding, taken
+        # here to run to the next multiple of 8 bytes. An absent tag stays
+        # '' rather than None as h5d calls tag.startswith('NUMPY:') on it.
+        size = datatype_msg['size']
+        null_location = self.buf.index(b'\x00', self.offset)
+        tag_size = _padded_size(null_location - self.offset + 1, 8)
+        tag = self.buf[self.offset:null_location].decode('ascii')
+        self.offset += tag_size
+        return ('OPAQUE', tag, size)
+
     @staticmethod
     def _determine_dtype_vlen(datatype_msg):
         """ Return the dtype information for a variable length class. """

diff --git a/pyfive/h5d.py b/pyfive/h5d.py
@@ -84,15 +84,21 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4):
         self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params()
         self._unique = (self._filename, self.shape, self._msg_offset)
 
-        if isinstance(dataobject.dtype, tuple):
-            if dataobject.dtype[0] == 'ENUMERATION':
+        dtype = dataobject.dtype
+        if isinstance(dtype, tuple):
+            if dtype[0] == 'ENUMERATION':
                 self._dtype = np.dtype(dataobject.dtype[1], metadata={'enum':dataobject.dtype[2]})
-            elif dataobject.dtype[0] == 'COMPOUND':
+            elif dtype[0] == 'COMPOUND':
                 self._dtype = np.dtype(dataobject.dtype[1])
+            elif dtype[0] == 'OPAQUE':
+                if dtype[1].startswith('NUMPY:'):
+                    self._dtype = np.dtype(dtype[1][6:], metadata={'h5py_opaque': True})
+                else: 
+                    self._dtype = np.dtype('V'+str(dtype[2]), metadata={'h5py_opaque': True})
             else:
-                self._dtype = dataobject.dtype
+                self._dtype = dtype
         else:
-            self._dtype = np.dtype(dataobject.dtype)
+            self._dtype = np.dtype(dtype)
 
         self._meta = DatasetMeta(dataobject)
 

diff --git a/pyfive/h5t.py b/pyfive/h5t.py
@@ -10,6 +10,32 @@
 string_info = namedtuple('string_info', ['encoding', 'length'])
 
 
+def opaque_dtype(dt):
+    """
+    Return the dtype passed in, unchanged. (So it does nothing,
+    but is included for compatibility with the h5py API
+    documentation which _implies_ this is needed to read data,
+    but it is not.)
+    """
+    # For pyfive, the opaque dtype is fully handled in h5d.py.
+    # In h5py this function is really only for writing (where it
+    # marks a dtype with metadata). pyfive only reads, so users
+    # do not actually need this function and we simply return the
+    # dtype. It is only included as the h5py docs make it seem
+    # relevant for reading. It is not.
+    return dt 
+
+def check_opaque_dtype(dt):
+    """
+    Report whether the dtype is tagged as an HDF5 opaque type.
+
+    Returns True when the dtype metadata contains the ``h5py_opaque``
+    key, and False otherwise (including when there is no metadata).
+    """
+    return bool(dt.metadata) and 'h5py_opaque' in dt.metadata
+
+
+
 def check_enum_dtype(dt):
     """
     If the dtype represents an HDF5 enumerated type, returns the dictionary
@@ -53,6 +79,10 @@ def check_dtype(**kwds):
         mapping string names to integer values.  Returns None if the dtype does
         not represent an HDF5 enumerated type.
 
+    opaque = dtype
+        If the dtype represents an HDF5 opaque type, returns True.  Returns False if the
+        dtype does not represent an HDF5 opaque type.
+
     """
     #ref = dtype
     #    If the dtype represents an HDF5 reference type, returns the reference
@@ -69,8 +99,10 @@ def check_dtype(**kwds):
         return check_string_dtype(dt)
     elif name == 'enum':
         return check_enum_dtype(dt)
+    elif name == 'opaque':
+        return check_opaque_dtype(dt)
     elif name == 'ref':
-        return NotImplementedError
+        raise NotImplementedError
     else:
         return None
 

diff --git a/tests/test_opaque.py b/tests/test_opaque.py
@@ -0,0 +1,152 @@
+import os
+
+import numpy as np
+import pytest
+from numpy.testing import assert_array_equal
+
+import pyfive
+import h5py
+
+
+def test_opaque_dataset1_hdf5(name, data):
+    """ Read an h5py-written opaque datetime64 dataset back with pyfive. """
+    # Verify that h5py can read this file before we do
+    # our own test. If this fails, pyfive cannot be
+    # expected to get it right.
+
+    (ordinary_data, string_data, opdata) = data
+
+    with h5py.File(name, "r") as f:
+        dset = f["opaque_datetimes"]
+        assert_array_equal(dset[...], opdata.astype(h5py.opaque_dtype(opdata.dtype)))
+
+    # Now see if pyfive can do the right thing
+    with pyfive.File(name) as hfile:
+        # check data
+        dset = hfile["opaque_datetimes"]
+        # This dataset should carry a NUMPY: tag, in which case pyfive
+        # restores the original numpy dtype, so the values (and not just
+        # the raw bytes) must match what was written.
+        assert_array_equal(dset[...], opdata)
+
+        # make sure the other things are fine
+        assert_array_equal(hfile['string_data'][...],string_data)
+        assert_array_equal(hfile['ordinary_data'][...],ordinary_data)
+
+        # check the dtype interrogation functions
+
+        assert pyfive.check_opaque_dtype(dset.dtype) is True
+        assert pyfive.check_enum_dtype(dset.dtype) is None
+        assert pyfive.check_opaque_dtype(hfile['ordinary_data'].dtype) is False
+        assert pyfive.check_dtype(opaque=hfile['ordinary_data'].dtype) is False
+        assert pyfive.check_dtype(opaque=hfile['opaque_datetimes'].dtype) is True
+        assert pyfive.check_dtype(enum=hfile['opaque_datetimes'].dtype) is None
+        assert pyfive.opaque_dtype(dset.dtype) == dset.dtype
+        assert pyfive.check_dtype(enum=hfile['string_data'].dtype) is None
+        assert pyfive.check_dtype(vlen=hfile['string_data'].dtype) is not None
+        assert pyfive.check_dtype(vlen=hfile['ordinary_data'].dtype) is None
+
+        dt = hfile['ordinary_data'].dtype
+        with pytest.raises(NotImplementedError):
+            pyfive.check_dtype(ref=dt)
+
+        with pytest.raises(TypeError):
+            pyfive.check_dtype(fred=1,jane=2)
+
+def test_opaque_dataset2_fixed(really_opaque):
+    """ A fixed-size (V64) opaque dataset reads back as the bytes written. """
+    filename, blobs = really_opaque
+    expected_dtype = np.dtype('V64')
+
+    # Confirm with h5py that the file holds what the fixture wrote
+    # before asking pyfive to read it.
+    with h5py.File(filename, "r") as reference:
+        ref_dset = reference["opaque_data"]
+        assert ref_dset.shape == (3,)
+        assert ref_dset.dtype == expected_dtype
+        for index, blob in enumerate(blobs):
+            assert ref_dset[index].tobytes().startswith(blob)
+
+    with pyfive.File(filename) as hfile:
+        dset = hfile['opaque_data']
+        assert dset.shape == (3,)
+        assert dset.dtype == expected_dtype
+        for index, blob in enumerate(blobs):
+            assert dset[index].tobytes().startswith(blob)
+
+        # The dtype interrogation helpers must see it as opaque, not enum.
+        assert pyfive.check_enum_dtype(dset.dtype) is None
+        assert pyfive.check_dtype(opaque=dset.dtype) is True
+        assert pyfive.check_opaque_dtype(dset.dtype) is True
+
+
+
+
+@pytest.fixture(scope='module')
+def really_opaque(tmp_path_factory):
+    """ Create a fixed size opaque dataset; return (file name, blobs). """
+    # Use a pytest-managed temporary directory, not the tests directory.
+    name = str(tmp_path_factory.mktemp("opaque_fixed") / "opaque_fixed.hdf5")
+    with h5py.File(name, "w") as f:
+        # Define a fixed-size opaque dtype as NumPy void
+        max_len = 64  # bytes per element
+        dt = np.dtype(f"V{max_len}")  # 'V' = void type
+
+        # Create dataset
+        dset = f.create_dataset("opaque_data", shape=(3,), dtype=dt)
+
+        data = [
+            b"hello world",
+            b"\x01\x02\x03\x04custombinarydata",
+            bytes(range(10))
+        ]
+
+        for i, blob in enumerate(data):
+            buf = blob[:max_len].ljust(max_len, b'\x00')
+            dset[i] = np.void(buf)
+
+    return name, data
+
+
+@pytest.fixture(scope='module')
+def data():
+    """Provide the (ordinary, vlen string, datetime64) arrays under test."""
+    timestamps = [
+        "2019-09-22T17:38:30",
+        "2020-01-01T00:00:00",
+        "2025-10-04T12:00:00",
+    ]
+
+    opaque_data = np.array([np.datetime64(stamp) for stamp in timestamps])
+    # h5py dtype for variable length strings
+    vlen_str = h5py.special_dtype(vlen=str)
+    string_data = np.array(['one', 'two', 'three'], dtype=vlen_str)
+    ordinary_data = np.array([1, 2, 3], dtype='i4')
+
+    return (ordinary_data, string_data, opaque_data)
+
+
+@pytest.fixture(scope='module')
+def name(data, tmp_path_factory):
+    """Create an HDF5 file with datetime64 data stored as opaque; return its name."""
+    name = str(tmp_path_factory.mktemp("opaque_datetime") / "opaque_datetime.hdf5")
+
+    (ordinary_data, string_data, opdata) = data
+
+    # Convert dtype to an opaque version (as per h5py docs)
+    # AFAIK this just adds {'h5py_opaque': True} to the dtype metadata
+    # without which h5py cannot write the data.
+
+    opaque_data = opdata.astype(h5py.opaque_dtype(opdata.dtype))
+
+    # Want to put some other things in the file too, so we can exercise
+    # some of the other code paths.
+
+    with h5py.File(name, "w") as f:
+        f["opaque_datetimes"] = opaque_data
+        f['string_data'] = string_data
+        f['ordinary_data'] = ordinary_data
+
+    return name
+
+