18 changes: 11 additions & 7 deletions pyfive/dataobjects.py
@@ -301,13 +301,17 @@ def _vlen_size_and_data(self, buf, offset):
         # stored in the data object storage.
         gheap_id = _unpack_struct_from(GLOBAL_HEAP_ID, buf, offset+4)
         gheap_address = gheap_id['collection_address']
-        #print('Collection address in _vlen', gheap_address)
-        if gheap_address not in self._global_heaps:
-            # load the global heap and cache the instance
-            gheap = GlobalHeap(self.fh, gheap_address)
-            self._global_heaps[gheap_address] = gheap
-        gheap = self._global_heaps[gheap_address]
-        vlen_data = gheap.objects[gheap_id['object_index']]
+        # only work on valid global heap addresses
+        if gheap_address != 0:
+            #print('Collection address in _vlen', gheap_address)
+            if gheap_address not in self._global_heaps:
+                # load the global heap and cache the instance
+                gheap = GlobalHeap(self.fh, gheap_address)
+                self._global_heaps[gheap_address] = gheap
+            gheap = self._global_heaps[gheap_address]
+            vlen_data = gheap.objects[gheap_id['object_index']]
+        else:
+            vlen_data = None
         return vlen_size, vlen_data
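The guard matches how HDF5 encodes a NULL variable-length datum: a global heap ID whose collection address is zero. A minimal sketch of the lookup pattern, not pyfive's actual method; `heaps` stands in for the `_global_heaps` cache, `gheap_id` for the unpacked GLOBAL_HEAP_ID struct, and the import assumes GlobalHeap lives in pyfive.misc_low_level as the diff suggests:

from pyfive.misc_low_level import GlobalHeap

def lookup_vlen(fh, heaps, gheap_id):
    # Sketch only: `heaps` and `gheap_id` are illustrative stand-ins.
    address = gheap_id['collection_address']
    if address == 0:
        # a zero collection address encodes a NULL vlen entry
        return None
    if address not in heaps:
        # load the global heap once, then reuse the cached instance
        heaps[address] = GlobalHeap(fh, address)
    return heaps[address].objects[gheap_id['object_index']]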


9 changes: 5 additions & 4 deletions pyfive/h5d.py
@@ -84,7 +84,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4):
         self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params()
         self._unique = (self._filename, self.shape, self._msg_offset)

-        if isinstance(dataobject.dtype,tuple):
+        if isinstance(dataobject.dtype, tuple):
             if dataobject.dtype[0] == 'ENUMERATION':
                 self._dtype = np.dtype(dataobject.dtype[1], metadata={'enum':dataobject.dtype[2]})
             else:
@@ -174,7 +174,7 @@ def get_data(self, args, fillvalue):
                     # created. One for the future.
                     return np.full(self.shape, fillvalue, dtype=dtype)[args]
                 else:
-                    return self._get_contiguous_data(args)
+                    return self._get_contiguous_data(args, fillvalue)
             case 2: # chunked storage
                 if not self._index:
                     # no storage is backing array, return an array of
@@ -312,7 +312,7 @@ def _build_index(self, dataobject):
             self._nthindex.append(key)
             self._index[key] = StoreInfo(key, filter_mask, addr, size)

-    def _get_contiguous_data(self, args):
+    def _get_contiguous_data(self, args, fillvalue):

         if isinstance(self._dtype, tuple):
             dtype_class = self._dtype[0]
@@ -337,7 +337,8 @@ def _get_contiguous_data(self, args):
                 self.data_offset,
                 self._global_heaps,
                 self.shape,
-                self._dtype
+                self._dtype,
+                fillvalue,
             )
         if self.posix:
             fh.close()
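Visible in the same file, the ENUMERATION branch attaches the enum mapping as numpy dtype metadata. A standalone illustration of that numpy feature, with made-up names and values:

import numpy as np

# attach an enum mapping as dtype metadata, as the ENUMERATION branch does;
# the mapping here is purely illustrative
dt = np.dtype('<i4', metadata={'enum': {'LOW': 0, 'HIGH': 1}})
print(dt.metadata['enum']['HIGH'])  # -> 1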
29 changes: 20 additions & 9 deletions pyfive/misc_low_level.py
@@ -337,30 +337,41 @@ def _indirect_info(self, nrows):
         return ndirect, nindirect

 def get_vlen_string_data_contiguous(
-    fh, data_offset, global_heaps, shape, dtype
+    fh, data_offset, global_heaps, shape, dtype, fillvalue
 ):
     """ Return the data for a variable which is made up of variable length string data """
     # we need to import this from DatasetID, and that's imported from Dataobjects hence
     # hiding it here in misc_low_level.
+    if fillvalue in [0, None]:
+        fillvalue = b""
+
     fh.seek(data_offset)
     count = prod(shape)
     _, _, character_set = dtype
     if int(character_set) not in [0, 1]:
         raise ValueError(f'Unexpected string type, cannot decode character set {character_set}')
-    value = np.empty(count,dtype=object)
+
+    # create with fillvalue
+    value = np.full(count, fillvalue, dtype=object)
     offset = 0
     buf = fh.read(16*count)
     for i in range(count):
         # vlen_size, = struct.unpack_from('<I', buf, offset=offset)
         gheap_id = _unpack_struct_from(GLOBAL_HEAP_ID, buf, offset+4)
         gheap_address = gheap_id['collection_address']
-        #print('Collection address for data', gheap_address)
-        if gheap_address not in global_heaps:
-            # load the global heap and cache the instance
-            gheap = GlobalHeap(fh, gheap_address)
-            global_heaps[gheap_address] = gheap
-        gheap = global_heaps[gheap_address]
-        value[i] = gheap.objects[gheap_id['object_index']]
+        # only work on valid global heap addresses
+        if gheap_address != 0:
+            #print('Collection address for data', gheap_address)
+            if gheap_address not in global_heaps:
+                # load the global heap and cache the instance
+                gheap = GlobalHeap(fh, gheap_address)
+                global_heaps[gheap_address] = gheap
+            gheap = global_heaps[gheap_address]
+
+            # skip if NULL vlen entry
+            if (obj_index:=gheap_id['object_index']) != 0:
+                value[i] = gheap.objects[obj_index]
+
         offset +=16

     # If character_set == 0 ascii character set, return as
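Because the output array is now pre-populated with the fill value, any element whose heap ID is NULL simply keeps it; HDF5 reports an unset fill value as 0 or None, which the new code normalises to an empty byte string first. A small standalone illustration of the np.full behaviour, with made-up values:

import numpy as np

fillvalue = b"missing"
value = np.full(4, fillvalue, dtype=object)   # every slot starts filled
value[0] = b"foo"        # slots with valid heap entries get overwritten
value[2] = b"foobar"
print(value)             # [b'foo' b'missing' b'foobar' b'missing']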
137 changes: 79 additions & 58 deletions tests/make_references_file.py
@@ -1,63 +1,84 @@
 #! /usr/bin/env python
 """ Create a HDF5 file with references. """
+import sys
 import h5py
 import numpy as np
+from pathlib import Path

-f = h5py.File('references.hdf5', 'w')
-
-# some HDF5 objects for testing
-f.attrs.create('root_attr', 123)
-
-dset1 = f.create_dataset(
-    'dataset1', shape=(4, ), dtype='<i4', data=np.arange(4), track_times=False)
-dset1.attrs.create('dset_attr', 456)
-region_ref = dset1.regionref[::2]
-
-grp = f.create_group('group1')
-grp.attrs.create('group_attr', 789)
-
-# references
-f.attrs['root_group_reference'] = f.ref
-f.attrs['dataset1_reference'] = dset1.ref
-f.attrs['group1_reference'] = grp.ref
-f.attrs['dataset1_region_reference'] = region_ref
-
-# variable length sequence of references sequence
-val = np.empty((2, ), dtype=np.object)
-ref_dtype = h5py.special_dtype(ref=h5py.Reference)
-val[0] = np.array([f.ref], dtype=ref_dtype)
-val[1] = np.array([dset1.ref, grp.ref], dtype=ref_dtype)
-dt = h5py.special_dtype(vlen=ref_dtype)
-f.attrs.create('vlen_refs', val, dtype=dt)
-
-# array of references
-ref_dtype = h5py.special_dtype(ref=h5py.Reference)
-
-ref_dataset = f.create_dataset(
-    "ref_dataset", (4,), dtype=ref_dtype, track_times=False)
-ref_dataset[0] = f.ref
-ref_dataset[1] = dset1.ref
-ref_dataset[2] = grp.ref
-# ref_dataset[3] is a Null reference
-
-chunked_ref_dataset = f.create_dataset(
-    "chunked_ref_dataset", (4,), chunks=(2, ), dtype=ref_dtype,
-    track_times=False)
-chunked_ref_dataset[0] = f.ref
-chunked_ref_dataset[1] = dset1.ref
-chunked_ref_dataset[2] = grp.ref
-# chunked_ref_dataset[3] is a Null reference
-
-regionref_dtype = h5py.special_dtype(ref=h5py.RegionReference)
-
-regionref_dataset = f.create_dataset(
-    "regionref_dataset", (2,), dtype=regionref_dtype, track_times=False)
-regionref_dataset[0] = region_ref
-
-chunked_regionref_dataset = f.create_dataset(
-    "chunked_regionref_dataset", (2,), chunks=(1, ), dtype=regionref_dtype,
-    track_times=False)
-chunked_regionref_dataset[0] = region_ref
-# chunked_regionref_dataset[1] is a Null reference
-
-f.close()
+
+def create_file(path):
+    with h5py.File(path, 'w') as f:
+
+        # some HDF5 objects for testing
+        f.attrs.create('root_attr', 123)
+
+        dset1 = f.create_dataset(
+            'dataset1', shape=(4, ), dtype='<i4', data=np.arange(4), track_times=False)
+        dset1.attrs.create('dset_attr', 456)
+        region_ref = dset1.regionref[::2]
+
+        grp = f.create_group('group1')
+        grp.attrs.create('group_attr', 789)
+
+        # references
+        f.attrs['root_group_reference'] = f.ref
+        f.attrs['dataset1_reference'] = dset1.ref
+        f.attrs['group1_reference'] = grp.ref
+        f.attrs['dataset1_region_reference'] = region_ref
+
+        # variable length sequence of references sequence
+        val = np.empty((2, ), dtype=object)
+        ref_dtype = h5py.special_dtype(ref=h5py.Reference)
+        val[0] = np.array([f.ref], dtype=ref_dtype)
+        val[1] = np.array([dset1.ref, grp.ref], dtype=ref_dtype)
+        dt = h5py.special_dtype(vlen=ref_dtype)
+        f.attrs.create('vlen_refs', val, dtype=dt)
+
+        # array of references
+        ref_dtype = h5py.special_dtype(ref=h5py.Reference)
+
+        ref_dataset = f.create_dataset(
+            "ref_dataset", (4,), dtype=ref_dtype, track_times=False)
+        ref_dataset[0] = f.ref
+        ref_dataset[1] = dset1.ref
+        ref_dataset[2] = grp.ref
+        # ref_dataset[3] is a Null reference
+
+        chunked_ref_dataset = f.create_dataset(
+            "chunked_ref_dataset", (4,), chunks=(2, ), dtype=ref_dtype,
+            track_times=False)
+        chunked_ref_dataset[0] = f.ref
+        chunked_ref_dataset[1] = dset1.ref
+        chunked_ref_dataset[2] = grp.ref
+        # chunked_ref_dataset[3] is a Null reference
+
+        regionref_dtype = h5py.special_dtype(ref=h5py.RegionReference)
+
+        regionref_dataset = f.create_dataset(
+            "regionref_dataset", (2,), dtype=regionref_dtype, track_times=False)
+        regionref_dataset[0] = region_ref
+
+        chunked_regionref_dataset = f.create_dataset(
+            "chunked_regionref_dataset", (2,), chunks=(1, ), dtype=regionref_dtype,
+            track_times=False)
+        chunked_regionref_dataset[0] = region_ref
+        # chunked_regionref_dataset[1] is a Null reference
+
+        # uninitialized references
+        # the following code creates a partly uninitialized attribute
+        # DIMENSION_LIST
+        # it seems creating attributes the normal way are always fully initialized
+        foo_data = np.arange(4).reshape(2, 2)
+        f.create_dataset("foo1", data=foo_data)
+        f.create_dataset("x", data=np.arange(2))
+        f.create_dataset("y", data=np.arange(2))
+
+        f["x"].make_scale()
+        f["y"].make_scale()
+        f["foo1"].dims[0].attach_scale(f["x"])
+
+
+if __name__ == "__main__":
+    default_path = Path(__file__).parent / "references.hdf5"
+    filepath = Path(sys.argv[1]) if len(sys.argv) > 1 else default_path
+    create_file(filepath)
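With the refactor the generator can be run as a script, optionally with an output path, or imported and driven programmatically. A sketch of the latter, assuming the tests directory is importable and with an illustrative output path:

# Hypothetical usage; the path is illustrative.
from pathlib import Path
from make_references_file import create_file

create_file(Path("/tmp/references.hdf5"))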
20 changes: 20 additions & 0 deletions tests/test_references.py
@@ -1,13 +1,25 @@
""" Unit tests for pyfive references. """
import os
import sys
import subprocess

import numpy as np
from numpy.testing import assert_array_equal, assert_raises
import pytest

import pyfive

DIRNAME = os.path.dirname(__file__)
REFERENCES_HDF5_FILE = os.path.join(DIRNAME, 'references.hdf5')
MAKE_REFERENCES_SCRIPT = os.path.join(DIRNAME, 'make_references_file.py')


@pytest.fixture(scope="module")
def references_hdf5(tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("references")
path = tmp_dir / "references.hdf5"
subprocess.run([sys.executable, MAKE_REFERENCES_SCRIPT, str(path)], check=True)
return str(path)


def test_reference_attrs():
@@ -158,3 +170,11 @@ def test_region_reference_attrs():
     subset = dset1[region_ref]
     assert_array_equal(subset, [0, 2])
     """
+
+def test_uninitialized_references(references_hdf5):
+    with pyfive.File(references_hdf5) as hfile:
+        # testing id of connected dimension
+        assert hfile[hfile["foo1"].attrs["DIMENSION_LIST"][0][0]].id == hfile["x"].id
+        # test empty aka NULL return
+        assert hfile["foo1"].attrs["DIMENSION_LIST"][1].tolist() == []

16 changes: 12 additions & 4 deletions tests/test_vlen_str.py
@@ -11,8 +11,12 @@ def make_file_hdf5(our_file, vlen_strings):
     with h5py.File(our_file,'w') as hfile:

         dt = h5py.special_dtype(vlen=str)
-        v = hfile.create_dataset("var_len_str", (2,), dtype=dt)
-        v[:] = vlen_strings
+        v = hfile.create_dataset("var_len_str", (4,), dtype=dt)
+        v[:2] = vlen_strings
+
+        v = hfile.create_dataset("var_len_str_fv", (4,), dtype=dt,
+                                 fillvalue=b"this really fills the data")
+        v[:2] = vlen_strings


 def make_file_nc(file_like,m_array, inmemory=False):
@@ -134,7 +138,7 @@ def test_vlen_string_hdf5(tmp_path):
     #tfile = io.BytesIO()
     our_file = tmp_path/'h5py_vlen.hdf5'
     our_view = tmp_path/'h5py_vlen.txt'
-    vlen_strings = ["foo","foobar"]
+    vlen_strings = ["foo", "foobar"]
     make_file_hdf5(our_file, vlen_strings)
     #os.system(f'h5dump {our_file} > {our_view}')
     #with open(our_view,'r') as f:
@@ -144,8 +148,12 @@
     with pyfive.File(our_file) as hfile:

         ds1 = hfile['var_len_str'][:]
+        ds2 = hfile['var_len_str_fv'][:]
         print(ds1)
-        assert np.array_equal(ds1,vlen_strings)
+        print(ds2)
+        assert np.array_equal(ds1[:2], vlen_strings)
+        assert np.array_equal(ds2[:2], vlen_strings)
+        assert ds2[3] == "this really fills the data"

 def NOtest_vlen_string_nc1():
     """ this verson currently fails because netcdf4 is doing something odd in memory """
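The new assertions depend on elements 2 and 3 never being written, so reads fall back to the declared fill value. A condensed standalone sketch of the round trip being tested, written with h5py and read back with pyfive; the temp path and dataset name are illustrative:

import h5py
import pyfive

# illustrative standalone version of the round trip exercised by the test
path = "/tmp/vlen_fv.hdf5"
dt = h5py.special_dtype(vlen=str)
with h5py.File(path, "w") as hfile:
    v = hfile.create_dataset("s", (4,), dtype=dt, fillvalue=b"filler")
    v[:2] = ["foo", "foobar"]   # elements 2 and 3 stay unwritten

with pyfive.File(path) as hfile:
    data = hfile["s"][:]
    assert list(data[:2]) == ["foo", "foobar"]
    assert data[3] == "filler"  # unwritten element comes back as the fill value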