- 
                Notifications
    You must be signed in to change notification settings 
- Fork 24
Closed
Labels
Description
Consider this dataset created with h5py:
import h5py
with h5py.File("test.hdf5", mode="w") as h5:
    ds = h5.create_dataset('vlen_strings', shape=10, dtype=h5py.string_dtype(encoding="utf-8"))
    ds[:4] = ["varying", "sizes", "of", "strings"]
    print(ds.asstr()[:])
Output:
['varying' 'sizes' 'of' 'strings' '' '' '' '' '' '']
h5dump output (note the NULL entries for the elements that were never written):
HDF5 "test.hdf5" {
GROUP "/" {
   DATASET "vlen_strings" {
      DATATYPE  H5T_STRING {
         STRSIZE H5T_VARIABLE;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_UTF8;
         CTYPE H5T_C_S1;
      }
      DATASPACE  SIMPLE { ( 10 ) / ( 10 ) }
      DATA {
      (0): "varying", "sizes", "of", "strings", NULL, NULL, NULL, NULL, NULL,
      (9): NULL
      }
   }
}
}
This breaks when reading back with pyfive:
import pyfive
with pyfive.File("test.hdf5") as h5:
    print(h5["vlen_strings"])
    print(h5["vlen_strings"][:])
<HDF5 dataset "vlen_strings": shape (10,), type "object">
Traceback (most recent call last):
  File "/home/kai/data/mambaforge/envs/xr_312_np2/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-2811624e6d8d>", line 4, in <module>
    print(h5["vlen_strings"][:])
          ~~~~~~~~~~~~~~~~~~^^^
  File "/home/kai/python/projects/pyfive/pyfive/high_level.py", line 313, in __getitem__
    data = self.id.get_data(args, self.fillvalue)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kai/python/projects/pyfive/pyfive/h5d.py", line 177, in get_data
    return self._get_contiguous_data(args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kai/python/projects/pyfive/pyfive/h5d.py", line 335, in _get_contiguous_data
    array = get_vlen_string_data_contiguous(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kai/python/projects/pyfive/pyfive/misc_low_level.py", line 360, in get_vlen_string_data_contiguous
    gheap = GlobalHeap(fh, gheap_address)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/kai/python/projects/pyfive/pyfive/misc_low_level.py", line 138, in __init__
    assert header['signature'] == b'GCOL'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
I've traced this back to the attempt to read those NULL entries. A PR with a fix is underway.