18 changes: 11 additions & 7 deletions pyfive/dataobjects.py
@@ -301,13 +301,17 @@ def _vlen_size_and_data(self, buf, offset):
         # stored in the data object storage.
         gheap_id = _unpack_struct_from(GLOBAL_HEAP_ID, buf, offset+4)
         gheap_address = gheap_id['collection_address']
-        #print('Collection address in _vlen', gheap_address)
-        if gheap_address not in self._global_heaps:
-            # load the global heap and cache the instance
-            gheap = GlobalHeap(self.fh, gheap_address)
-            self._global_heaps[gheap_address] = gheap
-        gheap = self._global_heaps[gheap_address]
-        vlen_data = gheap.objects[gheap_id['object_index']]
+        # only work on valid global heap addresses
+        if gheap_address != 0:
+            #print('Collection address in _vlen', gheap_address)
+            if gheap_address not in self._global_heaps:
+                # load the global heap and cache the instance
+                gheap = GlobalHeap(self.fh, gheap_address)
+                self._global_heaps[gheap_address] = gheap
+            gheap = self._global_heaps[gheap_address]
+            vlen_data = gheap.objects[gheap_id['object_index']]
+        else:
+            vlen_data = None
         return vlen_size, vlen_data
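The guard matches how HDF5 encodes a NULL variable-length datum: a global heap ID whose collection address is zero. A minimal sketch of the lookup pattern, not pyfive's actual method; `heaps` stands in for the `_global_heaps` cache, `gheap_id` for the unpacked GLOBAL_HEAP_ID struct, and the import assumes GlobalHeap lives in pyfive.misc_low_level as the diff suggests:

from pyfive.misc_low_level import GlobalHeap

def lookup_vlen(fh, heaps, gheap_id):
    # Sketch only: `heaps` and `gheap_id` are illustrative stand-ins.
    address = gheap_id['collection_address']
    if address == 0:
        # a zero collection address encodes a NULL vlen entry
        return None
    if address not in heaps:
        # load the global heap once, then reuse the cached instance
        heaps[address] = GlobalHeap(fh, address)
    return heaps[address].objects[gheap_id['object_index']]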


9 changes: 5 additions & 4 deletions pyfive/h5d.py
@@ -84,7 +84,7 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4):
         self._msg_offset, self.layout_class,self.property_offset = dataobject.get_id_storage_params()
         self._unique = (self._filename, self.shape, self._msg_offset)

-        if isinstance(dataobject.dtype,tuple):
+        if isinstance(dataobject.dtype, tuple):
             if dataobject.dtype[0] == 'ENUMERATION':
                 self._dtype = np.dtype(dataobject.dtype[1], metadata={'enum':dataobject.dtype[2]})
             else:
@@ -174,7 +174,7 @@ def get_data(self, args, fillvalue):
                     # created. One for the future.
                     return np.full(self.shape, fillvalue, dtype=dtype)[args]
                 else:
-                    return self._get_contiguous_data(args)
+                    return self._get_contiguous_data(args, fillvalue)
             case 2: # chunked storage
                 if not self._index:
                     # no storage is backing array, return an array of
@@ -312,7 +312,7 @@ def _build_index(self, dataobject):
             self._nthindex.append(key)
             self._index[key] = StoreInfo(key, filter_mask, addr, size)

-    def _get_contiguous_data(self, args):
+    def _get_contiguous_data(self, args, fillvalue):

         if isinstance(self._dtype, tuple):
             dtype_class = self._dtype[0]
@@ -337,7 +337,8 @@ def _get_contiguous_data(self, args):
                 self.data_offset,
                 self._global_heaps,
                 self.shape,
-                self._dtype
+                self._dtype,
+                fillvalue,
             )
         if self.posix:
             fh.close()
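Visible in the same file, the ENUMERATION branch attaches the enum mapping as numpy dtype metadata. A standalone illustration of that numpy feature, with made-up names and values:

import numpy as np

# attach an enum mapping as dtype metadata, as the ENUMERATION branch does;
# the mapping here is purely illustrative
dt = np.dtype('<i4', metadata={'enum': {'LOW': 0, 'HIGH': 1}})
print(dt.metadata['enum']['HIGH'])  # -> 1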
29 changes: 20 additions & 9 deletions pyfive/misc_low_level.py
@@ -337,30 +337,41 @@ def _indirect_info(self, nrows):
         return ndirect, nindirect

 def get_vlen_string_data_contiguous(
-    fh, data_offset, global_heaps, shape, dtype
+    fh, data_offset, global_heaps, shape, dtype, fillvalue
 ):
     """ Return the data for a variable which is made up of variable length string data """
     # we need to import this from DatasetID, and that's imported from Dataobjects hence
     # hiding it here in misc_low_level.
+    if fillvalue in [0, None]:
+        fillvalue = b""
+
     fh.seek(data_offset)
     count = prod(shape)
     _, _, character_set = dtype
     if int(character_set) not in [0, 1]:
         raise ValueError(f'Unexpected string type, cannot decode character set {character_set}')
-    value = np.empty(count,dtype=object)
+
+    # create with fillvalue
+    value = np.full(count, fillvalue, dtype=object)
     offset = 0
     buf = fh.read(16*count)
     for i in range(count):
         # vlen_size, = struct.unpack_from('<I', buf, offset=offset)
         gheap_id = _unpack_struct_from(GLOBAL_HEAP_ID, buf, offset+4)
         gheap_address = gheap_id['collection_address']
-        #print('Collection address for data', gheap_address)
-        if gheap_address not in global_heaps:
-            # load the global heap and cache the instance
-            gheap = GlobalHeap(fh, gheap_address)
-            global_heaps[gheap_address] = gheap
-        gheap = global_heaps[gheap_address]
-        value[i] = gheap.objects[gheap_id['object_index']]
+        # only work on valid global heap addresses
+        if gheap_address != 0:
+            #print('Collection address for data', gheap_address)
+            if gheap_address not in global_heaps:
+                # load the global heap and cache the instance
+                gheap = GlobalHeap(fh, gheap_address)
+                global_heaps[gheap_address] = gheap
+            gheap = global_heaps[gheap_address]
+
+            # skip if NULL vlen entry
+            if (obj_index:=gheap_id['object_index']) != 0:
+                value[i] = gheap.objects[obj_index]
+
         offset +=16

     # If character_set == 0 ascii character set, return as
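Because the output array is now pre-populated with the fill value, any element whose heap ID is NULL simply keeps it; HDF5 reports an unset fill value as 0 or None, which the new code normalises to an empty byte string first. A small standalone illustration of the np.full behaviour, with made-up values:

import numpy as np

fillvalue = b"missing"
value = np.full(4, fillvalue, dtype=object)   # every slot starts filled
value[0] = b"foo"        # slots with valid heap entries get overwritten
value[2] = b"foobar"
print(value)             # [b'foo' b'missing' b'foobar' b'missing']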
137 changes: 79 additions & 58 deletions tests/make_references_file.py
@@ -1,63 +1,84 @@
 #! /usr/bin/env python
 """ Create a HDF5 file with references. """
+import sys
 import h5py
 import numpy as np
+from pathlib import Path

-f = h5py.File('references.hdf5', 'w')
-
-# some HDF5 objects for testing
-f.attrs.create('root_attr', 123)
-
-dset1 = f.create_dataset(
-    'dataset1', shape=(4, ), dtype='<i4', data=np.arange(4), track_times=False)
-dset1.attrs.create('dset_attr', 456)
-region_ref = dset1.regionref[::2]
-
-grp = f.create_group('group1')
-grp.attrs.create('group_attr', 789)
-
-# references
-f.attrs['root_group_reference'] = f.ref
-f.attrs['dataset1_reference'] = dset1.ref
-f.attrs['group1_reference'] = grp.ref
-f.attrs['dataset1_region_reference'] = region_ref
-
-# variable length sequence of references sequence
-val = np.empty((2, ), dtype=np.object)
-ref_dtype = h5py.special_dtype(ref=h5py.Reference)
-val[0] = np.array([f.ref], dtype=ref_dtype)
-val[1] = np.array([dset1.ref, grp.ref], dtype=ref_dtype)
-dt = h5py.special_dtype(vlen=ref_dtype)
-f.attrs.create('vlen_refs', val, dtype=dt)
-
-# array of references
-ref_dtype = h5py.special_dtype(ref=h5py.Reference)
-
-ref_dataset = f.create_dataset(
-    "ref_dataset", (4,), dtype=ref_dtype, track_times=False)
-ref_dataset[0] = f.ref
-ref_dataset[1] = dset1.ref
-ref_dataset[2] = grp.ref
-# ref_dataset[3] is a Null reference
-
-chunked_ref_dataset = f.create_dataset(
-    "chunked_ref_dataset", (4,), chunks=(2, ), dtype=ref_dtype,
-    track_times=False)
-chunked_ref_dataset[0] = f.ref
-chunked_ref_dataset[1] = dset1.ref
-chunked_ref_dataset[2] = grp.ref
-# chunked_ref_dataset[3] is a Null reference
-
-regionref_dtype = h5py.special_dtype(ref=h5py.RegionReference)
-
-regionref_dataset = f.create_dataset(
-    "regionref_dataset", (2,), dtype=regionref_dtype, track_times=False)
-regionref_dataset[0] = region_ref
-
-chunked_regionref_dataset = f.create_dataset(
-    "chunked_regionref_dataset", (2,), chunks=(1, ), dtype=regionref_dtype,
-    track_times=False)
-chunked_regionref_dataset[0] = region_ref
-# chunked_regionref_dataset[1] is a Null reference
-
-f.close()
+
+def create_file(path):
+    with h5py.File(path, 'w') as f:
+
+        # some HDF5 objects for testing
+        f.attrs.create('root_attr', 123)
+
+        dset1 = f.create_dataset(
+            'dataset1', shape=(4, ), dtype='<i4', data=np.arange(4), track_times=False)
+        dset1.attrs.create('dset_attr', 456)
+        region_ref = dset1.regionref[::2]
+
+        grp = f.create_group('group1')
+        grp.attrs.create('group_attr', 789)
+
+        # references
+        f.attrs['root_group_reference'] = f.ref
+        f.attrs['dataset1_reference'] = dset1.ref
+        f.attrs['group1_reference'] = grp.ref
+        f.attrs['dataset1_region_reference'] = region_ref
+
+        # variable length sequence of references sequence
+        val = np.empty((2, ), dtype=object)
+        ref_dtype = h5py.special_dtype(ref=h5py.Reference)
+        val[0] = np.array([f.ref], dtype=ref_dtype)
+        val[1] = np.array([dset1.ref, grp.ref], dtype=ref_dtype)
+        dt = h5py.special_dtype(vlen=ref_dtype)
+        f.attrs.create('vlen_refs', val, dtype=dt)
+
+        # array of references
+        ref_dtype = h5py.special_dtype(ref=h5py.Reference)
+
+        ref_dataset = f.create_dataset(
+            "ref_dataset", (4,), dtype=ref_dtype, track_times=False)
+        ref_dataset[0] = f.ref
+        ref_dataset[1] = dset1.ref
+        ref_dataset[2] = grp.ref
+        # ref_dataset[3] is a Null reference
+
+        chunked_ref_dataset = f.create_dataset(
+            "chunked_ref_dataset", (4,), chunks=(2, ), dtype=ref_dtype,
+            track_times=False)
+        chunked_ref_dataset[0] = f.ref
+        chunked_ref_dataset[1] = dset1.ref
+        chunked_ref_dataset[2] = grp.ref
+        # chunked_ref_dataset[3] is a Null reference
+
+        regionref_dtype = h5py.special_dtype(ref=h5py.RegionReference)
+
+        regionref_dataset = f.create_dataset(
+            "regionref_dataset", (2,), dtype=regionref_dtype, track_times=False)
+        regionref_dataset[0] = region_ref
+
+        chunked_regionref_dataset = f.create_dataset(
+            "chunked_regionref_dataset", (2,), chunks=(1, ), dtype=regionref_dtype,
+            track_times=False)
+        chunked_regionref_dataset[0] = region_ref
+        # chunked_regionref_dataset[1] is a Null reference
+
+        # uninitialized references
+        # the following code creates a partly uninitialized attribute
+        # DIMENSION_LIST
+        # it seems creating attributes the normal way are always fully initialized
+        foo_data = np.arange(4).reshape(2, 2)
+        f.create_dataset("foo1", data=foo_data)
+        f.create_dataset("x", data=np.arange(2))
+        f.create_dataset("y", data=np.arange(2))
+
+        f["x"].make_scale()
+        f["y"].make_scale()
+        f["foo1"].dims[0].attach_scale(f["x"])
+
+
+if __name__ == "__main__":
+    default_path = Path(__file__).parent / "references.hdf5"
+    filepath = Path(sys.argv[1]) if len(sys.argv) > 1 else default_path
+    create_file(filepath)
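With the refactor the generator can be run as a script, optionally with an output path, or imported and driven programmatically. A sketch of the latter, assuming the tests directory is importable and with an illustrative output path:

# Hypothetical usage; the path is illustrative.
from pathlib import Path
from make_references_file import create_file

create_file(Path("/tmp/references.hdf5"))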
20 changes: 20 additions & 0 deletions tests/test_references.py
@@ -1,13 +1,25 @@
""" Unit tests for pyfive references. """
import os
import sys
import subprocess

import numpy as np
from numpy.testing import assert_array_equal, assert_raises
import pytest

import pyfive

DIRNAME = os.path.dirname(__file__)
REFERENCES_HDF5_FILE = os.path.join(DIRNAME, 'references.hdf5')
MAKE_REFERENCES_SCRIPT = os.path.join(DIRNAME, 'make_references_file.py')


@pytest.fixture(scope="module")
def references_hdf5(tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("references")
path = tmp_dir / "references.hdf5"
subprocess.run([sys.executable, MAKE_REFERENCES_SCRIPT, str(path)], check=True)
return str(path)


def test_reference_attrs():
@@ -158,3 +170,11 @@ def test_region_reference_attrs():
     subset = dset1[region_ref]
     assert_array_equal(subset, [0, 2])
     """
+
+def test_uninitialized_references(references_hdf5):
+    with pyfive.File(references_hdf5) as hfile:
+        # testing id of connected dimension
+        assert hfile[hfile["foo1"].attrs["DIMENSION_LIST"][0][0]].id == hfile["x"].id
+        # test empty aka NULL return
+        assert hfile["foo1"].attrs["DIMENSION_LIST"][1].tolist() == []

16 changes: 12 additions & 4 deletions tests/test_vlen_str.py
@@ -11,8 +11,12 @@ def make_file_hdf5(our_file, vlen_strings):
     with h5py.File(our_file,'w') as hfile:

         dt = h5py.special_dtype(vlen=str)
-        v = hfile.create_dataset("var_len_str", (2,), dtype=dt)
-        v[:] = vlen_strings
+        v = hfile.create_dataset("var_len_str", (4,), dtype=dt)
+        v[:2] = vlen_strings
+
+        v = hfile.create_dataset("var_len_str_fv", (4,), dtype=dt,
+                                 fillvalue=b"this really fills the data")
+        v[:2] = vlen_strings


 def make_file_nc(file_like,m_array, inmemory=False):
@@ -134,7 +138,7 @@ def test_vlen_string_hdf5(tmp_path):
     #tfile = io.BytesIO()
     our_file = tmp_path/'h5py_vlen.hdf5'
     our_view = tmp_path/'h5py_vlen.txt'
-    vlen_strings = ["foo","foobar"]
+    vlen_strings = ["foo", "foobar"]
     make_file_hdf5(our_file, vlen_strings)
     #os.system(f'h5dump {our_file} > {our_view}')
     #with open(our_view,'r') as f:
@@ -144,8 +148,12 @@
     with pyfive.File(our_file) as hfile:

         ds1 = hfile['var_len_str'][:]
+        ds2 = hfile['var_len_str_fv'][:]
         print(ds1)
-        assert np.array_equal(ds1,vlen_strings)
+        print(ds2)
+        assert np.array_equal(ds1[:2], vlen_strings)
+        assert np.array_equal(ds2[:2], vlen_strings)
+        assert ds2[3] == "this really fills the data"

 def NOtest_vlen_string_nc1():
     """ this verson currently fails because netcdf4 is doing something odd in memory """
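The new assertions depend on elements 2 and 3 never being written, so reads fall back to the declared fill value. A condensed standalone sketch of the round trip being tested, written with h5py and read back with pyfive; the temp path and dataset name are illustrative:

import h5py
import pyfive

# illustrative standalone version of the round trip exercised by the test
path = "/tmp/vlen_fv.hdf5"
dt = h5py.special_dtype(vlen=str)
with h5py.File(path, "w") as hfile:
    v = hfile.create_dataset("s", (4,), dtype=dt, fillvalue=b"filler")
    v[:2] = ["foo", "foobar"]   # elements 2 and 3 stay unwritten

with pyfive.File(path) as hfile:
    data = hfile["s"][:]
    assert list(data[:2]) == ["foo", "foobar"]
    assert data[3] == "filler"  # unwritten element comes back as the fill value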