From 9bf85babdfbb90bcfaa4466e646e100baa84d07d Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Tue, 21 Oct 2025 14:33:15 +0100
Subject: [PATCH 01/16] Calculate b-tree range

---
 pyfive/h5d.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/pyfive/h5d.py b/pyfive/h5d.py
index 511ff80..b1b6895 100644
--- a/pyfive/h5d.py
+++ b/pyfive/h5d.py
@@ -249,6 +249,22 @@ def index(self):
             raise ValueError(f'No chunk index available for HDF layout class {self.layout_class}')
         else:
             return self._index
+
+    ##### This property is made available to help understand object store performance
+    @property
+    def btree_range(self):
+        """ A tuple with the addresses of the first b-tree node
+        for this variable, and the address of the furthest away node
+        (which may not be the last one in the chunk index). This property
+        may be of use in understanding the read performance of chunked
+        data in object stores. ``btree_range`` is a ``pyfive`` API extension.
+        """
+        if self._index is None:
+            raise ValueError(f'No b-tree available for HDF layout class {self.layout_class}')
+        else:
+            return (self._btree_start, self._btree_end)
+
+
     #### The following method can be used to set pseudo chunking size after the
     #### file has been closed and before data transactions. This is pyfive specific
     def set_pseudo_chunk_size(self, newsize_MB):
@@ -311,7 +327,10 @@ def _build_index(self, dataobject):
 
         self._index = {}
         self._nthindex = []
+
         for node in chunk_btree.all_nodes[0]:
+            self._btree_start=node['addresses'][0]
+            self._btree_end=node['addresses'][0]
             for node_key, addr in zip(node['keys'], node['addresses']):
                 start = node_key['chunk_offset'][:-1]
                 key = start
@@ -319,6 +338,9 @@ def _build_index(self, dataobject):
                 filter_mask = node_key['filter_mask']
                 self._nthindex.append(key)
                 self._index[key] = StoreInfo(key, filter_mask, addr, size)
+                self._btree_end=max(addr,self._btree_end)
+
+
 
     def _get_contiguous_data(self, args, fillvalue):

From 70ad54ea0b7aa886590de5a1bde7b1d70f1d9d88 Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Wed, 22 Oct 2025 13:38:06 +0100
Subject: [PATCH 02/16] Remove redundant chunk handling from btree.py (part of
 #131)

---
 pyfive/btree.py | 49 -------------------------------------------------
 1 file changed, 49 deletions(-)

diff --git a/pyfive/btree.py b/pyfive/btree.py
index 5d34258..7ce119b 100644
--- a/pyfive/btree.py
+++ b/pyfive/btree.py
@@ -151,55 +151,6 @@ def _read_node(self, offset, node_level):
         node['addresses'] = addresses
         return node
 
-    def construct_data_from_chunks(
-            self, chunk_shape, data_shape, dtype, filter_pipeline):
-        """ Build a complete data array from chunks. """
-        if isinstance(dtype, tuple):
-            true_dtype = tuple(dtype)
-            dtype_class = dtype[0]
-            if dtype_class == 'REFERENCE':
-                size = dtype[1]
-                if size != 8:
-                    raise NotImplementedError('Unsupported Reference type')
-                dtype = '<u8'
[the rest of the deleted construct_data_from_chunks body, the end of this patch,
and the commit hash line of PATCH 03 were lost when this section was extracted]

From: Bryan Lawrence
Date: Wed, 22 Oct 2025 16:47:58 +0100
Subject: [PATCH 03/16] Just making a note that we need to be careful about
 chunk indexing should we have a v3 layout in the future.

---
 pyfive/h5d.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pyfive/h5d.py b/pyfive/h5d.py
index b1b6895..fb28a19 100644
--- a/pyfive/h5d.py
+++ b/pyfive/h5d.py
@@ -321,6 +321,10 @@ def _build_index(self, dataobject):
 
         logging.info(f'Building chunk index in pyfive {version("pyfive")}')
 
+    #FIXME: How do we know it's a V1 B-tree?
+ # There are potentially five different chunk indexing options according to + # https://docs.hdfgroup.org/archive/support/HDF5/doc/H5.format.html#AppendixC + chunk_btree = BTreeV1RawDataChunks( dataobject.fh, dataobject._chunk_address, dataobject._chunk_dims) From 1c9bef4a91a3f3f4ab3241ebe0ce7f66345faf34 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Wed, 22 Oct 2025 16:48:56 +0100 Subject: [PATCH 04/16] First cut at supporting ncdump like behaviour. Doesn't have support for file or group attributes yet, or phony dimensions. --- .vscode/settings.json | 16 ++++++++ pyfive/__init__.py | 1 + pyfive/inspect.py | 83 +++++++++++++++++++++++++++++++++++++++++ pyfive/p5dump.py | 39 +++++++++++++++++++ pyproject.toml | 3 ++ tests/test_dump.py | 48 ++++++++++++++++++++++++ tests/test_mock_s3fs.py | 10 ++++- 7 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json create mode 100644 pyfive/inspect.py create mode 100644 pyfive/p5dump.py create mode 100644 tests/test_dump.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..8391adb --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,16 @@ +{ + "python.defaultInterpreterPath": "/Users/bnl28/mambaforge/envs/pyfive-25aug/bin/python", + "python.terminal.activateEnvironment":true, + "terminal.integrated.profiles.osx": { + "zsh": { + "path": "/bin/zsh", + "args": ["-l"] + } + }, + "terminal.integrated.defaultProfile.osx": "zsh", + "esbonio.server.env": { + "PATH": "/Users/bnl28/mambaforge/envs/pyfive-25aug/bin:${env:PATH}", + "PYTHONPATH": "/Users/bnl28/mambaforge/envs/pyfive-25aug/lib/python3.12/site-packages" + }, + "esbonio.server.pythonPath": "/Users/bnl28/mambaforge/envs/pyfive-25aug/bin/python" +} diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 7a83096..f2c54dc 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -8,6 +8,7 @@ from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype, opaque_dtype, check_opaque_dtype from pyfive.h5py import Datatype, Empty from importlib.metadata import version +from pyfive.inspect import p5ncdump __version__ = '0.5.0.dev' diff --git a/pyfive/inspect.py b/pyfive/inspect.py new file mode 100644 index 0000000..252dcee --- /dev/null +++ b/pyfive/inspect.py @@ -0,0 +1,83 @@ +from pathlib import Path +import pyfive +def clean_types(dtype): + """Convert a numpy dtype to classic ncdump type string.""" + # Strip endianness (> or <) and map to ncdump types + kind = dtype.kind + itemsize = dtype.itemsize + if kind == "f": # floating point + return f"float{itemsize*8}" + elif kind == "i": # signed integer + return f"int{itemsize*8}" + elif kind == "u": # unsigned integer + return f"uint{itemsize*8}" + elif kind == "S" or kind == "a": # fixed-length bytes + return "char" + else: + return str(dtype) # fallback + + +def dump_header(f, filename): + + print(f"File: {filename} "+'{') + dims = set() + datasets = {name: f[name] for name in f.keys() if hasattr(f[name], "shape")} + for ds in datasets.values(): + for dim in ds.dims: + for scale in dim: + dims.add((scale.name.split('/')[-1],scale.shape[0])) + if dims: + print("dimensions:") + for d in dims: + print(f' {d[0]}={d[1]};') + + print("variables:") + for name,ds in datasets.items(): + + # Variable type + dtype_str = clean_types(ds.dtype) + + # Dimensions for this variable (use dims if available) + if hasattr(ds, "dims") and len(ds.dims) > 0: + dim_names = [scale.name.split('/')[-1] for dim in ds.dims for scale in dim] + else: + # fallback: no dims + dim_names = [] + 
+ dim_str = "(" + ", ".join(dim_names) + ")" if dim_names else "" + print(f" {dtype_str} {name}{dim_str};") + + # Attributes + ommit = ['CLASS','NAME','_Netcdf4Dimid','REFERENCE_LIST','DIMENSION_LIST','_Netcdf4Coordinates'] + for attr_name, attr_val in ds.attrs.items(): + if attr_name not in ommit: + if isinstance(attr_val, bytes): + attr_val = f'"{attr_val.decode("utf-8")}"' + print(f" {name}:{attr_name} = {attr_val};") + print('}') + +def p5ncdump(file_path, special=False): + + if special: + raise NotImplementedError + + # handle posix and S3 differently + filename = getattr(file_path,'full_name', None) + if filename is None: + filename = file_path + filename = Path(filename).name + + try: + print('Now going to pyfive') + with pyfive.File(file_path) as f: + # Attach dims if not already attached + print('opened') + for name in f.keys(): + ds = f[name] #bugger this is a b-tree read + if hasattr(ds, "shape") and not hasattr(ds, "dims"): + # internally pyfive may attach dims automatically, but safe to attach here + ds.dims # access triggers dimension proxies + dump_header(f, filename) + except NotImplementedError as e: + if 'unsupported superblock' in str(e): + raise ValueError('Not an HDF5 or NC4 file!') \ No newline at end of file diff --git a/pyfive/p5dump.py b/pyfive/p5dump.py new file mode 100644 index 0000000..f6c53aa --- /dev/null +++ b/pyfive/p5dump.py @@ -0,0 +1,39 @@ +from pyfive import p5ncdump +import sys + +def main(argv=None): + """ + Provides some of the functionality of tools like ncdump and h5dump. + By default this will attempt to do something similar to ncdump. + - h will return this information + - s (not yet implemented) will provide additional information + """ + if argv is None: + argv = sys.argv[1:] # ignore script name + + match argv: + # script → error (no filename) + case []: + raise ValueError("No filename provided") + + # script -h → help + case ["-h"]: + print(main.__doc__) + return 0 + + # script filename + case [filename]: + p5ncdump(filename, special=False) + return 0 + + # script -s filename + case ["-s", filename]: + p5ncdump(filename, special=True) + return 0 + + # Anything else → error + case _: + raise ValueError(f"Invalid arguments: {argv}") + +if __name__ == '__main__': + sys.exit(main()) diff --git a/pyproject.toml b/pyproject.toml index e83b13f..ce6333d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,9 @@ license = {text = "BSD License, Version 3-Clause"} name = "pyfive" requires-python = ">=3.10" +[project.scripts] +p5dump = "pyfive.p5dump:main" + [project.optional-dependencies] test = [ "pytest>6.0.0", diff --git a/tests/test_dump.py b/tests/test_dump.py new file mode 100644 index 0000000..5b4dc24 --- /dev/null +++ b/tests/test_dump.py @@ -0,0 +1,48 @@ +import pytest +from pyfive.p5dump import main +import os + + +DIRNAME = os.path.dirname(__file__) +EARLIEST_HDF5_FILE = os.path.join(DIRNAME, 'data', 'earliest.hdf5') + +# +# A standard nicely behaved netcdf4 file is tested in test_mock_s3fs +# Kill two birds with one stone there. 
+#

def test_old_hd5_with_groups(capsys):
    filename = EARLIEST_HDF5_FILE

    # No exception means success
    assert main([filename]) == 0

    captured = capsys.readouterr()

    #currently failing
    assert 'phony_dim_0' in captured.out
    assert 'dataset3(phony_dim_0)' in captured.out
    assert 'string :attr5 = "Test"' in captured.out


# Test: script -s filename (special mode)
def test_main_special_real():
    filename = "tests/data/sample.nc"
    with pytest.raises(NotImplementedError):
        assert main(["-s", filename]) == 0

# Test: -h should print help
def test_main_help_real(capsys):
    main(["-h"])
    captured = capsys.readouterr()
    assert "Provides some of the functionality" in captured.out

# Test: no filename → error
def test_main_no_args_real():
    with pytest.raises(ValueError):
        main([])

# Test: invalid flag → error
def test_main_invalid_args_real():
    with pytest.raises(ValueError):
        main(["-x", "file.nc"])
\ No newline at end of file
diff --git a/tests/test_mock_s3fs.py b/tests/test_mock_s3fs.py
index 7beffb2..6cc0a59 100644
--- a/tests/test_mock_s3fs.py
+++ b/tests/test_mock_s3fs.py
@@ -24,7 +24,7 @@ def test_s3fs_s3(s3fs_s3):
     assert mock_s3_filesystem.client_kwargs == {'endpoint_url': 'http://127.0.0.1:5555/'}
 
 
-def test_s3file_with_s3fs(s3fs_s3):
+def test_s3file_with_s3fs(s3fs_s3, capsys):
     """
     This test spoofs a complete s3fs FileSystem via s3fs_s3,
     creates a mock bucket inside it, then puts a REAL netCDF4 file in it,
@@ -57,3 +57,11 @@ def test_s3file_with_s3fs(s3fs_s3):
         pyfive_ds = pyfive.File(f)
         print(f"Dataset loaded from mock S3 with s3fs and Pyfive: ds")
         assert "q" in pyfive_ds
+
+    # test the command line main via test_s3
+    with s3.open(os.path.join("MY_BUCKET", file_name), "rb") as f:
+        pyfive.p5ncdump(f)
+
+    captured = capsys.readouterr()
+    assert ('File: issue23_A.nc' in captured.out)
+    assert ('q:cell_methods = "area: mean"' in captured.out)
\ No newline at end of file

From 9c1b6889818af55c076ecd89a4820580f4e88c61 Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Thu, 23 Oct 2025 14:21:18 +0100
Subject: [PATCH 05/16] Working implementation of lazy access to variables
 (#135) and partial implementation of p5dump functionality (#134). Unit tests
 are failing due to a desire to get closer to (but not exactly match) what
 ncdump does.

---
 pyfive/h5d.py                     | 49 ++++++++++++++++++++++---------
 pyfive/high_level.py              | 41 ++++++++++++++++++++++++--
 pyfive/inspect.py                 |  6 +++-
 tests/test_chunk_index_options.py | 28 ++++++++++++++++++
 tests/test_mock_s3fs.py           |  3 +-
 5 files changed, 109 insertions(+), 18 deletions(-)
 create mode 100644 tests/test_chunk_index_options.py

diff --git a/pyfive/h5d.py b/pyfive/h5d.py
index fb28a19..a2168e3 100644
--- a/pyfive/h5d.py
+++ b/pyfive/h5d.py
@@ -12,6 +12,7 @@
 from importlib.metadata import version
 
 StoreInfo = namedtuple('StoreInfo',"chunk_offset filter_mask byte_offset size")
+ChunkIndex = namedtuple('ChunkIndex',"chunk_address chunk_dims")
 
 class DatasetID:
     """
@@ -27,10 +28,15 @@ class DatasetID:
     from the parent file access as both share underlying C-structures.*
     """
 
-    def __init__(self, dataobject, pseudo_chunking_size_MB=4):
+    def __init__(self, dataobject, noindex=False, pseudo_chunking_size_MB=4):
         """
         Instantiated with the ``pyfive`` dataset ``dataobject``, we copy
         and cache everything we want so that the only file operations are
         now data accesses.
+
+        noindex provides a method for controlling how lazy the data load
+        actually is. This version supports values of False (normal
+        behaviour: the index is read when the DatasetID is first
+        instantiated) or True (the index is only read when the data
+        is accessed).
 
         if ``pseudo_chunking_size_MB`` is set to a value greater than zero, and
         if the storage is not local posix (and hence ``np.mmap`` is not available) then
@@ -102,7 +108,9 @@ def __init__(self, dataobject, pseudo_chunking_size_MB=4):
 
         self._meta = DatasetMeta(dataobject)
 
-        self._index = None
+        self._index = None
+        self.__index_built = False
+        self._index_params = None
 
         # throws a flake8 wobbly for Python<3.10; match is Py3.10+ syntax
         match self.layout_class: # noqa
             case 0: #compact storage
             case 1: # contiguous storage
                 self.data_offset, = struct.unpack_from('<Q', ...)
[the remaining h5d.py hunks and the start of the high_level.py diff were lost when
this section was extracted; the text resumes inside __getitem__]
-        """ x.__getitem__(y) <==> x[y] """
+        """ x.__getitem__(y) <==> x[y].
+        """
+        return self.__getitem_lazy_control(y, noindex=False)
+
+
+    def get_lazy_view(self, y):
+        """
+        This instantiates the object y, and if it is a
+        chunked dataset, does so without reading the b-tree
+        index. This is useful for inspecting a variable
+        that you are not expecting to access. If you know you
+        want to access the data, and in particular, if you are
+        going to hand the data to Dask or something else, you
+        almost certainly want to read the index now, so
+        just do x[y] rather than x.get_lazy_view(y).
+
+        This is a ``pyfive`` extension to the standard h5py API.
+        """
+
+        return self.__getitem_lazy_control(y, noindex=True)
+
+
+    def __getitem_lazy_control(self, y, noindex):
+        """
+        This is the routine which actually does the get item,
+        but does it in such a way that we control how much laziness
+        is possible where we have chunked variables with b-trees.
+
+        We want to return y, but if y is a chunked dataset we
+        normally return it with a cached b-tree (noindex=False).
+        If noindex is True, we do not read the b-tree, and that
+        will be done when data is first read - which is fine
+        in a single-threaded environment, but in a parallel
+        environment you only want to read the index once
+        (so use noindex=False, which you get via the
+        normal getitem interface - x[y]).
+        """
+
         if isinstance(y, Reference):
             return self._dereference(y)
@@ -92,7 +129,7 @@ def __getitem__(self, y):
         if dataobjs.is_dataset:
             if additional_obj != '.':
                 raise KeyError('%s is a dataset, not a group' % (obj_name))
-            return Dataset(obj_name, DatasetID(dataobjs), self)
+            return Dataset(obj_name, DatasetID(dataobjs, noindex=noindex), self)
 
         try:
             # if true, this may well raise a NotImplementedError, if so, we need
diff --git a/pyfive/inspect.py b/pyfive/inspect.py
index 252dcee..59c7576 100644
--- a/pyfive/inspect.py
+++ b/pyfive/inspect.py
@@ -21,7 +21,11 @@ def dump_header(f, filename):
 
     print(f"File: {filename} "+'{')
     dims = set()
-    datasets = {name: f[name] for name in f.keys() if hasattr(f[name], "shape")}
+    datasets = {}
+    for name in f:
+        item = f.get_lazy_view(name)
+        if hasattr(item,"shape"):
+            datasets[name]=item
     for ds in datasets.values():
         for dim in ds.dims:
             for scale in dim:
diff --git a/tests/test_chunk_index_options.py b/tests/test_chunk_index_options.py
new file mode 100644
index 0000000..0384f8a
--- /dev/null
+++ b/tests/test_chunk_index_options.py
@@ -0,0 +1,28 @@
+""" Test pyfive's ability to read multidimensional datasets. """
""" +import os + +import numpy as np +from numpy.testing import assert_array_equal + +import pyfive + +DIRNAME = os.path.dirname(__file__) +DATASET_CHUNKED_HDF5_FILE = os.path.join(DIRNAME, "data", 'chunked.hdf5') + + +def test_lazy_index(): + + with pyfive.File(DATASET_CHUNKED_HDF5_FILE) as hfile: + + # instantiate variable + dset1 = hfile.get_lazy_view('dataset1') + + # should be able to see attributes but not have an index yet + assert dset1.attrs['attr1'] == 130 + + # test we have no index yet + assert dset1.id._DatasetID__index_built==False + + # this should force an index build + assert_array_equal(dset1[:], np.arange(21*16).reshape((21, 16))) + assert dset1.chunks == (2, 2) \ No newline at end of file diff --git a/tests/test_mock_s3fs.py b/tests/test_mock_s3fs.py index 6cc0a59..0896a4e 100644 --- a/tests/test_mock_s3fs.py +++ b/tests/test_mock_s3fs.py @@ -64,4 +64,5 @@ def test_s3file_with_s3fs(s3fs_s3, capsys): captured = capsys.readouterr() assert ('File: issue23_A.nc' in captured.out) - assert ('q:cell_methods = "area: mean"' in captured.out) \ No newline at end of file + assert ('q:cell_methods = "area: mean"' in captured.out) + assert (':Conventions = "CF-1.12"' in captured.out) \ No newline at end of file From 2c3a3e3657c72287f96d85262f0524466217fd94 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 24 Oct 2025 16:09:55 +0100 Subject: [PATCH 06/16] allow visititems to be lazy --- pyfive/high_level.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pyfive/high_level.py b/pyfive/high_level.py index 2a4ad4b..6c97b96 100644 --- a/pyfive/high_level.py +++ b/pyfive/high_level.py @@ -162,7 +162,7 @@ def visit(self, func): """ return self.visititems(lambda name, obj: func(name)) - def visititems(self, func): + def visititems(self, func, noindex=False): """ Recursively visit all objects in this group and subgroups. @@ -173,11 +173,25 @@ def visititems(self, func): Returning None continues iteration, return anything else stops and return that value from the visit method. + Use of the optional noindex=True will ensure that + all operations are not only lazy wrt data, but lazy + wrt to any chunked data indices. + """ root_name_length = len(self.name) if not self.name.endswith('/'): root_name_length += 1 - queue = deque(self.values()) + + # Use either normal access or lazy access: + if noindex: + # Avoid loading dataset indices + get_obj = self.get_lazy_view + else: + get_obj = self.__getitem__ + + # Initialize queue using the correct getter + queue = deque(get_obj(k) for k in self._links.keys()) + while queue: obj = queue.popleft() name = obj.name[root_name_length:] From 7827480f90ea86f41cea2d1f03b5f2f983aa957d Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 24 Oct 2025 16:10:20 +0100 Subject: [PATCH 07/16] p5dump works for the test cases --- pyfive/inspect.py | 170 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 133 insertions(+), 37 deletions(-) diff --git a/pyfive/inspect.py b/pyfive/inspect.py index 59c7576..f01bd94 100644 --- a/pyfive/inspect.py +++ b/pyfive/inspect.py @@ -1,5 +1,7 @@ from pathlib import Path -import pyfive + +from pyfive import Dataset, Group, File + def clean_types(dtype): """Convert a numpy dtype to classic ncdump type string.""" # Strip endianness (> or <) and map to ncdump types @@ -17,48 +19,141 @@ def clean_types(dtype): return str(dtype) # fallback -def dump_header(f, filename): +def collect_dimensions_from_root(root): + """ + Collect true netCDF-style dimensions from the root group only. 
From 7827480f90a86f41cea2d1f03b5f2f983aa957d Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Fri, 24 Oct 2025 16:10:20 +0100
Subject: [PATCH 07/16] p5dump works for the test cases

---
 pyfive/inspect.py | 170 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 133 insertions(+), 37 deletions(-)

diff --git a/pyfive/inspect.py b/pyfive/inspect.py
index 59c7576..f01bd94 100644
--- a/pyfive/inspect.py
+++ b/pyfive/inspect.py
@@ -1,5 +1,7 @@
 from pathlib import Path
-import pyfive
+
+from pyfive import Dataset, Group, File
+
 def clean_types(dtype):
     """Convert a numpy dtype to classic ncdump type string."""
     # Strip endianness (> or <) and map to ncdump types
@@ -17,48 +19,141 @@ def clean_types(dtype):
         return str(dtype)  # fallback
 
 
-def dump_header(f, filename):
+def collect_dimensions_from_root(root):
+    """
+    Collect true netCDF-style dimensions from the root group only.
+
+    Returns
+    -------
+    dims : dict
+        Maps dimension name (str) -> size (int)
+    """
+    dims = {}
+
+    for name in root:
+
+        obj = root.get_lazy_view(name)
+        # Must be a dataset to be a dimension scale
+        if not isinstance(obj,Dataset):
+            continue
+
+        # Must have CLASS="DIMENSION_SCALE" to qualify
+        if str(obj.attrs.get("CLASS")) == "b'DIMENSION_SCALE'":
+            # NetCDF stores the real dimension name under NAME
+            dim_name = obj.attrs.get("NAME").decode()
+            if dim_name.startswith('This is a netCDF dimension but not a'):
+                dim_name = name
+            # Use the first axis of its shape as the dimension size
+            size = obj.shape[0] if hasattr(obj, "shape") and obj.shape else None
+
+            # Only add if size makes sense
+            if size is not None:
+                dims[dim_name] = size
+
+    return dims
+
+def gather_dimensions(obj, alldims, phonys, real_dimensions):
+    """
+    Gather dimensions from dimension scales if present and, if not,
+    infer phony dimensions (to behave like netCDF reporting of an HDF5
+    file), so the dump is useful even if we are an HDF5-only application.
+    Monkey patch these dims alongside the existing dimension manager.
+    """
+
+    if not hasattr(obj,'__inspected_dims'):
+        obj.__inspected_dims=[]
+
+    oname = obj.name.split('/')[-1]
+
+    for axis, size in enumerate(obj.shape):
+
+        if obj.dims[axis]: # real scale exists
+            edim = (obj.dims[axis][0].name.split('/')[-1], size)
+
+        elif size in real_dimensions.values():
+            dim_name = next(name for name, sz in real_dimensions.items() if sz == size)
+            edim = (dim_name, size)
+        else:
+            # make or reuse a phony dimension name
+            if size not in phonys:
+                phonys[size] = f"phony_dim_{len(phonys)}"
+            pname = phonys[size]
+            edim = (pname,size)
+
+        obj.__inspected_dims.append(edim)
+        alldims.add(edim)
+
+    return obj, alldims, phonys
+
+
+def dump_header(obj, indent, real_dimensions):
+    """ Pretty print a group within an HDF5 file (including the root group) """
+
+    def printattr(attrs, ommit=[]):
+        """ Pretty print a set of attributes """
+        for k,v in attrs.items():
+            if k not in ommit:
+                if isinstance(v, bytes):
+                    v = f'"{v.decode("utf-8")}"'
+                print(f"{indent}{dindent}{dindent}{name}:{k} = {v} ;")
 
     dims = set()
     datasets = {}
+    groups = {}
+    phonys = {}
+
+    for name in obj:
+        item = obj.get_lazy_view(name)
+        if isinstance(item, Dataset):
+            if str(item.attrs.get('NAME','None')).startswith('This is a netCDF dimension but not a'):
+                continue
+            datasets[name]=item
+        elif isinstance(item, Group):
+            groups[name]=item
+
 
     for ds in datasets.values():
-        for dim in ds.dims:
-            for scale in dim:
-                dims.add((scale.name.split('/')[-1],scale.shape[0]))
+        ds, dims, phonys = gather_dimensions(ds, dims, phonys, real_dimensions)
     if dims:
-        print("dimensions:")
+        print(f"{indent}dimensions:")
+        dindent = '    '
         for d in dims:
-            print(f'    {d[0]}={d[1]};')
+            print(f'{indent}{dindent}{d[0]} = {d[1]};')
 
-    print("variables:")
+    print(f"{indent}variables:")
     for name,ds in datasets.items():
 
         # Variable type
         dtype_str = clean_types(ds.dtype)
 
         # Dimensions for this variable (use dims if available)
-        if hasattr(ds, "dims") and len(ds.dims) > 0:
-            dim_names = [scale.name.split('/')[-1] for dim in ds.dims for scale in dim]
-        else:
-            # fallback: no dims
-            dim_names = []
-
-        dim_str = "(" + ", ".join(dim_names) + ")" if dim_names else ""
-        print(f"    {dtype_str} {name}{dim_str};")
+        if hasattr(ds,'__inspected_dims'):
+            dim_names = [d[0] for d in ds.__inspected_dims]
+            dim_str = "(" + ", ".join(dim_names) + ")" if dim_names else ""
dim_names else "" + print(f"{indent}{dindent}{dtype_str} {name}{dim_str} ;") # Attributes - ommit = ['CLASS','NAME','_Netcdf4Dimid','REFERENCE_LIST','DIMENSION_LIST','_Netcdf4Coordinates'] - for attr_name, attr_val in ds.attrs.items(): - if attr_name not in ommit: - if isinstance(attr_val, bytes): - attr_val = f'"{attr_val.decode("utf-8")}"' - print(f" {name}:{attr_name} = {attr_val};") - print('}') + ommit = ['CLASS','NAME','_Netcdf4Dimid', + 'REFERENCE_LIST','DIMENSION_LIST','DIMENSION_LABELS','_Netcdf4Coordinates'] + + printattr(ds.attrs, ommit) + + if isinstance(obj, File): + hstr='// global ' + elif isinstance(obj, Group): + hstr=f'{indent}// group ' + if obj.attrs: + print(hstr+'attributes:') + printattr(obj.attrs, ['_NCProperties']) + + if groups: + for g,o in groups.items(): + print(f'{indent}group: {g} '+'{') + gindent = indent+' ' + dump_header(o,gindent,real_dimensions) + print(gindent+'}'+f' // group {g}') + + def p5ncdump(file_path, special=False): @@ -72,16 +167,17 @@ def p5ncdump(file_path, special=False): filename = Path(filename).name try: - print('Now going to pyfive') - with pyfive.File(file_path) as f: - # Attach dims if not already attached - print('opened') - for name in f.keys(): - ds = f[name] #bugger this is a b-tree read - if hasattr(ds, "shape") and not hasattr(ds, "dims"): - # internally pyfive may attach dims automatically, but safe to attach here - ds.dims # access triggers dimension proxies - dump_header(f, filename) + with File(file_path) as f: + + # we assume all the netcdf 4 dimnnsions, if they exist, are in the root group + real_dimensions = collect_dimensions_from_root(f) + + # ok, go for it + print(f"File: {filename} "+'{') + indent = '' + dump_header(f, indent, real_dimensions) + print('}') + except NotImplementedError as e: if 'unsupported superblock' in str(e): raise ValueError('Not an HDF5 or NC4 file!') \ No newline at end of file From 481ca47a351f555839905f4e47198ac01e997f81 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 24 Oct 2025 16:15:21 +0100 Subject: [PATCH 08/16] Fixed string handling in groups --- pyfive/inspect.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyfive/inspect.py b/pyfive/inspect.py index f01bd94..cbae9d9 100644 --- a/pyfive/inspect.py +++ b/pyfive/inspect.py @@ -95,6 +95,8 @@ def printattr(attrs, ommit=[]): if k not in ommit: if isinstance(v, bytes): v = f'"{v.decode("utf-8")}"' + elif isinstance(v,str): + v = f'"{v}"' print(f"{indent}{dindent}{dindent}{name}:{k} = {v} ;") dims = set() From 5ab84bfdab89da6e4766dbb97c06f46d3569245d Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Fri, 24 Oct 2025 16:27:08 +0100 Subject: [PATCH 09/16] Better testing --- tests/test_chunk_index_options.py | 29 ++++++++++++++++++++++++++++- tests/test_dump.py | 4 ++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/tests/test_chunk_index_options.py b/tests/test_chunk_index_options.py index 0384f8a..1f6f8e6 100644 --- a/tests/test_chunk_index_options.py +++ b/tests/test_chunk_index_options.py @@ -25,4 +25,31 @@ def test_lazy_index(): # this should force an index build assert_array_equal(dset1[:], np.arange(21*16).reshape((21, 16))) - assert dset1.chunks == (2, 2) \ No newline at end of file + assert dset1.chunks == (2, 2) + + +def test_lazy_visititems(): + + def simpler_check(x,y): + """ Expect this to be visited and instantiated without an index """ + print(x,y.name) + assert y.attrs['attr1'] == 130 + assert y.id._DatasetID__index_built==False + + def simplest_check(x,y): + """ Expect this to be visited and 
+        print(x,y.name)
+        assert y.attrs['attr1'] == 130
+        assert y.id._DatasetID__index_built==True
+
+
+    with pyfive.File(DATASET_CHUNKED_HDF5_FILE) as hfile:
+
+        assert hfile.visititems(simpler_check,noindex=True) is None
+        assert hfile.visititems(simplest_check) is None
+
+
+
+
+
diff --git a/tests/test_dump.py b/tests/test_dump.py
index 5b4dc24..749283a 100644
--- a/tests/test_dump.py
+++ b/tests/test_dump.py
@@ -21,8 +21,8 @@ def test_old_hd5_with_groups(capsys):
 
     #currently failing
     assert 'phony_dim_0' in captured.out
-    assert 'dataset3(phony_dim_0)' in captured.out
-    assert 'string :attr5 = "Test"' in captured.out
+    assert 'dataset3(phony_dim' in captured.out
+    assert 'attr5 = "Test"' in captured.out
 

From a917d9ed79d54e14dae14b55840f39756423db464 Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Sun, 26 Oct 2025 18:49:33 +0000
Subject: [PATCH 10/16] Support for chunk information in p5dump via -s. (Fixed
 btree_range as well I think)

---
 pyfive/btree.py   |  2 ++
 pyfive/h5d.py     | 19 ++++++++++++++-----
 pyfive/inspect.py | 19 ++++++++++++++-----
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/pyfive/btree.py b/pyfive/btree.py
index 7ce119b..a48d278 100644
--- a/pyfive/btree.py
+++ b/pyfive/btree.py
@@ -22,6 +22,7 @@ def __init__(self, fh, offset):
         self.offset = offset
         self.depth = None
         self.all_nodes = {}
+        self.last_offset = offset
 
         self._read_root_node()
         self._read_children()
@@ -53,6 +54,7 @@ def _read_node(self, offset, node_level):
         node = self._read_node_header(offset, node_level)
         node['keys'] = []
         node['addresses'] = []
+        self.last_offset=max(offset,self.last_offset)
        return node
 
     def _read_node_header(self, offset):
diff --git a/pyfive/h5d.py b/pyfive/h5d.py
index a2168e3..29a3d65 100644
--- a/pyfive/h5d.py
+++ b/pyfive/h5d.py
@@ -142,9 +142,11 @@ def get_chunk_info(self, index):
         """
         Retrieve storage information about a chunk specified by its index.
""" - if not self._index: - return None + if self._index_params is None: + raise ValueError('No chunk detail available for HDF layout class {self.layout}') else: + if not self.__index_built: + self._build_index() return self._index[self._nthindex[index]] def get_chunk_info_by_coord(self, coordinate_index): @@ -161,6 +163,11 @@ def get_num_chunks(self): """ Return total number of chunks in dataset """ + if self._index_params is None: + raise ValueError('No chunk detail available for HDF layout class {self.layout}') + else: + if not self.__index_built: + self._build_index() return len(self._index) def read_direct_chunk(self, chunk_position, **kwargs): @@ -354,8 +361,7 @@ def _build_index(self): self._nthindex = [] for node in chunk_btree.all_nodes[0]: - self._btree_start=node['addresses'][0] - self._btree_end=node['addresses'][0] + for node_key, addr in zip(node['keys'], node['addresses']): start = node_key['chunk_offset'][:-1] key = start @@ -363,7 +369,10 @@ def _build_index(self): filter_mask = node_key['filter_mask'] self._nthindex.append(key) self._index[key] = StoreInfo(key, filter_mask, addr, size) - self._btree_end=max(addr,self._btree_end) + + + self._btree_start=chunk_btree.offset + self._btree_end=chunk_btree.last_offset self.__index_built=True diff --git a/pyfive/inspect.py b/pyfive/inspect.py index cbae9d9..43f69ea 100644 --- a/pyfive/inspect.py +++ b/pyfive/inspect.py @@ -86,7 +86,7 @@ def gather_dimensions(obj, alldims, phonys, real_dimensions): return obj, alldims, phonys -def dump_header(obj, indent, real_dimensions): +def dump_header(obj, indent, real_dimensions, special): """ Pretty print a group within an HDF5 file (including the root group) """ def printattr(attrs, ommit=[]): @@ -139,6 +139,18 @@ def printattr(attrs, ommit=[]): 'REFERENCE_LIST','DIMENSION_LIST','DIMENSION_LABELS','_Netcdf4Coordinates'] printattr(ds.attrs, ommit) + + if special: + extras = {} + if ds.id._index_params: + extras['_n_chunks'] = ds.id.get_num_chunks() + extras['_chunk_shape'] = ds.id.chunks + extras['_btree_range'] = ds.id.btree_range + extras['_first_chunk'] = ds.id.get_chunk_info(0).byte_offset + if ds.compression: + extras['_compression'] = ds.compression+f'({ds.compression_opts})' + printattr(extras,[]) + if isinstance(obj, File): hstr='// global ' @@ -159,9 +171,6 @@ def printattr(attrs, ommit=[]): def p5ncdump(file_path, special=False): - if special: - raise NotImplementedError - # handle posix and S3 differently filename = getattr(file_path,'full_name', None) if filename is None: @@ -177,7 +186,7 @@ def p5ncdump(file_path, special=False): # ok, go for it print(f"File: {filename} "+'{') indent = '' - dump_header(f, indent, real_dimensions) + dump_header(f, indent, real_dimensions, special) print('}') except NotImplementedError as e: From 173d9d170ae9e0a94e272681f84b3e0b53c3fafc Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Mon, 27 Oct 2025 09:09:03 +0000 Subject: [PATCH 11/16] p5dump -s includes storage type (from layout_class) --- pyfive/inspect.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyfive/inspect.py b/pyfive/inspect.py index 43f69ea..51bc586 100644 --- a/pyfive/inspect.py +++ b/pyfive/inspect.py @@ -141,14 +141,14 @@ def printattr(attrs, ommit=[]): printattr(ds.attrs, ommit) if special: - extras = {} - if ds.id._index_params: + extras = {'_Storage':{0:'Compact',1:'Contiguous',2:'Chunked'}[ds.id.layout_class]} + if ds.id.layout_class==2: extras['_n_chunks'] = ds.id.get_num_chunks() extras['_chunk_shape'] = ds.id.chunks extras['_btree_range'] 
                 extras['_first_chunk'] = ds.id.get_chunk_info(0).byte_offset
-            if ds.compression:
-                extras['_compression'] = ds.compression+f'({ds.compression_opts})'
+                if ds.compression:
+                    extras['_compression'] = ds.compression+f'({ds.compression_opts})'
             printattr(extras,[])

From dae9eb1314470fae0bbf06b444795e49e1546262 Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Mon, 27 Oct 2025 09:42:17 +0000
Subject: [PATCH 12/16] Edge case detection and bug fix

---
 pyfive/btree.py   | 1 +
 pyfive/h5d.py     | 2 ++
 pyfive/inspect.py | 7 ++++---
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pyfive/btree.py b/pyfive/btree.py
index a48d278..15e8732 100644
--- a/pyfive/btree.py
+++ b/pyfive/btree.py
@@ -151,6 +151,7 @@ def _read_node(self, offset, node_level):
             addresses.append(chunk_address)
         node['keys'] = keys
         node['addresses'] = addresses
+        self.last_offset=max(offset,self.last_offset)
         return node
 
 
diff --git a/pyfive/h5d.py b/pyfive/h5d.py
index 29a3d65..c815cb3 100644
--- a/pyfive/h5d.py
+++ b/pyfive/h5d.py
@@ -343,6 +343,8 @@ def _build_index(self):
         # look out for an empty dataset, which will have no btree
         if np.prod(self.shape) == 0 or self._index_params.chunk_address == UNDEFINED_ADDRESS:
             self._index = {}
+            #FIXME: There are other edge cases for self._index = {} to handle
+            self._btree_end, self._btree_start = None, None
             return
 
         logging.info(f'Building chunk index in pyfive {version("pyfive")}')
diff --git a/pyfive/inspect.py b/pyfive/inspect.py
index 51bc586..ce3ce9e 100644
--- a/pyfive/inspect.py
+++ b/pyfive/inspect.py
@@ -144,9 +144,10 @@ def printattr(attrs, ommit=[]):
             extras = {'_Storage':{0:'Compact',1:'Contiguous',2:'Chunked'}[ds.id.layout_class]}
             if ds.id.layout_class==2:
                 extras['_n_chunks'] = ds.id.get_num_chunks()
-                extras['_chunk_shape'] = ds.id.chunks
-                extras['_btree_range'] = ds.id.btree_range
-                extras['_first_chunk'] = ds.id.get_chunk_info(0).byte_offset
+                if extras['_n_chunks'] != 0:
+                    extras['_chunk_shape'] = ds.id.chunks
+                    extras['_btree_range'] = ds.id.btree_range
+                    extras['_first_chunk'] = ds.id.get_chunk_info(0).byte_offset
                 if ds.compression:
                     extras['_compression'] = ds.compression+f'({ds.compression_opts})'
             printattr(extras,[])

From aa4ca30e5a5653680453753a18a2ee43fbb9f270 Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Mon, 27 Oct 2025 09:59:51 +0000
Subject: [PATCH 13/16] Handling broken pipes more gracefully

---
 pyfive/inspect.py | 26 ++++++++++++++++----------
 pyfive/p5dump.py  | 32 +++++++++++++++++++++-----------
 2 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/pyfive/inspect.py b/pyfive/inspect.py
index ce3ce9e..20ef409 100644
--- a/pyfive/inspect.py
+++ b/pyfive/inspect.py
@@ -2,6 +2,12 @@
 
 from pyfive import Dataset, Group, File
 
+def safe_print(*args, **kwargs):
+    try:
+        print(*args, **kwargs)
+    except BrokenPipeError:
+        raise SystemExit(1)
+
 def clean_types(dtype):
     """Convert a numpy dtype to classic ncdump type string."""
     # Strip endianness (> or <) and map to ncdump types
@@ -97,7 +103,7 @@ def printattr(attrs, ommit=[]):
                     v = f'"{v.decode("utf-8")}"'
                 elif isinstance(v,str):
                     v = f'"{v}"'
-                print(f"{indent}{dindent}{dindent}{name}:{k} = {v} ;")
+                safe_print(f"{indent}{dindent}{dindent}{name}:{k} = {v} ;")
 
     dims = set()
     datasets = {}
@@ -117,10 +123,10 @@ def printattr(attrs, ommit=[]):
     for ds in datasets.values():
         ds, dims, phonys = gather_dimensions(ds, dims, phonys, real_dimensions)
     if dims:
-        print(f"{indent}dimensions:")
+        safe_print(f"{indent}dimensions:")
         dindent = '    '
         for d in dims:
-            print(f'{indent}{dindent}{d[0]} = {d[1]};')
+            safe_print(f'{indent}{dindent}{d[0]} = {d[1]};')
 
     print(f"{indent}variables:")
     for name,ds in datasets.items():
@@ -131,7 +137,7 @@ def printattr(attrs, ommit=[]):
         if hasattr(ds,'__inspected_dims'):
             dim_names = [d[0] for d in ds.__inspected_dims]
             dim_str = "(" + ", ".join(dim_names) + ")" if dim_names else ""
-            print(f"{indent}{dindent}{dtype_str} {name}{dim_str} ;")
+            safe_print(f"{indent}{dindent}{dtype_str} {name}{dim_str} ;")
 
         # Attributes
         ommit = ['CLASS','NAME','_Netcdf4Dimid',
@@ -160,15 +166,15 @@ def printattr(attrs, ommit=[]):
     elif isinstance(obj, Group):
         hstr=f'{indent}// group '
     if obj.attrs:
-        print(hstr+'attributes:')
-        printattr(obj.attrs, ['_NCProperties'])
+        safe_print(hstr+'attributes:')
+        safe_printattr(obj.attrs, ['_NCProperties'])
 
     if groups:
         for g,o in groups.items():
-            print(f'{indent}group: {g} '+'{')
+            safe_print(f'{indent}group: {g} '+'{')
             gindent = indent+'    '
             dump_header(o,gindent,real_dimensions)
-            print(gindent+'}'+f' // group {g}')
+            safe_print(gindent+'}'+f' // group {g}')
@@ -186,10 +192,10 @@ def p5ncdump(file_path, special=False):
         real_dimensions = collect_dimensions_from_root(f)
 
         # ok, go for it
-        print(f"File: {filename} "+'{')
+        safe_print(f"File: {filename} "+'{')
         indent = ''
         dump_header(f, indent, real_dimensions, special)
-        print('}')
+        safe_print('}')
 
     except NotImplementedError as e:
         if 'unsupported superblock' in str(e):
             raise ValueError('Not an HDF5 or NC4 file!')
diff --git a/pyfive/p5dump.py b/pyfive/p5dump.py
index f6c53aa..52f9364 100644
--- a/pyfive/p5dump.py
+++ b/pyfive/p5dump.py
@@ -1,39 +1,49 @@
 from pyfive import p5ncdump
 import sys
+import signal
 
 def main(argv=None):
     """
     Provides some of the functionality of tools like ncdump and h5dump.
     By default this will attempt to do something similar to ncdump.
     - h will return this information
-    - s (not yet implemented) will provide additional information
+    - s will provide additional information
     """
     if argv is None:
         argv = sys.argv[1:]  # ignore script name
 
     match argv:
-        # script → error (no filename)
         case []:
             raise ValueError("No filename provided")
-
-        # script -h → help
         case ["-h"]:
             print(main.__doc__)
             return 0
-
-        # script filename
         case [filename]:
             p5ncdump(filename, special=False)
             return 0
-
-        # script -s filename
         case ["-s", filename]:
             p5ncdump(filename, special=True)
             return 0
-
-        # Anything else → error
         case _:
             raise ValueError(f"Invalid arguments: {argv}")
 
 if __name__ == '__main__':
-    sys.exit(main())
+    # Set SIGPIPE to default behaviour on Unix (ignored safely on Windows)
+    try:
+        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+    except (AttributeError, ValueError):
+        pass
+
+    try:
+        sys.exit(main())
+    except BrokenPipeError:
+        # Happens if pipe is closed early (e.g. `| head` or user quits `more`)
+        try:
+            sys.stderr.flush()
+        except Exception:
+            pass
+        sys.exit(0)
+    except Exception as e:
+        # Any other error: no traceback, just a clean message
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
\ No newline at end of file
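As a rough illustration of the command line behaviour this patch is aiming for
(the file name is arbitrary and the output is elided):

    $ p5dump mydata.nc           # ncdump-like header dump
    $ p5dump -s mydata.nc        # adds _Storage, _n_chunks, _btree_range etc.
    $ p5dump mydata.nc | head    # a closed pipe now exits quietly, no traceback
    $ p5dump -h                  # prints the docstring shown above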
From a84fea66d0f502f9a290adce7c1d7e898219482b Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Mon, 27 Oct 2025 10:32:14 +0000
Subject: [PATCH 14/16] Why don't I run my tests before committing?

---
 pyfive/inspect.py  | 4 ++--
 tests/test_dump.py | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pyfive/inspect.py b/pyfive/inspect.py
index 20ef409..04c59f5 100644
--- a/pyfive/inspect.py
+++ b/pyfive/inspect.py
@@ -165,13 +165,13 @@ def printattr(attrs, ommit=[]):
         hstr=f'{indent}// group '
     if obj.attrs:
         safe_print(hstr+'attributes:')
-        safe_printattr(obj.attrs, ['_NCProperties'])
+        printattr(obj.attrs, ['_NCProperties'])
 
     if groups:
         for g,o in groups.items():
             safe_print(f'{indent}group: {g} '+'{')
             gindent = indent+'    '
-            dump_header(o,gindent,real_dimensions)
+            dump_header(o,gindent,real_dimensions, special=special)
             safe_print(gindent+'}'+f' // group {g}')
 
 
diff --git a/tests/test_dump.py b/tests/test_dump.py
index 749283a..771ad46 100644
--- a/tests/test_dump.py
+++ b/tests/test_dump.py
@@ -27,9 +27,8 @@ def test_old_hd5_with_groups(capsys):
 
 # Test: script -s filename (special mode)
 def test_main_special_real():
-    filename = "tests/data/sample.nc"
-    with pytest.raises(NotImplementedError):
-        assert main(["-s", filename]) == 0
+    filename = EARLIEST_HDF5_FILE
+    assert main(["-s", filename]) == 0
 
 # Test: -h should print help
 def test_main_help_real(capsys):

From b02083c41b135e551bc772115c54a6db7650314a Mon Sep 17 00:00:00 2001
From: Bryan Lawrence
Date: Mon, 27 Oct 2025 13:33:26 +0000
Subject: [PATCH 15/16] Cleaning up the DatasetID interface error handling for
 chunk queries on unchunked data and some tests to keep V happy.

---
 pyfive/h5d.py                     | 63 +++++++++++++++++--------------
 tests/test_chunk_index_options.py | 57 +++++++++++++++++++++++++++-
 2 files changed, 90 insertions(+), 30 deletions(-)

diff --git a/pyfive/h5d.py b/pyfive/h5d.py
index c815cb3..5c3845f 100644
--- a/pyfive/h5d.py
+++ b/pyfive/h5d.py
@@ -138,44 +138,52 @@ def __eq__(self, other):
         """
         return self._unique == other._unique
 
+    def __chunk_init_check(self):
+        """
+        Used by all the chunk methods to check that this dataset is
+        chunked and, if so, whether the index is present, building it
+        if not. Otherwise handle errors etc.
+        """
+        if self.layout_class != 2:
+            raise TypeError('Dataset is not chunked')
+        return not self.index == {}
+
+
     def get_chunk_info(self, index):
         """
         Retrieve storage information about a chunk specified by its index.
         """
-        if self._index_params is None:
-            raise ValueError(f'No chunk detail available for HDF layout class {self.layout_class}')
-        else:
-            if not self.__index_built:
-                self._build_index()
+        if self.__chunk_init_check():
             return self._index[self._nthindex[index]]
+        else:
+            return None
 
     def get_chunk_info_by_coord(self, coordinate_index):
         """
         Retrieve information about a chunk specified by the array address
         of the chunk’s first element in each dimension.
         """
-        if not self._index:
-            return None
-        else:
+        if self.__chunk_init_check():
             return self._index[coordinate_index]
+        else:
+            return None
 
     def get_num_chunks(self):
         """
         Return total number of chunks in dataset
         """
-        if self._index_params is None:
-            raise ValueError(f'No chunk detail available for HDF layout class {self.layout_class}')
-        else:
-            if not self.__index_built:
-                self._build_index()
-        return len(self._index)
+        if self.__chunk_init_check():
+            return len(self._index)
+        else:
+            return 0
 
     def read_direct_chunk(self, chunk_position, **kwargs):
         """
         Returns a tuple containing the filter_mask and the raw data storing
         this chunk as bytes. Additional arguments supported by ``h5py``
         are not supported here.
""" - if not self.index: + if not self.__chunk_init_check(): return None if chunk_position not in self._index: raise OSError("Chunk coordinates must lie on chunk boundaries") @@ -229,9 +237,9 @@ def iter_chunks(self, args): intersection of the given chunk with the selection area. This can be used to read data in that chunk. """ - if self.chunks is None: - raise TypeError('Dataset is not chunked') - + if not self.__chunk_init_check(): + return None + def convert_selection(tuple_of_slices): # while a slice of the form slice(a,b,None) is equivalent # in function to a slice of form (a,b,1) it is not the same. @@ -265,12 +273,13 @@ def convert_slice(aslice): @property def index(self): """ Direct access to the chunk index, if there is one. This is a ``pyfive`` API extension. """ - if self._index_params is None: - raise ValueError('No chunk index available for HDF layout class {self.layout}') - else: - if not self.__index_built: - self._build_index() - return self._index + # can't use init_chunk_check because that would be an infinite regression + if self.layout_class != 2: + raise TypeError("Data is not chunked") + if not self._index: + self._build_index() + return self._index + ##### This property is made available to help understand object store performance @property @@ -281,12 +290,8 @@ def btree_range(self): may be of use in understanding the read performance of chunked data in object stores. ``btree_range`` is a ``pyfive`` API extgension. """ - if self._index_params is None: - raise ValueError('No b-tree available for HDF layout class {self.layout}') - else: - if not self.__index_built: - self._build_index() - return (self._btree_start, self._btree_end) + self.__chunk_init_check() + return (self._btree_start, self._btree_end) #### The following method can be used to set pseudo chunking size after the #### file has been closed and before data transactions. This is pyfive specific diff --git a/tests/test_chunk_index_options.py b/tests/test_chunk_index_options.py index 1f6f8e6..5ce2872 100644 --- a/tests/test_chunk_index_options.py +++ b/tests/test_chunk_index_options.py @@ -1,14 +1,21 @@ -""" Test pyfive's abililty to read multidimensional datasets. 
""" +""" +Test pyfive's abililty to read multidimensional datasets +and variants of the chunk index accesses +""" import os import numpy as np from numpy.testing import assert_array_equal import pyfive +from pyfive.h5d import StoreInfo +import pytest DIRNAME = os.path.dirname(__file__) DATASET_CHUNKED_HDF5_FILE = os.path.join(DIRNAME, "data", 'chunked.hdf5') +NOT_CHUNKED_FILE = os.path.join(DIRNAME, "data", 'issue23_A_contiguous.nc') + def test_lazy_index(): @@ -49,6 +56,54 @@ def simplest_check(x,y): assert hfile.visititems(simplest_check) is None +def test_get_chunk_info_chunked(): + + # start lazy, then go real + + with pyfive.File(DATASET_CHUNKED_HDF5_FILE) as hfile: + + ds = hfile.get_lazy_view('dataset1') + assert ds.id._DatasetID__index_built==False + + si = StoreInfo((0,0), 0, 4016, 16) + info = ds.id.get_chunk_info(0) + assert info == si + + assert ds.id.get_num_chunks() == 88 + + assert ds.id.btree_range == (1072, 8680) + + +def test_get_chunk_methods_contiguous(): + + with pyfive.File(NOT_CHUNKED_FILE) as hfile: + + ds = hfile.get_lazy_view('q') + assert ds.id._DatasetID__index_built==False + + with pytest.raises(TypeError): + ds.id.get_chunk_info(0) + + with pytest.raises(TypeError): + ds.id.get_num_chunks() + + with pytest.raises(TypeError): + ds.id.read_direct_chunk(0) + + with pytest.raises(TypeError): + ds.id.btree_range + + + + + + + + + + + + From 3040c3d8d33ffcee3176ce06cb258d6add592d3d Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Tue, 28 Oct 2025 08:50:12 +0000 Subject: [PATCH 16/16] Better checking of chunk info testing answers, courtesy of @zequihg50 --- tests/test_chunk_index_options.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/test_chunk_index_options.py b/tests/test_chunk_index_options.py index 5ce2872..3b9380a 100644 --- a/tests/test_chunk_index_options.py +++ b/tests/test_chunk_index_options.py @@ -10,6 +10,7 @@ import pyfive from pyfive.h5d import StoreInfo import pytest +import h5py DIRNAME = os.path.dirname(__file__) DATASET_CHUNKED_HDF5_FILE = os.path.join(DIRNAME, "data", 'chunked.hdf5') @@ -58,9 +59,13 @@ def simplest_check(x,y): def test_get_chunk_info_chunked(): - # start lazy, then go real + # Start lazy, then go real + # we think we know what the right answers are, so we hard + # code them as well as check that's what h5py would return - with pyfive.File(DATASET_CHUNKED_HDF5_FILE) as hfile: + with pyfive.File(DATASET_CHUNKED_HDF5_FILE) as hfile, \ + h5py.File(DATASET_CHUNKED_HDF5_FILE) as h5f, \ + open(DATASET_CHUNKED_HDF5_FILE, "rb") as f: ds = hfile.get_lazy_view('dataset1') assert ds.id._DatasetID__index_built==False @@ -68,10 +73,17 @@ def test_get_chunk_info_chunked(): si = StoreInfo((0,0), 0, 4016, 16) info = ds.id.get_chunk_info(0) assert info == si + assert h5f["dataset1"].id.get_chunk_info(0) == si assert ds.id.get_num_chunks() == 88 + assert h5f["dataset1"].id.get_num_chunks() == 88 assert ds.id.btree_range == (1072, 8680) + f.seek(1072) + assert f.read(4) == b"TREE" # only v1 btrees + f.seek(8680) + assert f.read(4) == b"TREE" # only v1 btrees + def test_get_chunk_methods_contiguous():