diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..8391adb --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,16 @@ +{ + "python.defaultInterpreterPath": "/Users/bnl28/mambaforge/envs/pyfive-25aug/bin/python", + "python.terminal.activateEnvironment":true, + "terminal.integrated.profiles.osx": { + "zsh": { + "path": "/bin/zsh", + "args": ["-l"] + } + }, + "terminal.integrated.defaultProfile.osx": "zsh", + "esbonio.server.env": { + "PATH": "/Users/bnl28/mambaforge/envs/pyfive-25aug/bin:${env:PATH}", + "PYTHONPATH": "/Users/bnl28/mambaforge/envs/pyfive-25aug/lib/python3.12/site-packages" + }, + "esbonio.server.pythonPath": "/Users/bnl28/mambaforge/envs/pyfive-25aug/bin/python" +} diff --git a/pyfive/__init__.py b/pyfive/__init__.py index 7a83096..f2c54dc 100644 --- a/pyfive/__init__.py +++ b/pyfive/__init__.py @@ -8,6 +8,7 @@ from pyfive.h5t import check_enum_dtype, check_string_dtype, check_dtype, opaque_dtype, check_opaque_dtype from pyfive.h5py import Datatype, Empty from importlib.metadata import version +from pyfive.inspect import p5ncdump __version__ = '0.5.0.dev' diff --git a/pyfive/btree.py b/pyfive/btree.py index f8aecd1..88d53f5 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -22,6 +22,7 @@ def __init__(self, fh, offset): self.offset = offset self.depth = None self.all_nodes = {} + self.last_offset = offset self._read_root_node() self._read_children() @@ -53,6 +54,7 @@ def _read_node(self, offset, node_level): node = self._read_node_header(offset, node_level) node['keys'] = [] node['addresses'] = [] + self.last_offset=max(offset,self.last_offset) return node def _read_node_header(self, offset): @@ -149,57 +151,9 @@ def _read_node(self, offset, node_level): addresses.append(chunk_address) node['keys'] = keys node['addresses'] = addresses + self.last_offset=max(offset,self.last_offset) return node - def construct_data_from_chunks( - self, chunk_shape, data_shape, dtype, filter_pipeline): - """ Build a complete data array from chunks. """ - if isinstance(dtype, tuple): - true_dtype = tuple(dtype) - dtype_class = dtype[0] - if dtype_class == 'REFERENCE': - size = dtype[1] - if size != 8: - raise NotImplementedError('Unsupported Reference type') - dtype = ' x[y] """ + """ x.__getitem__(y) <==> x[y]. + """ + return self.__getitem_lazy_control(y, noindex=False) + + + def get_lazy_view(self, y): + """ + This instantiates the object y, and if it is a + chunked dataset, does so without reading the b-tree + index. This is useful for inspecting a variable + that you are not expecting to access. If you know you + want to access the data, and in particular, if you are + going to hand the data to Dask or something else, you + almost certainly want to read the index now, so + just do x[y] rather than x.get_lazy_view(y). + + This is a ``pyfive`` extension to the standard h5py API. + """ + + return self.__getitem_lazy_control(y, noindex=True) + + + def __getitem_lazy_control(self, y, noindex): + """ + This is the routine which actually does the get item + but does it in such a way that we control how much laziness + is possible where we have chunked variables with b-trees. + + We want to return y, but if y is a chunked dataset we + normally return it with a cached b-tree (noindex=false). 
+ If noindex is True, we do not read the b-tree, and that + will be done when data is first read - which is fine + in a single-threaded environment, but in a parallel + environment you only want to read the index once + (so use noindex=False, which you get via the + normal getitem interface - x[y]). + """ + if isinstance(y, Reference): return self._dereference(y) @@ -92,7 +129,7 @@ def __getitem__(self, y): if dataobjs.is_dataset: if additional_obj != '.': raise KeyError('%s is a dataset, not a group' % (obj_name)) - return Dataset(obj_name, DatasetID(dataobjs), self) + return Dataset(obj_name, DatasetID(dataobjs, noindex=noindex), self) try: # if true, this may well raise a NotImplementedError, if so, we need @@ -125,7 +162,7 @@ def visit(self, func): """ return self.visititems(lambda name, obj: func(name)) - def visititems(self, func): + def visititems(self, func, noindex=False): """ Recursively visit all objects in this group and subgroups. @@ -136,11 +173,25 @@ def visititems(self, func): Returning None continues iteration, return anything else stops and return that value from the visit method. + Use of the optional noindex=True will ensure that + all operations are not only lazy wrt data, but lazy + wrt to any chunked data indices. + """ root_name_length = len(self.name) if not self.name.endswith('/'): root_name_length += 1 - queue = deque(self.values()) + + # Use either normal access or lazy access: + if noindex: + # Avoid loading dataset indices + get_obj = self.get_lazy_view + else: + get_obj = self.__getitem__ + + # Initialize queue using the correct getter + queue = deque(get_obj(k) for k in self._links.keys()) + while queue: obj = queue.popleft() name = obj.name[root_name_length:] diff --git a/pyfive/inspect.py b/pyfive/inspect.py new file mode 100644 index 0000000..04c59f5 --- /dev/null +++ b/pyfive/inspect.py @@ -0,0 +1,201 @@ +from pathlib import Path + +from pyfive import Dataset, Group, File + +def safe_print(*args, **kwargs): + try: + print(*args, **kwargs) + except BrokenPipeError: + raise SystemExit(1) + +def clean_types(dtype): + """Convert a numpy dtype to classic ncdump type string.""" + # Strip endianness (> or <) and map to ncdump types + kind = dtype.kind + itemsize = dtype.itemsize + if kind == "f": # floating point + return f"float{itemsize*8}" + elif kind == "i": # signed integer + return f"int{itemsize*8}" + elif kind == "u": # unsigned integer + return f"uint{itemsize*8}" + elif kind == "S" or kind == "a": # fixed-length bytes + return "char" + else: + return str(dtype) # fallback + + +def collect_dimensions_from_root(root): + """ + Collect true netCDF-style dimensions from the root group only. 
+ + Returns + ------- + dims : dict + Maps dimension name (str) -> size (int) + """ + dims = {} + + for name in root: + + obj = root.get_lazy_view(name) + # Must be a dataset to be a dimension scale + if not isinstance(obj, Dataset): + continue + + # Must have CLASS="DIMENSION_SCALE" to qualify + if str(obj.attrs.get("CLASS")) == "b'DIMENSION_SCALE'": + # NetCDF stores the real dimension name under NAME + dim_name = obj.attrs.get("NAME").decode() + if dim_name.startswith('This is a netCDF dimension but not a'): + dim_name = name + # Use the first axis of its shape as the dimension size + size = obj.shape[0] if hasattr(obj, "shape") and obj.shape else None + + # Only add if size makes sense + if size is not None: + dims[dim_name] = size + + return dims + +def gather_dimensions(obj, alldims, phonys, real_dimensions): + """ + Gather dimensions from dimension scales if present and, if not, + infer phony dimensions (to mimic netCDF reporting of an HDF5 file), + so that the dump is useful even for a plain HDF5 file. + Monkey-patch these dims onto the object alongside the existing dimension manager. + """ + + if not hasattr(obj, '__inspected_dims'): + obj.__inspected_dims = [] + + oname = obj.name.split('/')[-1] + + for axis, size in enumerate(obj.shape): + + if obj.dims[axis]: # real scale exists + edim = (obj.dims[axis][0].name.split('/')[-1], size) + + elif size in real_dimensions.values(): + dim_name = next(name for name, sz in real_dimensions.items() if sz == size) + edim = (dim_name, size) + else: + # make or reuse a phony dimension name + if size not in phonys: + phonys[size] = f"phony_dim_{len(phonys)}" + pname = phonys[size] + edim = (pname, size) + + obj.__inspected_dims.append(edim) + alldims.add(edim) + + return obj, alldims, phonys + + +def dump_header(obj, indent, real_dimensions, special): + """ Pretty print a group within an HDF5 file (including the root group) """ + + def printattr(attrs, omit=()): + """ Pretty print a set of attributes """ + for k, v in attrs.items(): + if k not in omit: + if isinstance(v, bytes): + v = f'"{v.decode("utf-8")}"' + elif isinstance(v, str): + v = f'"{v}"' + safe_print(f"{indent}{dindent}{dindent}{name}:{k} = {v} ;") + + dims = set() + datasets = {} + groups = {} + phonys = {} + + for name in obj: + item = obj.get_lazy_view(name) + if isinstance(item, Dataset): + if str(item.attrs.get('NAME', 'None')).startswith('This is a netCDF dimension but not a'): + continue + datasets[name] = item + elif isinstance(item, Group): + groups[name] = item + + + for ds in datasets.values(): + ds, dims, phonys = gather_dimensions(ds, dims, phonys, real_dimensions) + dindent = ' ' + if dims: + safe_print(f"{indent}dimensions:") + for d in dims: + safe_print(f'{indent}{dindent}{d[0]} = {d[1]};') + + safe_print(f"{indent}variables:") + for name, ds in datasets.items(): + + # Variable type + dtype_str = clean_types(ds.dtype) + + # Dimensions for this variable (use dims if available) + if hasattr(ds, '__inspected_dims'): + dim_names = [d[0] for d in ds.__inspected_dims] + dim_str = "(" + ", ".join(dim_names) + ")" if dim_names else "" + safe_print(f"{indent}{dindent}{dtype_str} {name}{dim_str} ;") + + # Attributes (omitting netCDF/HDF5 bookkeeping ones) + omit = ['CLASS', 'NAME', '_Netcdf4Dimid', + 'REFERENCE_LIST', 'DIMENSION_LIST', 'DIMENSION_LABELS', '_Netcdf4Coordinates'] + + printattr(ds.attrs, omit) + + if special: + extras = {'_Storage': {0: 'Compact', 1: 'Contiguous', 2: 'Chunked'}[ds.id.layout_class]} + if ds.id.layout_class == 2: + extras['_n_chunks'] = ds.id.get_num_chunks() + if extras['_n_chunks'] != 0: +
extras['_chunk_shape'] = ds.id.chunks + extras['_btree_range'] = ds.id.btree_range + extras['_first_chunk'] = ds.id.get_chunk_info(0).byte_offset + if ds.compression: + extras['_compression'] = ds.compression + f'({ds.compression_opts})' + printattr(extras, []) + + + if isinstance(obj, File): + hstr = '// global ' + elif isinstance(obj, Group): + hstr = f'{indent}// group ' + if obj.attrs: + safe_print(hstr + 'attributes:') + printattr(obj.attrs, ['_NCProperties']) + + if groups: + for g, o in groups.items(): + safe_print(f'{indent}group: {g} ' + '{') + gindent = indent + ' ' + dump_header(o, gindent, real_dimensions, special=special) + safe_print(gindent + '}' + f' // group {g}') + + + +def p5ncdump(file_path, special=False): + + # handle POSIX paths and S3 file objects differently + filename = getattr(file_path, 'full_name', None) + if filename is None: + filename = file_path + filename = Path(filename).name + + try: + with File(file_path) as f: + + # we assume all the netCDF4 dimensions, if they exist, are in the root group + real_dimensions = collect_dimensions_from_root(f) + + # ok, go for it + safe_print(f"File: {filename} " + '{') + indent = '' + dump_header(f, indent, real_dimensions, special) + safe_print('}') + + except NotImplementedError as e: + if 'unsupported superblock' in str(e): + raise ValueError('Not an HDF5 or NC4 file!') \ No newline at end of file diff --git a/pyfive/p5dump.py b/pyfive/p5dump.py new file mode 100644 index 0000000..52f9364 --- /dev/null +++ b/pyfive/p5dump.py @@ -0,0 +1,49 @@ +from pyfive import p5ncdump +import sys +import signal + +def main(argv=None): + """ + Provides some of the functionality of tools like ncdump and h5dump. + By default this will attempt to do something similar to ncdump. + -h will print this information + -s will print additional storage information + """ + if argv is None: + argv = sys.argv[1:] # ignore script name + + match argv: + case []: + raise ValueError("No filename provided") + case ["-h"]: + print(main.__doc__) + return 0 + case [filename]: + p5ncdump(filename, special=False) + return 0 + case ["-s", filename]: + p5ncdump(filename, special=True) + return 0 + case _: + raise ValueError(f"Invalid arguments: {argv}") + +if __name__ == '__main__': + # Set SIGPIPE to default behaviour on Unix (ignored safely on Windows) + try: + signal.signal(signal.SIGPIPE, signal.SIG_DFL) + except (AttributeError, ValueError): + pass + + try: + sys.exit(main()) + except BrokenPipeError: + # Happens if pipe is closed early (e.g.
`| head` or user quits `more`) + try: + sys.stderr.flush() + except Exception: + pass + sys.exit(0) + except Exception as e: + # Any other error: no traceback, just a clean message + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ea687c2..2523120 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,9 @@ license = {text = "BSD License, Version 3-Clause"} name = "pyfive" requires-python = ">=3.10" +[project.scripts] +p5dump = "pyfive.p5dump:main" + [project.optional-dependencies] test = [ "pytest>6.0.0", diff --git a/tests/test_chunk_index_options.py b/tests/test_chunk_index_options.py new file mode 100644 index 0000000..3b9380a --- /dev/null +++ b/tests/test_chunk_index_options.py @@ -0,0 +1,122 @@ +""" +Test pyfive's ability to read multidimensional datasets +and variants of chunk index access +""" +import os + +import numpy as np +from numpy.testing import assert_array_equal + +import pyfive +from pyfive.h5d import StoreInfo +import pytest +import h5py + +DIRNAME = os.path.dirname(__file__) +DATASET_CHUNKED_HDF5_FILE = os.path.join(DIRNAME, "data", 'chunked.hdf5') + +NOT_CHUNKED_FILE = os.path.join(DIRNAME, "data", 'issue23_A_contiguous.nc') + + +def test_lazy_index(): + + with pyfive.File(DATASET_CHUNKED_HDF5_FILE) as hfile: + + # instantiate variable + dset1 = hfile.get_lazy_view('dataset1') + + # should be able to see attributes but not have an index yet + assert dset1.attrs['attr1'] == 130 + + # test we have no index yet + assert dset1.id._DatasetID__index_built==False + + # this should force an index build + assert_array_equal(dset1[:], np.arange(21*16).reshape((21, 16))) + assert dset1.chunks == (2, 2) + + +def test_lazy_visititems(): + + def simpler_check(x,y): + """ Expect this to be visited and instantiated without an index """ + print(x,y.name) + assert y.attrs['attr1'] == 130 + assert y.id._DatasetID__index_built==False + + def simplest_check(x,y): + """ Expect this to be visited and instantiated with an index """ + print(x,y.name) + assert y.attrs['attr1'] == 130 + assert y.id._DatasetID__index_built==True + + + with pyfive.File(DATASET_CHUNKED_HDF5_FILE) as hfile: + + assert hfile.visititems(simpler_check, noindex=True) is None + assert hfile.visititems(simplest_check) is None + + +def test_get_chunk_info_chunked(): + + # Start lazy, then go real. + # We think we know what the right answers are, so we hard-code + # them as well as checking that's what h5py would return. + + with pyfive.File(DATASET_CHUNKED_HDF5_FILE) as hfile, \ + h5py.File(DATASET_CHUNKED_HDF5_FILE) as h5f, \ + open(DATASET_CHUNKED_HDF5_FILE, "rb") as f: + + ds = hfile.get_lazy_view('dataset1') + assert ds.id._DatasetID__index_built==False + + si = StoreInfo((0,0), 0, 4016, 16) + info = ds.id.get_chunk_info(0) + assert info == si + assert h5f["dataset1"].id.get_chunk_info(0) == si + + assert ds.id.get_num_chunks() == 88 + assert h5f["dataset1"].id.get_num_chunks() == 88 + + assert ds.id.btree_range == (1072, 8680) + f.seek(1072) + assert f.read(4) == b"TREE" # only v1 btrees + f.seek(8680) + assert f.read(4) == b"TREE" # only v1 btrees + + + +def test_get_chunk_methods_contiguous(): + + with pyfive.File(NOT_CHUNKED_FILE) as hfile: + + ds = hfile.get_lazy_view('q') + assert ds.id._DatasetID__index_built==False + + with pytest.raises(TypeError): + ds.id.get_chunk_info(0) + + with pytest.raises(TypeError): + ds.id.get_num_chunks() + + with pytest.raises(TypeError): + ds.id.read_direct_chunk(0) + + with
pytest.raises(TypeError): + ds.id.btree_range + + + + + + + + + + + + + + + + diff --git a/tests/test_dump.py b/tests/test_dump.py new file mode 100644 index 0000000..771ad46 --- /dev/null +++ b/tests/test_dump.py @@ -0,0 +1,47 @@ +import pytest +from pyfive.p5dump import main +import os + + +DIRNAME = os.path.dirname(__file__) +EARLIEST_HDF5_FILE = os.path.join(DIRNAME, 'data', 'earliest.hdf5') + +# +# A standard nicely behaved netcdf4 file is tested in test_mock_s3fs. +# Kill two birds with one stone there. +# + +def test_old_hdf5_with_groups(capsys): + filename = EARLIEST_HDF5_FILE + + # No exception means success + assert main([filename]) == 0 + + captured = capsys.readouterr() + + # currently failing + assert 'phony_dim_0' in captured.out + assert 'dataset3(phony_dim' in captured.out + assert 'attr5 = "Test"' in captured.out + + +# Test: script -s filename (special mode) +def test_main_special_real(): + filename = EARLIEST_HDF5_FILE + assert main(["-s", filename]) == 0 + +# Test: -h should print help +def test_main_help_real(capsys): + main(["-h"]) + captured = capsys.readouterr() + assert "Provides some of the functionality" in captured.out + +# Test: no filename → error +def test_main_no_args_real(): + with pytest.raises(ValueError): + main([]) + +# Test: invalid flag → error +def test_main_invalid_args_real(): + with pytest.raises(ValueError): + main(["-x", "file.nc"]) \ No newline at end of file diff --git a/tests/test_mock_s3fs.py b/tests/test_mock_s3fs.py index 7beffb2..0896a4e 100644 --- a/tests/test_mock_s3fs.py +++ b/tests/test_mock_s3fs.py @@ -24,7 +24,7 @@ def test_s3fs_s3(s3fs_s3): assert mock_s3_filesystem.client_kwargs == {'endpoint_url': 'http://127.0.0.1:5555/'} -def test_s3file_with_s3fs(s3fs_s3): +def test_s3file_with_s3fs(s3fs_s3, capsys): """ This test spoofs a complete s3fs FileSystem via s3fs_s3, creates a mock bucket inside it, then puts a REAL netCDF4 file in it, @@ -57,3 +57,12 @@ def test_s3file_with_s3fs(s3fs_s3): pyfive_ds = pyfive.File(f) print(f"Dataset loaded from mock S3 with s3fs and Pyfive: ds") assert "q" in pyfive_ds + + # test the command line dump (p5ncdump) against the mock S3 file + with s3.open(os.path.join("MY_BUCKET", file_name), "rb") as f: + pyfive.p5ncdump(f) + + captured = capsys.readouterr() + assert ('File: issue23_A.nc' in captured.out) + assert ('q:cell_methods = "area: mean"' in captured.out) + assert (':Conventions = "CF-1.12"' in captured.out) \ No newline at end of file
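
For reviewers, a minimal usage sketch of the lazy-index API and the new dump entry point added above. The file name 'example.nc' and the variable name 'q' are hypothetical, purely for illustration; the calls themselves (get_lazy_view, visititems with noindex, p5ncdump/p5dump) are the ones introduced in this diff.

import pyfive

with pyfive.File('example.nc') as f:
    # Inspect a variable without reading its chunk (b-tree) index.
    v = f.get_lazy_view('q')
    print(v.shape, v.dtype, dict(v.attrs))

    # Normal item access builds and caches the chunk index up front,
    # which is what you want if the data will be handed to Dask etc.
    data = f['q'][:]

    # Walk the whole file lazily, without touching any chunk indices.
    f.visititems(lambda name, obj: print(name), noindex=True)

From the shell, the new console script gives an ncdump-like header:
    p5dump example.nc        # plain header dump
    p5dump -s example.nc     # adds storage details (chunking, b-tree range, compression)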