|
1 | 1 | # SPDX-License-Identifier: BSD-3-Clause |
2 | | -# Copyright 2025 MoDaCor Authors |
3 | | -# |
4 | | -# Redistribution and use in source and binary forms, with or without modification, |
5 | | -# are permitted provided that the following conditions are met: |
6 | | -# 1. Redistributions of source code must retain the above copyright notice, this |
7 | | -# list of conditions and the following disclaimer. |
8 | | -# 2. Redistributions in binary form must reproduce the above copyright notice, |
9 | | -# this list of conditions and the following disclaimer in the documentation |
10 | | -# and/or other materials provided with the distribution. |
11 | | -# 3. Neither the name of the copyright holder nor the names of its contributors |
12 | | -# may be used to endorse or promote products derived from this software without |
13 | | -# specific prior written permission. |
14 | | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND |
15 | | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
16 | | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
17 | | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR |
18 | | -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
19 | | -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
20 | | -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
21 | | -# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | | -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
23 | | -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | | - |
25 | | -__license__ = "BSD-3-Clause" |
26 | | -__copyright__ = "Copyright 2025 MoDaCor Authors" |
27 | | -__status__ = "Alpha" |
| 2 | +# /usr/bin/env python3 |
| 3 | +# -*- coding: utf-8 -*- |
28 | 4 |
|
| 5 | +from __future__ import annotations |
| 6 | + |
| 7 | +from typing import Any, Optional |
| 8 | + |
| 9 | +__coding__ = "utf-8" |
| 10 | +__author__ = "Tim Snow, Brian R. Pauw" |
| 11 | +__copyright__ = "Copyright 2025, The MoDaCor team" |
| 12 | +__date__ = "22/10/2025" |
| 13 | +__status__ = "Development" # "Development", "Production" |
| 14 | +# end of header and standard imports |
| 15 | + |
| 16 | +__all__ = ["HDFLoader"] |
29 | 17 |
|
30 | 18 | from logging import WARNING |
31 | | -from os.path import abspath |
| 19 | +from pathlib import Path |
32 | 20 |
|
33 | 21 | import h5py |
| 22 | +import numpy as np |
34 | 23 |
|
35 | | -from modacor.dataclasses.basedata import BaseData |
36 | 24 | from modacor.dataclasses.messagehandler import MessageHandler |
37 | 25 |
|
| 26 | +# from modacor.dataclasses.basedata import BaseData |
| 27 | +from modacor.io.io_source import ArraySlice |
| 28 | + |
38 | 29 | from ..io_source import IoSource |
39 | 30 |
|
40 | 31 |
|
class HDFLoader(IoSource):
    """
    IoSource implementation that reads datasets, scalar metadata and attributes
    from a single HDF5 file.

    The file is opened only for the duration of each read and closed again
    immediately, so no h5py file handle outlives a method call. Values that
    have been read are kept in per-instance caches.

    Parameters
    ----------
    source_reference : str
        Identifier handed to ``IoSource.__init__``.
    logging_level : int, optional
        Level passed to the ``MessageHandler`` created for this loader.
    resource_location : Path | str | None, optional
        Location of the HDF5 file. When ``None`` no file is associated with
        the loader and every read will fail.
    """

    # Class-level placeholders; __init__ gives each instance its own containers.
    _data_cache: dict[str | tuple[str, str], np.ndarray] | None = None
    _file_path: Path | None = None
    _static_metadata_cache: dict[str, Any] | None = None

    def __init__(self, source_reference: str, logging_level=WARNING, resource_location: Path | str | None = None):
        super().__init__(source_reference)
        self.logger = MessageHandler(level=logging_level, name="HDFLoader")
        self._file_path = Path(resource_location) if resource_location is not None else None
        # Index of the file contents, filled by _preload().
        self._file_datasets: list[str] = []
        self._file_datasets_shapes: dict[str, tuple[int, ...]] = {}
        self._file_datasets_dtypes: dict[str, np.dtype] = {}
        self._data_cache = {}
        self._static_metadata_cache = {}

    def _preload(self) -> None:
        """
        Walk the HDF5 file once and index the name, shape and dtype of every dataset.

        Raises
        ------
        FileNotFoundError
            If no file path was configured or the path is not an existing file.
        OSError
            If h5py cannot open or traverse the file.
        """
        # Explicit check instead of `assert`: asserts vanish under `python -O`,
        # and a missing path (None) must not surface as an AttributeError.
        if self._file_path is None or not self._file_path.is_file():
            message = f"HDF5 file {self._file_path} does not exist."
            self.logger.error(message)
            raise FileNotFoundError(message)

        # Start from a clean index so a repeated call cannot duplicate entries.
        self._file_datasets.clear()
        self._file_datasets_shapes.clear()
        self._file_datasets_dtypes.clear()

        try:
            with h5py.File(self._file_path, "r") as f:
                f.visititems(self._find_datasets)
        except OSError as error:
            # NOTE(review): uses the same MessageHandler.error call as the
            # missing-file branch above; the previous code called
            # self.logger.log.error here - confirm against MessageHandler.
            self.logger.error(str(error))
            # Bare raise keeps the concrete OSError subclass, errno and traceback.
            raise

    def _find_datasets(self, path_name, path_object):
        """
        Callback for ``h5py.Group.visititems``: record every dataset that is visited.

        Appends the dataset path to ``_file_datasets`` and stores its shape and
        dtype. Groups are ignored. Nothing is returned, so the traversal
        continues over the whole file.
        """
        if isinstance(path_object, h5py.Dataset):
            self._file_datasets.append(path_name)
            self._file_datasets_shapes[path_name] = path_object.shape
            self._file_datasets_dtypes[path_name] = path_object.dtype

    def get_static_metadata(self, data_key):
        """
        Return the complete value stored at ``data_key``, cached after the first read.

        A ``bytes`` value is decoded as UTF-8 before it is cached and returned.
        A key that is absent from the file raises whatever h5py raises for the
        failed lookup.
        """
        if data_key not in self._static_metadata_cache:
            with h5py.File(self._file_path, "r") as f:
                value = f[data_key][()]
            # decode bytes to string if necessary
            if isinstance(value, bytes):
                value = value.decode("utf-8")
            self._static_metadata_cache[data_key] = value
        return self._static_metadata_cache[data_key]

    def get_data(self, data_key: str, load_slice: ArraySlice = ...) -> np.ndarray:
        """
        Return the dataset at ``data_key`` (or the selected part of it) as a numpy array.

        Parameters
        ----------
        data_key : str
            Path of the dataset inside the HDF5 file.
        load_slice : ArraySlice, optional
            Index expression applied to the dataset on read. The default
            ``...`` loads the whole dataset.

        Returns
        -------
        np.ndarray
            The cached array itself, not a copy; modifying it in place also
            changes what later calls return.
        """
        # The cache key must include the selection: keyed on data_key alone, a
        # second call with a different slice would get the first slice back.
        # Full loads keep the plain data_key as their key.
        # NOTE(review): assumes repr(load_slice) identifies the selection, which
        # holds for slices, ints, Ellipsis and tuples of those - confirm for
        # any other ArraySlice variants.
        cache_key = data_key if load_slice is Ellipsis else (data_key, repr(load_slice))
        if cache_key not in self._data_cache:
            with h5py.File(self._file_path, "r") as f:
                self._data_cache[cache_key] = np.asarray(f[data_key][load_slice])
        return self._data_cache[cache_key]

    def get_data_shape(self, data_key: str) -> tuple[int, ...]:
        """
        Return the indexed shape of ``data_key``, or ``()`` if it is not in the index.

        The index is filled by ``_preload``; before that every key yields ``()``.
        """
        return self._file_datasets_shapes.get(data_key, ())

    def get_data_dtype(self, data_key: str) -> np.dtype | None:
        """
        Return the indexed dtype of ``data_key``, or ``None`` if it is not in the index.

        The index is filled by ``_preload``; before that every key yields ``None``.
        """
        return self._file_datasets_dtypes.get(data_key)

    def get_data_attributes(self, data_key: str) -> dict[str, Any]:
        """
        Return the HDF5 attributes attached to ``data_key`` as a plain dict.

        An empty dict is returned when ``data_key`` is not present in the file.
        Attributes are read from the file on every call; they are not cached.
        """
        with h5py.File(self._file_path, "r") as f:
            if data_key not in f:
                return {}
            # Materialise while the file is still open.
            return dict(f[data_key].attrs.items())
0 commit comments