From 34dea8813e796c9bca1afaa9750bb5876ee0b436 Mon Sep 17 00:00:00 2001 From: Alexey Kozhevin Date: Tue, 30 Jul 2024 18:03:58 +0300 Subject: [PATCH] Add docs (#13) * Update docstrings and add docs --- .github/workflows/doc.yml | 28 ++++ docs/Makefile | 20 +++ docs/api/segfast.loader.rst | 5 + docs/api/segfast.memmap_loader.rst | 8 + docs/api/segfast.rst | 12 ++ docs/api/segfast.segyio_loader.rst | 13 ++ docs/api/segfast.trace_header_spec.rst | 8 + docs/api/segfast.utils.rst | 5 + docs/conf.py | 57 +++++++ docs/index.rst | 40 +++++ docs/installation.rst | 14 ++ docs/make.bat | 35 +++++ docs/segy.rst | 29 ++++ docs/start.rst | 31 ++++ pylintrc | 2 +- segfast/loader.py | 21 ++- segfast/memmap_loader.py | 197 +++++++++++++++---------- segfast/segyio_loader.py | 96 +++++++----- segfast/trace_header_spec.py | 33 ++++- segfast/utils.py | 5 +- 20 files changed, 530 insertions(+), 129 deletions(-) create mode 100644 .github/workflows/doc.yml create mode 100644 docs/Makefile create mode 100644 docs/api/segfast.loader.rst create mode 100644 docs/api/segfast.memmap_loader.rst create mode 100644 docs/api/segfast.rst create mode 100644 docs/api/segfast.segyio_loader.rst create mode 100644 docs/api/segfast.trace_header_spec.rst create mode 100644 docs/api/segfast.utils.rst create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/installation.rst create mode 100644 docs/make.bat create mode 100644 docs/segy.rst create mode 100644 docs/start.rst diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml new file mode 100644 index 0000000..4f9e765 --- /dev/null +++ b/.github/workflows/doc.yml @@ -0,0 +1,28 @@ +name: documentation + +on: [push, pull_request, workflow_dispatch] + +permissions: + contents: write + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - name: Install dependencies + run: | + pip install sphinx sphinx_rtd_theme myst_parser + pip install . + - name: Sphinx build + run: | + sphinx-build docs _build + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + with: + publish_branch: gh-pages + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: _build/ + force_orphan: true diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/api/segfast.loader.rst b/docs/api/segfast.loader.rst new file mode 100644 index 0000000..b0c0e49 --- /dev/null +++ b/docs/api/segfast.loader.rst @@ -0,0 +1,5 @@ +====== +Loader +====== + +.. 
autofunction:: segfast.loader.open
diff --git a/docs/api/segfast.memmap_loader.rst b/docs/api/segfast.memmap_loader.rst
new file mode 100644
index 0000000..6e33963
--- /dev/null
+++ b/docs/api/segfast.memmap_loader.rst
@@ -0,0 +1,8 @@
+============
+MemmapLoader
+============
+
+.. autoclass:: segfast.memmap_loader.MemmapLoader
+    :members:
+    :undoc-members:
+    :member-order: bysource
diff --git a/docs/api/segfast.rst b/docs/api/segfast.rst
new file mode 100644
index 0000000..7065abf
--- /dev/null
+++ b/docs/api/segfast.rst
@@ -0,0 +1,12 @@
+===
+API
+===
+
+.. toctree::
+    :maxdepth: 5
+
+    segfast.loader
+    segfast.memmap_loader
+    segfast.segyio_loader
+    segfast.trace_header_spec
+    segfast.utils
diff --git a/docs/api/segfast.segyio_loader.rst b/docs/api/segfast.segyio_loader.rst
new file mode 100644
index 0000000..0fcfdc5
--- /dev/null
+++ b/docs/api/segfast.segyio_loader.rst
@@ -0,0 +1,13 @@
+============
+SegyioLoader
+============
+
+.. autoclass:: segfast.segyio_loader.SegyioLoader
+    :members:
+    :undoc-members:
+    :member-order: bysource
+
+.. autoclass:: segfast.segyio_loader.SafeSegyioLoader
+    :members:
+    :undoc-members:
+    :member-order: bysource
diff --git a/docs/api/segfast.trace_header_spec.rst b/docs/api/segfast.trace_header_spec.rst
new file mode 100644
index 0000000..de91ee5
--- /dev/null
+++ b/docs/api/segfast.trace_header_spec.rst
@@ -0,0 +1,8 @@
+===============
+TraceHeaderSpec
+===============
+
+.. autoclass:: segfast.trace_header_spec.TraceHeaderSpec
+    :members:
+    :undoc-members:
+    :member-order: bysource
diff --git a/docs/api/segfast.utils.rst b/docs/api/segfast.utils.rst
new file mode 100644
index 0000000..49f350b
--- /dev/null
+++ b/docs/api/segfast.utils.rst
@@ -0,0 +1,5 @@
+=====
+Utils
+=====
+
+.. autoclass:: segfast.utils.ForPoolExecutor
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..bca8d23
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,57 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+import sys, os
+sys.path.insert(0, os.path.abspath('..'))
+import segfast
+
+master_doc = 'index'
+
+project = 'segfast'
+author = 'Analysis Center'
+copyright = '2024, ' + author
+
+release = segfast.__version__
+version = '.'.join(release.split('.')[:2])
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.doctest',
+    'sphinx.ext.coverage',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.githubpages',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.napoleon',
+    'sphinx_rtd_theme',
+]
+
+templates_path = ['_templates']
+exclude_patterns = []
+language = 'en'
+
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_title = "SegFast"
+html_theme = "sphinx_rtd_theme"
+html_static_path = ['_static']
+html_theme_options = {
+    'logo_only': False
+}
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3/', None),
+    'numpy': ('https://numpy.org/doc/stable/', None),
+    'segyio': ('https://segyio.readthedocs.io/en/latest/', None)
+}
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..82b70f5
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,40 @@
+.. segfast documentation master file, created by
+   sphinx-quickstart on Thu Feb 1 14:09:14 2024.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+segfast documentation
+=====================
+
+**segfast** is a library for interacting with SEG-Y seismic data. Main features are:
+
+* Faster access to read data: both trace headers and values
+* Optional bufferization, where the user can provide a preallocated buffer to load the data into
+* Convenient API that relies on :class:`numpy.memmap` for most operations, while providing
+  `segyio `_ as a fallback engine
+
+
+Implementation details
+----------------------
+We rely on **segyio** to infer file-wide parameters.
+
+For headers and traces, we use custom methods of reading binary data.
+
+Main differences from the **segyio** C++ implementation:
+    - we read all of the requested headers in one file-wide sweep, speeding up reading by an order of magnitude
+      compared to the **segyio** sequential read of every requested header.
+      Also, we do that in multiple processes across chunks.
+
+    - a memory map over trace data is used for loading values. Avoiding redundant copies and leveraging
+      :mod:`numpy` vectorization speeds up reading, especially when slicing traces along the samples axis.
+      This is especially relevant when loading horizontal (depth) slices.
+
+
+.. toctree::
+    :maxdepth: 1
+    :titlesonly:
+
+    installation
+    start
+    segy
+    api/segfast
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..1ecd479
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,14 @@
+Installation
+============
+
+* With ``pip``/``pip3``:
+
+  .. code-block:: bash
+
+      pip3 install segfast
+
+* Developer version (add ``--depth 1`` if needed):
+
+  .. code-block:: bash
+
+      git clone https://github.com/analysiscenter/segfast.git
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..32bb245
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/segy.rst b/docs/segy.rst
new file mode 100644
index 0000000..91c4486
--- /dev/null
+++ b/docs/segy.rst
@@ -0,0 +1,29 @@
+SEG-Y description
+=================
+
+The most complete description can be found in `the official SEG-Y specification `_, but here we give
+a brief intro to the SEG-Y format.
+
+A SEG-Y file is a binary file divided into several blocks:
+
+- a file-wide information block, which in most cases takes the first 3600 bytes:
+
+  - **textual header**: the first 3200 bytes are reserved for textual info about the file. Most software uses
+    this header to keep acquisition metadata, date of creation, author, etc.
+  - **binary header**: 3200–3600 bytes contain file-wide headers, which describe the number of traces, the format used
+    for storing numbers, the number of samples for each trace, acquisition parameters, etc.
+  - (optional) 3600+ bytes can be used to store the **extended textual information**. If there is such a header,
+    then this is indicated by the value in one of the 3200–3600 bytes.
+
+- a sequence of traces, where each trace is a combination of its header and signal data:
+
+  - **trace header** takes the first 240 bytes and describes the meta info about its trace: shot/receiver coordinates,
+    the method of acquisition, current trace length, etc. Analogously to the binary file header, each trace
+    can also have extended headers.
+  - **trace data** is usually an array of amplitude values, which can be stored in various numerical types.
+    As the original SEG-Y is quite old (1975), one of those numerical formats is IBM float,
+    which is very different from standard IEEE floats; therefore, special caution is required to
+    correctly decode values from such files.
+
+For the most part, SEG-Y files are written with a constant size of each trace, although the standard itself allows
+for variable-sized traces. We do not work with such files.
diff --git a/docs/start.rst b/docs/start.rst
new file mode 100644
index 0000000..515dcad
--- /dev/null
+++ b/docs/start.rst
@@ -0,0 +1,31 @@
+Quick start
+===========
+
+* Open the file:
+
+  .. code-block:: python
+
+      import segfast
+      segy_file = segfast.open('/path/to/file.sgy')
+
+* Load headers:
+
+  .. code-block:: python
+
+      headers = segy_file.load_headers(['CDP_X', 'CDP_Y', 'INLINE_3D', 'CROSSLINE_3D'])
+
+* Load an inline:
+
+  .. code-block:: python
+
+      traces_idx = headers[headers['INLINE_3D'] == INLINE_IDX].index
+      inline = segy_file.load_traces(traces_idx)
+
+* Load certain depths from all traces:
+
+  .. code-block:: python
+
+      segy_file.load_depth_slices(DEPTHS)
+
+  The resulting array will have shape ``(n_traces, len(DEPTHS))``, so it must be post-processed into an array of
+  the field shape.
diff --git a/pylintrc b/pylintrc
index 8ff5727..f6505c0 100644
--- a/pylintrc
+++ b/pylintrc
@@ -15,7 +15,7 @@ variable-rgx=(.*[a-z][a-z0-9_]{1,30}|[a-z_])$ # snake_case + single letters
 argument-rgx=(.*[a-z][a-z0-9_]{1,30}|[a-z_])$ # snake_case + single letters
 
 [MESSAGE CONTROL]
-disable=no-value-for-parameter, no-self-use, too-few-public-methods, unsubscriptable-object, no-member, too-many-lines,
+disable=no-value-for-parameter, too-few-public-methods, unsubscriptable-object, no-member, too-many-lines,
         arguments-differ, too-many-locals, import-error, cyclic-import, duplicate-code, relative-beyond-top-level,
         unused-argument, too-many-public-methods, invalid-name, attribute-defined-outside-init, arguments-renamed,
         abstract-method, no-name-in-module, import-self
diff --git a/segfast/loader.py b/segfast/loader.py
index a0086d8..f84b29f 100644
--- a/segfast/loader.py
+++ b/segfast/loader.py
@@ -5,7 +5,26 @@
 
 def Loader(path, engine='memmap', endian='big', strict=False, ignore_geometry=True):
-    """ Selector class for loading SEG-Y with either segyio-based loader or memmap-based one. """
+    """ Selector class for loading SEG-Y with either a segyio-based loader or a memmap-based one.
+
+    Parameters
+    ----------
+    path : str
+        Path to the SEG-Y file.
+    engine : 'memmap' or 'segyio'
+        Engine to load data from the file: ``'memmap'`` is based on :class:`numpy.memmap` created for the whole file,
+        while ``'segyio'`` uses the **segyio** library instruments. In any case, **segyio** is used to load
+        information about the entire file (e.g. ``'sample_interval'`` or ``'shape'``).
+    endian : 'big' or 'little'
+        Byte order in the file.
+    strict : bool
+        See :func:`segyio.open`.
+    ignore_geometry : bool
+        See :func:`segyio.open`.
+
+    Return
+    ------
+    :class:`~.memmap_loader.MemmapLoader` or :class:`~.segyio_loader.SegyioLoader`
+    """
     loader_class = _select_loader_class(engine)
     return loader_class(path=path, endian=endian, strict=strict, ignore_geometry=ignore_geometry)
 open = File = Loader
diff --git a/segfast/memmap_loader.py b/segfast/memmap_loader.py
index 3dd514b..91b4568 100755
--- a/segfast/memmap_loader.py
+++ b/segfast/memmap_loader.py
@@ -18,44 +18,45 @@
 
 class MemmapLoader(SegyioLoader):
     """ Custom reader/writer for SEG-Y files.
-    Relies on memory mapping mechanism for actual reads of headers and traces.
+    Relies on a memory mapping mechanism for actual reads of headers and traces.
 
     SEG-Y description
     -----------------
-    Here we give a brief intro into SEG-Y format. Each SEG-Y file consists of:
-    - file-wide information, in most cases the first 3600 bytes.
-        - the first 3200 bytes are reserved for textual info about the file.
-        Most software uses this to keep track of processing operations, date of creation, author, etc.
-        - 3200-3600 bytes contain file-wide headers, which describe the number of traces,
-        used format, depth, acquisition parameters, etc.
-        - 3600+ bytes can be used to store the extended textual information, which is optional and indicated by
-        one of the values in 3200-3600 bytes.
-
-    - a sequence of traces, where each trace is a combination of header and its actual data.
-        - header is the first 240 bytes and it describes the meta info about that trace:
-        its coordinates in different types, the method of acquisition, etc.
-        - data is an array of values, usually amplitudes, which can be stored in multiple numerical types.
-        As the original SEG-Y is quite old (1975), one of those numerical formats is IBM float,
-        which is very different from standard IEEE floats; therefore, a special caution is required to
-        correctly decode values from such files.
-
-    For the most part, SEG-Y files are written with constant size of each trace, although the standard itself allows
-    for variable-sized traces. We do not work with such files.
+    A SEG-Y file is a binary file divided into several blocks:
+
+    - a file-wide information block, which in most cases takes the first 3600 bytes:
+
+        - **textual header**: the first 3200 bytes are reserved for textual info about the file. Most
+          software uses this header to keep acquisition metadata, date of creation, author, etc.
+        - **binary header**: 3200–3600 bytes contain file-wide headers, which describe the number of traces,
+          the format used for storing numbers, the number of samples for each trace, acquisition parameters, etc.
+        - (optional) 3600+ bytes can be used to store the **extended textual information**. If there is
+          such a header, then this is indicated by the value in one of the 3200–3600 bytes.
+
+    - a sequence of traces, where each trace is a combination of its header and signal data:
+
+        - **trace header** takes the first 240 bytes and describes the meta info about its trace: shot/receiver
+          coordinates, the method of acquisition, current trace length, etc. Analogously to the binary file header,
+          each trace can also have extended headers.
+        - **trace data** is usually an array of amplitude values, which can be stored in various numerical types.
+          As the original SEG-Y is quite old (1975), one of those numerical formats is IBM float,
+          which is very different from standard IEEE floats; therefore, special caution is required to
+          correctly decode values from such files.
+
+    For the most part, SEG-Y files are written with a constant size of each trace, although the standard itself allows
+    for variable-sized traces. We do not work with such files.
 
     Implementation details
     ----------------------
-    We rely on `segyio` to infer file-wide parameters.
-
-    For headers and traces, we use custom methods of reading binary data.
-    Main differences to `segyio C++` implementation:
+    We rely on :mod:`segyio` to infer file-wide parameters. For headers and traces, we use custom methods of reading
+    binary data. Main differences from the :mod:`segyio` C++ implementation:
 
     - we read all of the requested headers in one file-wide sweep, speeding up reading by an order of magnitude
-    compared to the `segyio` sequential read of every requested header.
-    Also, we do that in multiple processes across chunks.
+      compared to the :mod:`segyio` sequential read of every requested header.
+      Also, we do that in multiple processes across chunks.
 
-    - a memory map over traces data is used for loading values. Avoiding redundant copies and leveraging
-    `numpy` superiority allows to speed up reading, especially in case of trace slicing along the samples axis.
-    This is extra relevant in case of loading horizontal (depth) slices.
+    - a memory map over trace data is used for loading values. Avoiding redundant copies and leveraging
+      :mod:`numpy` vectorization speeds up reading, especially when slicing traces along the samples
+      axis. This is especially relevant when loading horizontal (depth) slices.
     """
     def __init__(self, path, endian='big', strict=False, ignore_geometry=True):
         # Re-use most of the file-wide attributes from the `segyio` loader
@@ -92,9 +93,9 @@ def _construct_data_mmap(self):
     def load_headers(self, headers, indices=None, reconstruct_tsf=True, sort_columns=True, return_specs=False,
                      chunk_size=25_000, max_workers=4, pbar=False, **kwargs):
         """ Load requested trace headers from a SEG-Y file for each trace into a dataframe.
-        If needed, we reconstruct the `'TRACE_SEQUENCE_FILE'` manually be re-indexing traces.
+        If needed, we reconstruct the ``'TRACE_SEQUENCE_FILE'`` manually by re-indexing traces.
 
         Under the hood, we create a memory mapping over the SEG-Y file, and view it with a special dtype.
         That dtype skips all of the trace data bytes and all of the unrequested headers,
         leaving only the passed `headers` as non-void dtype.
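+        For example, reading ``INLINE_3D`` (bytes 189-192) for every trace boils down to the following simplified
+        sketch (assumes 4-byte samples, no extended textual headers, ``numpy`` imported as ``np``, and an
+        illustrative ``n_samples``; the actual implementation differs):
+
+        .. code-block:: python
+
+            trace_size = 240 + 4 * n_samples  # header bytes + data bytes per trace
+            dtype = np.dtype([('_before', np.void, 188),
+                              ('INLINE_3D', '>i4'),
+                              ('_after', np.void, trace_size - 192)])
+            mmap = np.memmap('/path/to/file.sgy', mode='r', offset=3600, dtype=dtype)
+            values = mmap['INLINE_3D']  # the requested header for all traces in one sweep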
@@ -104,11 +105,11 @@ def load_headers(self, headers, indices=None, reconstruct_tsf=True, sort_columns
         ----------
         headers : sequence
             An array-like where each element can be:
-            - str -- header name,
-            - int -- header starting byte,
-            - :class:~`.utils.TraceHeaderSpec` -- used as is,
-            - tuple -- args to init :class:~`.utils.TraceHeaderSpec`,
-            - dict -- kwargs to init :class:~`.utils.TraceHeaderSpec`.
+            - ``str`` -- header name,
+            - ``int`` -- header starting byte,
+            - :class:`~.trace_header_spec.TraceHeaderSpec` -- used as is,
+            - ``tuple`` -- args to init :class:`~.trace_header_spec.TraceHeaderSpec`,
+            - ``dict`` -- kwargs to init :class:`~.trace_header_spec.TraceHeaderSpec`.
         indices : sequence or None
             Indices of traces to load trace headers for. If not given, trace headers are loaded for all traces.
         reconstruct_tsf : bool
@@ -119,25 +120,50 @@
             Whether to return header specs used to load trace headers.
         chunk_size : int
             Maximum amount of traces in each chunk.
-        max_workers : int or None
-            Maximum number of parallel processes to spawn. If None, then the number of CPU cores is used.
-        pbar : bool, str
+        max_workers : int, optional
+            Maximum number of parallel processes to spawn. If ``None``, then the number of CPU cores is used.
+        pbar : bool or str
             If bool, then whether to display progress bar over the file sweep.
-            If str, then type of progress bar to display: `'t'` for textual, `'n'` for widget.
+            If str, then the type of progress bar to display: ``'t'`` for textual, ``'n'`` for widget.
+
+        Return
+        ------
+        ``pandas.DataFrame``
 
         Examples
         --------
-        Standard 'CDP_X' and 'CDP_Y' headers:
-        >>> segfast_file.load_headers(['CDP_X', 'CDP_Y'])
-        Standard headers from 181 and 185 bytes with standard dtypes:
-        >>> segfast_file.load_headers([181, 185])
-        Load 'CDP_X' and 'CDP_Y' from non-standard bytes positions corresponding to some standard headers (i.e. load
-        'CDP_X' from bytes for 'INLINE_3D' with '<i4' dtype and 'CDP_Y' from bytes for 'CROSSLINE_3D'):
-        >>> segfast_file.load_headers([{'name': 'CDP_X', 'start_byte': 189, 'dtype': '<i4'},
-        ...                            {'name': 'CDP_Y', 'start_byte': 193, 'dtype': '<i4'}])
-        Load headers with non-standard dtypes from some bytes positions:
-        >>> segfast_file.load_headers([('CDP_X', 45, '>f4'), ('CDP_Y', 10, '>f4')])
-        Load 'FieldRecord' header for the first 5 traces:
-        >>> segfast_file.load_headers(['FieldRecord'], indices=np.arange(5))
+        * Standard ``'CDP_X'`` and ``'CDP_Y'`` headers:
+
+          .. code-block:: python
+
+              segfast_file.load_headers(['CDP_X', 'CDP_Y'])
+
+        * Standard headers from 181 and 185 bytes with standard dtypes:
+
+          .. code-block:: python
+
+              segfast_file.load_headers([181, 185])
+
+        * Load ``'CDP_X'`` and ``'CDP_Y'`` from non-standard bytes positions corresponding to some standard headers
+          (i.e. load ``'CDP_X'`` from bytes for ``'INLINE_3D'`` with ``'<i4'`` dtype and ``'CDP_Y'`` from bytes for
+          ``'CROSSLINE_3D'``):
+
+          .. code-block:: python
+
+              segfast_file.load_headers([{'name': 'CDP_X', 'start_byte': 189, 'dtype': '<i4'},
+                                         {'name': 'CDP_Y', 'start_byte': 193, 'dtype': '<i4'}])
+
+        * Load headers with non-standard dtypes from some bytes positions:
+
+          .. code-block:: python
+
+              segfast_file.load_headers([('CDP_X', 45, '>f4'), ('CDP_Y', 10, '>f4')])
+
+        * Load ``'FieldRecord'`` header for the first 5 traces:
+
+          .. code-block:: python
+
+              segfast_file.load_headers(['FieldRecord'], indices=np.arange(5))
+
         """
         _ = kwargs
         headers = self.make_headers_specs(headers)
@@ -201,23 +227,28 @@ def callback(future, start):
 
     @staticmethod
     def _make_mmap_headers_dtype(headers):
-        """ Create list of `numpy` dtypes to view headers data.
+        """ Create a list of :mod:`numpy` dtypes to view header data.
 
         Defines a dtype for exactly 240 bytes, where each of the requested headers would have its own named subdtype,
-        and the rest of bytes are lumped into `np.void` of certain lengths.
+        and the rest of the bytes are lumped into :class:`numpy.void` of certain lengths.
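+
+        A rough sketch of the padding logic (illustrative names and attributes, not the actual implementation;
+        start bytes are treated as 1-based, per the SEG-Y convention):
+
+        .. code-block:: python
+
+            dtype_list, position = [], 0
+            for i, spec in enumerate(sorted(headers, key=lambda x: x.start_byte)):
+                offset = spec.start_byte - 1            # convert 1-based start byte to 0-based offset
+                if offset > position:                   # lump the gap into an unused void filler
+                    dtype_list.append((f'unused_{i}', np.void, offset - position))
+                dtype_list.append((spec.name, spec.dtype))
+                position = offset + spec.dtype.itemsize
+            if position < 240:                          # pad the tail up to the full 240-byte header
+                dtype_list.append(('unused_tail', np.void, 240 - position))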
 
-        Only the headers data should be viewed under this dtype: the rest of trace data (values)
+        Only the header data should be viewed under this dtype: the rest of the trace data (values)
         should be processed (or skipped) separately.
 
-        We do not apply final conversion to `np.dtype` to the resulting list of dtypes so it is easier to append to it.
+        We do not apply the final conversion to :class:`numpy.dtype` to the resulting list of dtypes so it is easier
+        to append to it.
 
         Examples
         --------
-        if `headers` are `INLINE_3D` and `CROSSLINE_3D`, which are 189-192 and 193-196 bytes, the output would be:
-        >>> [('unused_0', numpy.void, 188),
-        >>>  ('INLINE_3D', '>i4'),
-        >>>  ('CROSSLINE_3D', '>i4'),
-        >>>  ('unused_1', numpy.void, 44)]
+        If ``headers`` are ``'INLINE_3D'`` and ``'CROSSLINE_3D'``, which are 189-192 and 193-196 bytes, the output
+        would be:
+
+        .. code-block:: python
+
+            [('unused_0', numpy.void, 188),
+             ('INLINE_3D', '>i4'),
+             ('CROSSLINE_3D', '>i4'),
+             ('unused_1', numpy.void, 44)]
         """
         headers = sorted(headers, key=lambda x: x.start_byte)
@@ -250,18 +281,22 @@ def _make_mmap_headers_dtype(headers):
 
     # Traces loading
     def load_traces(self, indices, limits=None, buffer=None):
         """ Load traces by their indices.
-        Under the hood, we use a pre-made memory mapping over the file, where trace data is viewed with a special dtype.
+        Under the hood, we use a pre-made memory mapping over the file, where trace data is viewed with a special
+        dtype.
         Regardless of the numerical dtype of the SEG-Y file, we output IEEE float32:
         for IBM floats, that requires an additional conversion.
 
         Parameters
         ----------
         indices : sequence
-            Indices (TRACE_SEQUENCE_FILE) of the traces to read.
+            Indices (``'TRACE_SEQUENCE_FILE'``) of the traces to read.
         limits : sequence of ints, slice, optional
            Slice of the data along the depth axis.
-        buffer : np.ndarray, optional
+        buffer : numpy.ndarray, optional
             Buffer to read the data into. If possible, avoids copies.
+
+        Return
+        ------
+        numpy.ndarray
         """
         limits = self.process_limits(limits)
@@ -281,14 +316,18 @@ def load_traces(self, indices, limits=None, buffer=None):
     def load_depth_slices(self, indices, buffer=None):
         """ Load horizontal (depth) slices of the data.
-        Requires a ~full sweep through SEG-Y, therefore is slow.
+        Requires an almost full sweep through the SEG-Y file and is therefore slow.
 
         Parameters
         ----------
         indices : sequence
             Indices (ordinals) of the depth slices to read.
-        buffer : np.ndarray, optional
+        buffer : numpy.ndarray, optional
             Buffer to read the data into. If possible, avoids copies.
+
+        Return
+        ------
+        numpy.ndarray
         """
         depth_slices = self.data_mmap[:, indices]
         if self.file_format == 1:
@@ -316,7 +355,7 @@ def __getstate__(self):
         return state
 
     def __setstate__(self, state):
-        """ Recreate instance from unpickled state, reopen source SEG-Y file and memmap. """
+        """ Recreate an instance from the unpickled state, reopen the source SEG-Y file and memmap. """
         super().__setstate__(state)
         self.data_mmap = self._construct_data_mmap()
 
@@ -324,7 +363,7 @@ def __setstate__(self, state):
     # Conversion to other SEG-Y formats (data dtype)
     def convert(self, path=None, format=8, transform=None, chunk_size=25_000, max_workers=4, pbar='t',
                 overwrite=True):
-        """ Convert SEG-Y file to a different `format`: dtype of data values.
+        """ Convert a SEG-Y file to a different ``format``: dtype of data values.
         Keeps the same binary header (except for the 3225 byte, which stores the format).
         Keeps the same header values for each trace: essentially, only the values of each trace are transformed.
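+
+        For instance, converting a file to IEEE float32 values might look like this (a sketch; ``format=5`` is
+        the IEEE float32 data format in the SEG-Y standard and is assumed to be among the supported formats,
+        and the path is illustrative):
+
+        .. code-block:: python
+
+            new_path = segfast_file.convert(path='/path/to/converted.sgy', format=5)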
@@ -334,22 +373,26 @@
 
         Parameters
         ----------
         path : str, optional
-            Path to save file to. If not provided, we use the path of the current cube with an added postfix.
+            Path to save the file to. If not provided, we use the path of the current cube with an added postfix.
         format : int
             Target SEG-Y format.
-            Refer to :attr:`SEGY_FORMAT_TO_TRACE_DATA_DTYPE` for list of available formats and their data value dtype.
+            Refer to :attr:`.SEGY_FORMAT_TO_TRACE_DATA_DTYPE` for the list of available formats and their data value
+            dtype.
         transform : callable, optional
-            Callable to transform data from the current file to the ones, saved in `path`.
-            Must return the same dtype, as specified by `format`.
+            Callable to transform data from the current file to the one saved in ``path``.
+            Must return the same dtype as specified by ``format``.
         chunk_size : int
             Maximum amount of traces in each chunk.
         max_workers : int or None
             Maximum number of parallel processes to spawn. If None, then the number of CPU cores is used.
         pbar : bool, str
-            If bool, then whether to display progress bar.
-            If str, then type of progress bar to display: `'t'` for textual, `'n'` for widget.
+            If bool, then whether to display a progress bar.
+            If str, then the type of progress bar to display: ``'t'`` for textual, ``'n'`` for widget.
         overwrite : bool
-            Whether to overwrite existing `path` or raise an exception.
+            Whether to overwrite the existing ``path`` or raise an exception.
+
+        Return
+        ------
+        path : str
         """
         #pylint: disable=redefined-builtin
         # Default path
@@ -433,7 +476,7 @@ def read_chunk(path, shape, offset, mmap_dtype, buffer_dtype, headers, indices):
 
 def convert_chunk(src_path, dst_path, shape, offset, src_dtype, dst_dtype, endian, transform, start, chunk_size):
-    """ Copy the headers, transform and write data from one chunk.
+    """ Copy the headers, transform, and write data from one chunk.
     We create all memory mappings anew in each worker, as it is easier and creates no significant overhead.
     """
     # Deserialize `transform`
@@ -465,8 +508,8 @@ def convert_chunk(src_path, dst_path, shape, offset, src_dtype, dst_dtype, endia
 @njit(nogil=True, parallel=True)
 def ibm_to_ieee(hh, hl, lh, ll):
     """ Convert 4 arrays representing individual bytes of IBM 4-byte floats into a single array of floats.
-    Input arrays are ordered from most to least significant bytes and have `np.uint8` dtypes.
-    The result is returned as an `np.float32` array.
+    Input arrays are ordered from most to least significant bytes and have ``numpy.uint8`` dtypes.
+    The result is returned as a ``numpy.float32`` array.
     """
     # pylint: disable=not-an-iterable
     res = np.empty_like(hh, dtype=np.float32)
diff --git a/segfast/segyio_loader.py b/segfast/segyio_loader.py
index dfc20a5..b28e50b 100755
--- a/segfast/segyio_loader.py
+++ b/segfast/segyio_loader.py
@@ -12,12 +12,15 @@
 
 class SegyioLoader:
-    """ A thin wrapper around `segyio` library for convenient loading of headers and traces.
+    """ A thin wrapper around the **segyio** library for convenient loading of headers and traces.
+
+    Most of the methods directly call the public API of **segyio**.
+
+    For trace loading, we use private methods and attributes of :class:`segyio.SegyFile`, which allow:
+
+    * reading data into a pre-defined buffer
+    * reading only parts of the trace
 
-    Most of the methods directly call public API of `segyio`.
-    For trace loading we use private methods and attributes of `segyio.SegyFile`, which allow:
-    - reading data into pre-defined buffer
-    - read only parts of the trace.
 
     This gives up to 50% speed-up over the public API for the scenario of loading a sequence of traces,
     and up to 15% over the public API in case of loading full lines (inlines or crosslines).
     """
     SEGY_FORMAT_TO_TRACE_DATA_DTYPE = {
         11: "u2",
         12: "u8",
         16: "u1",
-    }
+    } #: :meta private:
 
     ENDIANNESS_TO_SYMBOL = {
         "big": ">",
 
         "little": "<",
         "lsb": "<",
-    }
+    } #: :meta private:
 
     def __init__(self, path, endian='big', strict=False, ignore_geometry=True):
         # Parse arguments for errors
@@ -93,22 +96,23 @@ def delay(self):
     def load_headers(self, headers, indices=None, reconstruct_tsf=True, sort_columns=True, return_specs=False,
                      tracewise=True, pbar=False, **kwargs):
         """ Load requested trace headers from a SEG-Y file for each trace into a dataframe.
-        If needed, we reconstruct the `'TRACE_SEQUENCE_FILE'` manually be re-indexing traces.
+        If needed, we reconstruct the ``'TRACE_SEQUENCE_FILE'`` manually by re-indexing traces.
 
         Parameters
         ----------
         headers : sequence
             An array-like where each element can be:
-            - str -- header name,
-            - int -- header starting byte,
-            - :class:~`.utils.TraceHeaderSpec` -- used as is,
-            - tuple -- args to init :class:~`.utils.TraceHeaderSpec`,
-            - dict -- kwargs to init :class:~`.utils.TraceHeaderSpec`.
+            - ``str`` -- header name,
+            - ``int`` -- header starting byte,
+            - :class:`~.trace_header_spec.TraceHeaderSpec` -- used as is,
+            - ``tuple`` -- args to init :class:`~.trace_header_spec.TraceHeaderSpec`,
+            - ``dict`` -- kwargs to init :class:`~.trace_header_spec.TraceHeaderSpec`.
+
+            Note that :class:`.SegyioLoader` ignores all non-standard header byte positions and dtypes.
         indices : sequence or None
             Indices of traces to load trace headers for. If not given, trace headers are loaded for all traces.
         reconstruct_tsf : bool
-            Whether to reconstruct `TRACE_SEQUENCE_FILE` manually.
+            Whether to reconstruct ``TRACE_SEQUENCE_FILE`` manually.
         sort_columns : bool
             Whether to sort columns in the resulting dataframe by their starting bytes.
         return_specs : bool
             Whether to return header specs used to load trace headers.
         tracewise : bool
             Whether to iterate over the file in a trace-wise manner, instead of header-wise.
         pbar : bool, str
-            If bool, then whether to display progress bar over the file sweep.
-            If str, then type of progress bar to display: `'t'` for textual, `'n'` for widget.
+            If ``bool``, then whether to display a progress bar over the file sweep.
+            If ``str``, then the type of progress bar to display: ``'t'`` for textual, ``'n'`` for widget.
+
+        Return
+        ------
+        ``pandas.DataFrame``
         """
         _ = kwargs
         headers = self.make_headers_specs(headers)
@@ -157,7 +165,10 @@ def load_header(self, header, indices=None, **kwargs):
 
     @staticmethod
     def postprocess_headers_dataframe(dataframe, headers, indices=None, reconstruct_tsf=True, sort_columns=True):
-        """ Optionally add TSF header and sort columns of a headers dataframe. """
+        """ Optionally add the ``'TRACE_SEQUENCE_FILE'`` header and sort the columns of a headers dataframe.
+
+        :meta private:
+        """
         if reconstruct_tsf:
             if indices is None:
                 dtype = np.int32 if len(dataframe) < np.iinfo(np.int32).max else np.int64
@@ -174,7 +185,7 @@ def postprocess_headers_dataframe(dataframe, headers, indices=None, reconstruct_
         return dataframe, headers
 
     def make_headers_specs(self, headers):
-        """ Make instances of TraceHeaderSpec. """
+        """ Transform a list of headers into a list of :class:`~.trace_header_spec.TraceHeaderSpec` instances. """
         byteorder = self.ENDIANNESS_TO_SYMBOL[self.endian]
 
         if headers == 'all':
@@ -207,10 +218,10 @@ def load_traces(self, indices, limits=None, buffer=None):
         Parameters
         ----------
         indices : sequence
-            Indices (TRACE_SEQUENCE_FILE) of the traces to read.
+            Indices (``TRACE_SEQUENCE_FILE``) of the traces to read.
         limits : sequence of ints, slice, optional
             Slice of the data along the depth axis.
-        buffer : np.ndarray, optional
+        buffer : numpy.ndarray, optional
             Buffer to read the data into. If possible, avoids copies.
         """
         limits = self.process_limits(limits)
@@ -224,7 +235,7 @@ def load_traces(self, indices, limits=None, buffer=None):
         return buffer
 
     def process_limits(self, limits):
-        """ Convert given `limits` to a `slice`. """
+        """ Convert given ``limits`` to a ``slice`` instance. """
         if limits is None:
             return slice(0, self.n_samples, 1)
         if isinstance(limits, int):
@@ -241,7 +252,7 @@ def process_limits(self, limits):
         return slice(*indices)
 
     def load_trace(self, index, buffer, limits):
-        """ Load one trace into buffer. """
+        """ Load one trace into the buffer. """
         self.file_handler.xfd.gettr(buffer, index, 1, 1,
                                     limits.start, limits.stop, limits.step,
                                     buffer.size)
@@ -250,14 +261,18 @@ def load_trace(self, index, buffer, limits):
 
     # Data loading: depth slices
     def load_depth_slices(self, indices, buffer=None):
         """ Load horizontal (depth) slices of the data.
-        Requires a ~full sweep through SEG-Y, therefore is slow.
+        Requires an almost full sweep through the SEG-Y file and is therefore slow.
 
         Parameters
         ----------
         indices : sequence
             Indices (ordinals) of the depth slices to read.
-        buffer : np.ndarray, optional
+        buffer : numpy.ndarray, optional
             Buffer to read the data into. If possible, avoids copies.
+
+        Return
+        ------
+        numpy.ndarray
         """
         if buffer is None:
             buffer = np.empty((len(indices), self.n_traces), dtype=self.dtype)
@@ -273,13 +288,13 @@ def load_depth_slice(self, index, buffer):
 
     # Convenience and utility methods
     def make_chunk_iterator(self, chunk_size=None, n_chunks=None, limits=None, buffer=None):
-        """ Create on iterator over the entire file traces in chunks.
+        """ Create an iterator over all traces in the file, in chunks.
 
-        Each chunk contains no more than `chunk_size` traces.
-        If `chunk_size` is not provided and `n_chunks` is given instead, there are no more than `n_chunks` chunks.
-        One and only one of `chunk_size` and `n_chunks` should be provided.
+        Each chunk contains no more than ``chunk_size`` traces.
+        If ``chunk_size`` is not provided and ``n_chunks`` is given instead, there are no more than ``n_chunks``
+        chunks. One and only one of ``chunk_size`` and ``n_chunks`` should be provided.
 
-        Each element in the iterator is a dictionary with `'data'`, `'start'` and `'end'` keys.
+        Each element in the iterator is a dictionary with ``'data'``, ``'start'`` and ``'end'`` keys.
 
         Parameters
         ----------
@@ -288,19 +303,20 @@ def make_chunk_iterator(self, chunk_size=None, n_chunks=None, limits=None, buffe
         n_chunks : int, optional
             Maximum number of chunks.
         limits : sequence of ints, slice, optional
-            Slice of the data along the depth (last) axis. Passed directly to :meth:`load_traces`.
-        buffer : np.ndarray, optional
-            Buffer to read the data into. If possible, avoids copies. Passed directly to :meth:`load_traces`.
+            Slice of the data along the depth (last) axis. Passed directly to :meth:`.load_traces`.
+        buffer : numpy.ndarray, optional
+            Buffer to read the data into. If possible, avoids copies. Passed directly to :meth:`.load_traces`.
 
-        Returns
-        -------
+        Return
+        ------
         iterator, info : tuple with two elements
             iterator : iterable
                 An iterator over all of the SEG-Y traces.
-                Each element in the iterator is a dictionary with `'data'`, `'start'` and `'end'` keys.
+                Each element in the iterator is a dictionary with ``'data'``, ``'start'`` and ``'end'`` keys.
             info : dict
-                Description of the iterator with `'chunk_size'`, `'n_chunks'`, `'chunk_starts'` and `'chunk_ends'` keys.
+                Description of the iterator with ``'chunk_size'``, ``'n_chunks'``, ``'chunk_starts'`` and
+                ``'chunk_ends'`` keys.
         """
         # Parse input parameters
         if chunk_size is None and n_chunks is None:
@@ -333,7 +349,7 @@ def make_chunk_iterator(self, chunk_size=None, n_chunks=None, limits=None, buffe
         return iterator, info
 
     def chunk_iterator(self, chunk_size=None, n_chunks=None, limits=None, buffer=None):
-        """ A shorthand for :meth:`make_chunk_iterator` with no info returned. """
+        """ A shorthand for :meth:`.make_chunk_iterator` with no info returned. """
         return self.make_chunk_iterator(chunk_size=chunk_size, n_chunks=n_chunks,
                                         limits=limits, buffer=buffer)[0]
 
@@ -346,7 +362,7 @@ def __exit__(self, _, __, ___):
         self.file_handler.close()
 
     def __getstate__(self):
-        """ Create pickling state from `__dict__` by setting SEG-Y file handler to `None`. """
+        """ Create a pickling state from ``__dict__`` by setting the SEG-Y file handler to ``None``. """
         state = copy(self.__dict__)
         state["file_handler"] = None
         return state
@@ -360,9 +376,9 @@ def __setstate__(self, state):
 
 class SafeSegyioLoader(SegyioLoader):
-    """ A thin wrapper around `segyio` library for convenient loading of headers and traces.
+    """ A thin wrapper around the **segyio** library for convenient loading of headers and traces.
 
-    Unlike :class:`SegyioLoader`, uses only public APIs to load traces.
+    Unlike :class:`.SegyioLoader`, uses only public APIs to load traces.
 
     Used mainly for performance measurements.
     """
diff --git a/segfast/trace_header_spec.py b/segfast/trace_header_spec.py
index d36adf8..f0e5abc 100644
--- a/segfast/trace_header_spec.py
+++ b/segfast/trace_header_spec.py
@@ -4,29 +4,46 @@
 import segyio
 
 class TraceHeaderSpec:
-    """ Trace header class to store its name and byte position. By default, byte position is defined by name
-    accordingly to SEG-Y specification.
+    """ Trace header class to store its name, byte position and dtype (including endianness). By default, the byte
+    position is defined by the name according to the SEG-Y specification.
 
     Parameters
     ----------
     name : str
         Name of the header.
     start_byte : int, optional
-        Byte position of the header, by default None. If None, default byte position from the spec will be used.
+        Byte position of the header, by default ``None``. If ``None``, the default byte position from the spec
+        will be used.
     dtype : int, str or dtype, optional
-        dtype for header (e.g. 'i2', '>f4', `np.float32`) or its length in bytes (then is interpreted as integer type).
+        dtype for the header (e.g. ``'i2'``, ``'>f4'``, ``numpy.float32``) or its length in bytes (in which case it
+        is interpreted as an integer type).
     byteorder : '>' or '<', optional
-        Endianness to use, if it's not defined by dtype. If None and dtype doesn't specify, architecture default
-        will be used.
+        Endianness to use, if it's not defined by ``dtype``. If ``None`` and ``dtype`` doesn't specify it, the
+        architecture default will be used.
     """
     TRACE_HEADER_SIZE = 240
 
     STANDARD_HEADER_TO_BYTE = segyio.tracefield.keys
+    """ Mapping from standard header name to its start byte.
+
+    :meta hide-value:
+    """
     STANDARD_BYTE_TO_HEADER = {v: k for k, v in STANDARD_HEADER_TO_BYTE.items()}
+    """ Mapping from start byte to header name according to the standard.
+
+    :meta hide-value:
+    """
     START_BYTES = sorted(STANDARD_HEADER_TO_BYTE.values())
+    """ List of byte positions for standard headers.
+
+    :meta hide-value:
+    """
     STANDARD_BYTE_TO_LEN = {start: end - start
                             for start, end in zip(START_BYTES, START_BYTES[1:] + [TRACE_HEADER_SIZE + 1])}
+    """ Mapping from start byte to header length in bytes according to the standard.
+
+    :meta hide-value:
+    """
 
     def __init__(self, name=None, start_byte=None, dtype=None, byteorder=None):
         self.name = name or self.STANDARD_BYTE_TO_HEADER[start_byte]
@@ -64,7 +81,7 @@ def has_standard_location(self):
 
     @property
     def standard_name(self):
-        """ The name from specification for header (if 'has_standard_location' is True). """
+        """ The name from the specification for the header (if ``has_standard_location`` is ``True``). """
         if not self.has_standard_location:
             raise ValueError("The header has non-standard start byte or dtype")
         return self.STANDARD_BYTE_TO_HEADER[self.start_byte]
@@ -100,7 +117,7 @@ def __hash__(self):
         return hash(self._spec_params)
 
     def set_default_byteorder(self, byteorder):
-        """ Set byteorder to use as default (if not specified by dtype). """
+        """ Set the byteorder to use as a default, if not specified by ``dtype``. """
         dtype = self.dtype.str
         if not self.has_explicit_byteorder:
             dtype = dtype[1:]
diff --git a/segfast/utils.py b/segfast/utils.py
index 3ebc8f3..f7699f8 100644
--- a/segfast/utils.py
+++ b/segfast/utils.py
@@ -49,8 +49,9 @@ def update(self, n=1):
 
 class ForPoolExecutor(Executor):
     """ A sequential executor of tasks in a for loop.
-    Inherits `Executor` interface, so can serve as a drop-in replacement for either
-    `ThreadPoolExecutor` or `ProcessPoolExecutor` when threads or processes spawning is undesirable.
+    Inherits the :class:`concurrent.futures.Executor` interface, so it can serve as a drop-in replacement for either
+    :class:`concurrent.futures.ThreadPoolExecutor` or :class:`concurrent.futures.ProcessPoolExecutor` when spawning
+    threads or processes is undesirable.
     """
 
     def __init__(self, *args, **kwargs):