diff --git a/src/allocator.f90 b/src/allocator.f90 index 698beda79..8a204f0ae 100644 --- a/src/allocator.f90 +++ b/src/allocator.f90 @@ -1,4 +1,11 @@ module m_allocator + !! Memory allocator module for managing field data blocks. + !! + !! This module provides an allocator type that manages a pool of memory blocks + !! (`field_t` objects) organised in a linked list. The allocator supports efficient + !! memory reuse by allowing blocks to be requested and released, minimizing + !! allocation/deallocation overhead during simulations. + use iso_fortran_env, only: stderr => error_unit use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C, NULL_LOC @@ -34,16 +41,18 @@ module m_allocator !! [[m_allocator(module):release_block(subroutine)]]. The !! released block is then pushed in front of the block list. - integer :: ngrid, sz - !> The id for the next allocated block. This counter is - !> incremented each time a new block is allocated. + integer :: ngrid !! Total number of grid points per block + integer :: sz !! Block size for data reordering + !> The ID for the next allocated block. This counter is + !! incremented each time a new block is allocated. integer :: next_id = 0 - !> padded dimensions and n_groups in all 'dir's + !> Padded dimensions in all directions [3 dims x 4 directions]. + !! Dimensions are padded based on block size for efficient reordering. integer, private :: dims_padded_dir(3, 4) + !> Number of groups for reordering in each direction [x, y, z]. integer, private :: n_groups_dir(3) - !> The pointer to the first block on the list. Non associated if - !> the list is empty - ! TODO: Rename first to head + !> Pointer to the first block on the linked list. Non-associated if + !! the list is empty. 
(TODO: Rename first to head) class(field_t), pointer :: first => null() contains procedure :: get_block @@ -62,8 +71,14 @@ module m_allocator contains function allocator_init(dims, sz) result(allocator) - integer, intent(in) :: dims(3), sz - type(allocator_t) :: allocator + !! Initialise an allocator for the given grid dimensions and block size. + !! + !! Creates a new allocator configured for the specified grid dimensions + !! with the given block size. Computes padded dimensions and number of + !! groups for efficient data reordering operations. + integer, intent(in) :: dims(3) !! Grid dimensions [nx, ny, nz] + integer, intent(in) :: sz !! Block size for reordering + type(allocator_t) :: allocator !! Initialised allocator integer :: nx, ny, nz, nx_padded, ny_padded, nz_padded @@ -205,21 +220,31 @@ function get_block_ids(self) end function get_block_ids function get_padded_dims(self, dir) result(dims) + !! Get padded dimensions for a specific direction. + !! + !! Returns the padded dimensions used for memory allocation in the + !! specified direction. Padding is applied to ensure efficient memory + !! access patterns and alignment. implicit none - class(allocator_t), intent(inout) :: self - integer, intent(in) :: dir - integer :: dims(3) + class(allocator_t), intent(inout) :: self !! Allocator object + integer, intent(in) :: dir !! Direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer :: dims(3) !! Padded dimensions [nx_pad, ny_pad, nz_pad] dims = self%dims_padded_dir(1:3, dir) end function get_padded_dims function get_n_groups(self, dir) result(n_groups) + !! Get number of groups for data reordering in a direction. + !! + !! Returns the number of groups used for data reordering operations + !! in the specified direction. Groups are determined by the block size + !! and grid dimensions. implicit none - class(allocator_t), intent(inout) :: self - integer, intent(in) :: dir - integer :: n_groups + class(allocator_t), intent(inout) :: self !! 
Allocator object + integer, intent(in) :: dir !! Direction (DIR_X, DIR_Y, or DIR_Z) + integer :: n_groups !! Number of groups n_groups = self%n_groups_dir(dir) end function get_n_groups diff --git a/src/backend/backend.f90 b/src/backend/backend.f90 index 4c10d2c74..5c73da7ba 100644 --- a/src/backend/backend.f90 +++ b/src/backend/backend.f90 @@ -1,4 +1,47 @@ module m_base_backend + !! Abstract base backend defining the computational interface for X3D2 solver. + !! + !! This module defines the `base_backend_t` abstract type, which establishes + !! the interface for all backend implementations (CUDA GPU, OpenMP CPU, etc.). + !! The solver operates exclusively through these abstract interfaces, enabling + !! complete architecture independence. + !! + !! **Architecture Pattern:** + !! + !! The backend abstraction follows the Strategy design pattern: + !! + !! - **Abstract interface** (`base_backend_t`): Defines deferred procedures for + !! all computational operations required by the solver + !! - **Concrete implementations**: CUDA backend (`m_cuda_backend`) and OMP + !! backend (`m_omp_backend`) extend this base and provide architecture-specific + !! implementations + !! - **Solver independence**: The solver (`m_solver`) calls backend methods + !! through the abstract interface without knowing the underlying implementation + !! + !! **Key Operations Defined:** + !! + !! - **Transport equation derivatives**: `transeq_x`, `transeq_y`, `transeq_z` + !! compute directional derivatives with halo exchange for distributed compact schemes + !! - **Tridiagonal solves**: `tds_solve` applies compact finite difference operators + !! - **Data reordering**: `reorder` transforms data between pencil decomposition + !! orientations (X, Y, Z directions) + !! - **Field operations**: Vector arithmetic (`veccopy`, `vecadd`, `vecmult`), + !! reductions (`scalar_product`, `field_volume_integral`), and utilities + !! (`field_scale`, `field_shift`, `field_set_face`) + !! 
- **Summation**: `sum_yintox`, `sum_zintox` for integrating fields along + !! specific directions + !! + !! **Backend Implementations:** + !! + !! - **CUDA backend** (`src/backend/cuda/backend.f90`): GPU-accelerated using + !! NVIDIA CUDA with device memory management and kernel launches + !! - **OMP backend** (`src/backend/omp/backend.f90`): CPU parallelism via + !! OpenMP threading and MPI domain decomposition + !! + !! **Usage:** + !! + !! Backends are instantiated at runtime based on compile-time configuration and + !! passed to the solver as a polymorphic pointer (`class(base_backend_t), pointer`). use mpi use m_allocator, only: allocator_t @@ -11,19 +54,37 @@ module m_base_backend implicit none type, abstract :: base_backend_t - !! base_backend class defines all the abstract operations that the - !! solver class requires. + !! Abstract base type defining the computational backend interface. !! - !! For example, transport equation in solver class evaluates the - !! derivatives in x, y, and z directions, and reorders the input - !! fields as required. Then finally, combines all the directional - !! derivatives to obtain the divergence of U*. + !! This type encapsulates all architecture-specific operations required + !! by the solver, enabling transparent execution on different hardware + !! platforms (GPU via CUDA, CPU via OpenMP) without modifying solver code. !! - !! All these high level operations solver class executes are - !! defined here using the abstract interfaces. Every backend - !! implementation extends the present abstact backend class to - !! define the specifics of these operations based on the target - !! architecture. + !! **Design Philosophy:** + !! + !! The solver executes high-level operations (compute transport equation, + !! solve tridiagonal systems, reorder data, etc.) through deferred procedures + !! defined in this abstract interface. Each backend (CUDA, OMP) extends this + !! 
type and implements these procedures using architecture-specific kernels, + !! libraries, and memory management strategies. + !! + !! **Example Workflow:** + !! + !! When computing the transport equation, the solver calls: + !! + !! 1. `transeq_x`, `transeq_y`, `transeq_z` to compute directional derivatives + !! 2. `reorder` to transform data between pencil orientations + !! 3. `vecadd` to combine derivatives into divergence of \(U^*\) + !! + !! Each call dispatches to the appropriate backend implementation at runtime + !! via dynamic polymorphism. + !! + !! **Components:** + !! + !! - `n_halo`: Number of halo layers for distributed compact schemes (fixed at 4) + !! - `mesh`: Pointer to mesh object (grid dimensions, boundary conditions, decomposition) + !! - `allocator`: Memory allocator for field storage (host for OMP, device for CUDA) + !! - `poisson_fft`: FFT-based Poisson solver for pressure correction !> DistD2 implementation is hardcoded for 4 halo layers for all backends integer :: n_halo = 4 @@ -59,11 +120,35 @@ module m_base_backend abstract interface subroutine transeq_ders(self, du, dv, dw, u, v, w, nu, dirps) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to transeq_ders into - !! the correct algorithm. + !! Compute transport equation derivatives for velocity components. + !! + !! This is the core computational kernel for the transport equation, + !! computing the advection-diffusion terms in one coordinate direction: + !! + !! \[ + !! \frac{\partial u_i}{\partial t} = -u \frac{\partial u_i}{\partial x} + !! - v \frac{\partial u_i}{\partial y} + !! - w \frac{\partial u_i}{\partial z} + !! + \nu \nabla^2 u_i + !! \] + !! + !! (this routine evaluates the advection and diffusion terms for the single + !! direction \(x_j\) selected by `dirps`). + !! + !! **Runtime algorithm selection:** + !! + !! 
The exact algorithm used to obtain the derivatives is decided at runtime + !! by the backend implementation. Backend implementations are responsible + !! for directing calls to the appropriate algorithm based on: + !! + !! - Operator configuration in `dirps` (distributed vs local compact schemes) + !! - Domain decomposition (number of processes in current direction) + !! - Boundary conditions (periodic vs non-periodic) + !! + !! The implementation routes to either: + !! + !! - **Distributed algorithm** (`exec_dist_transeq_3fused`): For distributed + !! compact schemes with MPI halo exchange + !! - **Thomas algorithm** (`exec_thom_transeq`): For localized/periodic operators import :: base_backend_t import :: field_t import :: dirps_t @@ -71,20 +156,34 @@ subroutine transeq_ders(self, du, dv, dw, u, v, w, nu, dirps) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Derivative outputs (momentum equation RHS) + class(field_t), intent(in) :: u, v, w !! Velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional derivative operators end subroutine transeq_ders end interface abstract interface subroutine transeq_ders_spec(self, dspec, uvw, spec, nu, dirps, sync) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to transeq_ders into - !! the correct algorithm. + !! Compute transport equation derivatives for passive scalar species. + !! + !! Similar to `transeq_ders` but for passive scalar transport: + !! + !! \[ + !! \frac{\partial \phi}{\partial t} = -u \frac{\partial \phi}{\partial x} + !! - v \frac{\partial \phi}{\partial y} + !! 
- w \frac{\partial \phi}{\partial z} + !! + \nu \nabla^2 \phi + !! \] + !! + !! where \(\phi\) is the scalar concentration; this routine evaluates the + !! terms for the single direction \(x_j\) specified by `dirps`. + !! + !! **Synchronization:** + !! + !! The `sync` flag controls whether to synchronize device-to-host memory + !! transfers (CUDA backend) after computation. Set `.false.` when chaining + !! multiple operations to avoid unnecessary transfers. import :: base_backend_t import :: field_t import :: dirps_t @@ -92,144 +191,275 @@ subroutine transeq_ders_spec(self, dspec, uvw, spec, nu, dirps, sync) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: dspec - class(field_t), intent(in) :: uvw, spec - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - logical, intent(in) :: sync + class(field_t), intent(inout) :: dspec !! Scalar derivative output + class(field_t), intent(in) :: uvw !! Velocity component in current direction + class(field_t), intent(in) :: spec !! Scalar species concentration + real(dp), intent(in) :: nu !! Diffusion coefficient + type(dirps_t), intent(in) :: dirps !! Directional derivative operators + logical, intent(in) :: sync !! Synchronize device transfers (CUDA only) end subroutine transeq_ders_spec end interface abstract interface subroutine tds_solve(self, du, u, tdsops) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to tds_solve to the - !! correct algorithm. + !! Apply a tridiagonal operator to a field (compact finite difference operation). + !! + !! Solves the tridiagonal system arising from compact finite difference + !! schemes: + !! + !! \[ + !! A f' = B f + !! \] + !! + !! where \(A\) is the implicit (tridiagonal) operator, \(B\) is the explicit + !! stencil, and \(f'\) is the derivative (or interpolated value). + !! + !! 
**Backend dispatch:** + !! + !! Routes to the appropriate tridiagonal solver: + !! + !! - **Distributed compact**: Uses `exec_dist_tds_compact` with MPI communication + !! for boundary coupling between processes + !! - **Thomas algorithm**: Uses `exec_thom_tds_compact` for local/periodic systems + !! - **GPU**: Uses batched tridiagonal solvers (cuSPARSE or custom kernels) + !! + !! **Operations supported:** + !! + !! First derivative, second derivative, interpolation, staggered derivatives + !! (configured in `tdsops`). import :: base_backend_t import :: field_t import :: tdsops_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(field_t), intent(inout) :: du !! Output field (derivative or interpolated values) + class(field_t), intent(in) :: u !! Input field + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator (preprocessed) end subroutine tds_solve end interface abstract interface subroutine reorder(self, u_, u, direction) - !! reorder subroutines are straightforward, they rearrange - !! data into our specialist data structure so that regardless - !! of the direction tridiagonal systems are solved efficiently - !! and fast. + !! Reorder field data between pencil decomposition orientations. + !! + !! Transforms field layout from one pencil orientation to another to enable + !! efficient tridiagonal solves in different coordinate directions: + !! + !! - **DIR_X**: X-pencils (data contiguous in X, decomposed in Y-Z) + !! - **DIR_Y**: Y-pencils (data contiguous in Y, decomposed in X-Z) + !! - **DIR_Z**: Z-pencils (data contiguous in Z, decomposed in X-Y) + !! - **DIR_C**: Special compact orientation + !! + !! The `direction` parameter specifies the target orientation using reorder + !! constants (`RDR_X2Y`, `RDR_Y2Z`, etc.). + !! + !! **Backend implementation:** + !! + !! - **CUDA**: GPU transpose kernels with coalesced memory access + !! 
- **OMP**: MPI all-to-all communication with OpenMP threading + !! + !! **Performance note:** This is a bandwidth-intensive operation requiring + !! global data movement (MPI or device memory transfers). import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - integer, intent(in) :: direction + class(field_t), intent(inout) :: u_ !! Output field (reordered) + class(field_t), intent(in) :: u !! Input field + integer, intent(in) :: direction !! Reorder direction (RDR_X2Y, RDR_Y2Z, etc.) end subroutine reorder end interface abstract interface subroutine sum_intox(self, u, u_) - !! sum9into3 subroutine combines all the directional velocity - !! derivatives into the corresponding x directional fields. + !! Sum directional derivatives back into X-oriented fields. + !! + !! Combines derivative contributions computed in different pencil orientations + !! (Y-pencils, Z-pencils) back into the X-pencil orientation: + !! + !! \[ + !! u = u + u' + !! \] + !! + !! This operation accumulates terms when computing composite derivatives + !! like divergence: + !! + !! \[ + !! \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + !! + \frac{\partial v}{\partial y} + !! + \frac{\partial w}{\partial z} + !! \] + !! + !! Each directional derivative is computed in its respective pencil orientation, + !! then summed into X-pencils via `sum_yintox` and `sum_zintox`. + !! + !! **Note:** The input field `u_` must be in a Y or Z pencil orientation; + !! the output `u` is always in X-pencil orientation. import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u - class(field_t), intent(in) :: u_ + class(field_t), intent(inout) :: u !! Accumulated field (X-pencils, updated in-place) + class(field_t), intent(in) :: u_ !! 
Contribution to add (Y or Z pencils) end subroutine sum_intox end interface abstract interface subroutine veccopy(self, dst, src) - !! copy vectors: y = x + !! Copy one field to another: `dst = src`. + !! + !! Performs an element-wise copy of all field data from `src` to `dst`. + !! Both fields must have compatible dimensions and memory layout. + !! + !! **Backend implementation:** + !! + !! - **CUDA**: Device-to-device memory copy (cudaMemcpy) + !! - **OMP**: Host memory copy (array assignment or memcpy) + !! + !! **Note:** This is a deep copy operation; the fields remain independent + !! after the copy. import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: dst - class(field_t), intent(in) :: src + class(field_t), intent(inout) :: dst !! Destination field + class(field_t), intent(in) :: src !! Source field end subroutine veccopy end interface abstract interface subroutine vecadd(self, a, x, b, y) - !! adds two vectors together: y = a*x + b*y + !! Compute linear combination of two fields (AXPBY operation). + !! + !! Performs the vector operation: \(y = a \cdot x + b \cdot y\) + !! + !! This is equivalent to the BLAS AXPBY operation, computing a scaled + !! sum of two vectors. The result is stored in-place in `y`. + !! + !! **Common use cases:** + !! + !! - **Vector addition**: `vecadd(self, 1.0_dp, x, 1.0_dp, y)` \(\rightarrow\) \(y = x + y\) + !! - **Scaled addition**: `vecadd(self, alpha, x, 1.0_dp, y)` \(\rightarrow\) \(y = \alpha x + y\) + !! - **Replacement**: `vecadd(self, 1.0_dp, x, 0.0_dp, y)` \(\rightarrow\) \(y = x\) import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - real(dp), intent(in) :: a - class(field_t), intent(in) :: x - real(dp), intent(in) :: b - class(field_t), intent(inout) :: y + real(dp), intent(in) :: a !! Scaling factor for x + class(field_t), intent(in) :: x !! Input field + real(dp), intent(in) :: b !! 
Scaling factor for y + class(field_t), intent(inout) :: y !! Input/output field (modified in-place) end subroutine vecadd end interface abstract interface subroutine vecmult(self, y, x) - !! pointwise multiplication between two vectors: y(:) = y(:) * x(:) + !! Element-wise (pointwise) multiplication of two fields. + !! + !! Performs the element-wise product: \(y = y \odot x\) + !! + !! Each element of `y` is multiplied by the corresponding element of `x`. + !! The result is stored in-place in `y`. This is also known as the + !! Hadamard product or pointwise multiplication. import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: y - class(field_t), intent(in) :: x + class(field_t), intent(inout) :: y !! Input/output field (modified in-place) + class(field_t), intent(in) :: x !! Multiplier field end subroutine vecmult end interface abstract interface real(dp) function scalar_product(self, x, y) result(s) - !! Calculates the scalar product of two input fields + !! Compute the global scalar (dot) product of two fields. + !! + !! Calculates: \(s = \sum_{i} x_i \cdot y_i\) + !! + !! This computes the inner product (dot product) of two fields across + !! all grid points. For distributed memory systems (MPI), partial sums + !! from each process are accumulated via MPI reduction to produce the + !! global sum. + !! + !! **Note:** The result includes contributions from all MPI ranks. import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: x, y + class(field_t), intent(in) :: x !! First field + class(field_t), intent(in) :: y !! Second field end function scalar_product end interface abstract interface subroutine field_ops(self, f, a) - !! Scales or shifts a field by a + !! Generic interface for in-place field operations with a scalar constant. + !! + !! This abstract interface is implemented by two operations: + !! + !! 
- **field_scale**: Multiply field by constant: \(f = a \cdot f\) + !! - **field_shift**: Add constant to field: \(f = f + a\) + !! + !! Both operations modify the field in-place and are backend-specific + !! (GPU kernels for CUDA, array operations for OMP). import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! Field to operate on (modified in-place) + real(dp), intent(in) :: a !! Scalar constant (scaling factor or shift amount) end subroutine field_ops end interface abstract interface real(dp) function field_reduce(self, f) result(s) - !! Reduces field to a scalar, example: volume integral + !! Reduce a field to a single scalar value via global summation. + !! + !! This abstract interface is currently implemented by: + !! + !! - **field_volume_integral**: Computes the volume integral \(\int f \,dV\) + !! + !! **Algorithm:** + !! + !! 1. **Local summation**: Each MPI process sums its local field values + !! (optionally weighted by cell volumes for volume integration) + !! 2. **Global reduction**: MPI_Allreduce combines partial sums from all + !! processes to produce the global result + !! + !! **Backend implementations:** + !! + !! - **CUDA**: GPU reduction kernel followed by MPI_Allreduce + !! - **OMP**: OpenMP parallel reduction followed by MPI_Allreduce + !! + !! **Requirements:** + !! + !! - Field must have `data_loc` set (cannot be `NULL_LOC`) + !! - Field must be in X-pencil orientation (`dir = DIR_X`) + !! + !! **Use cases:** + !! + !! - Volume integrals for conservation checks + !! - Global norms (L1, L2) for convergence monitoring + !! - Total mass/energy calculations import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! 
Field to reduce end function field_reduce end interface @@ -255,7 +485,7 @@ subroutine field_set_face(self, f, c_start, c_end, face) !! or a global domain boundary based on the location of the subdomain. !! This subroutine allows us to set any of these faces to a value, !! 'c_start' and 'c_end' for faces at opposite sides. - !! 'face' is one of X_FACE, Y_FACE, Z_FACE from common.f90 + !! 'face' is one of `X_FACE`, `Y_FACE`, `Z_FACE` from `common.f90` import :: base_backend_t import :: dp import :: field_t @@ -298,6 +528,39 @@ end subroutine copy_f_to_data abstract interface subroutine alloc_tdsops( & + !! Allocate and initialise a backend-specific tridiagonal operator. + !! + !! This deferred procedure creates a `tdsops_t` object configured for + !! compact finite difference operations (derivatives, interpolation, etc.). + !! The backend implementation allocates the appropriate subtype: + !! + !! - **CUDA backend**: Allocates `cuda_tdsops_t` with device memory pointers + !! for GPU execution + !! - **OMP backend**: Allocates `omp_tdsops_t` with host memory for CPU execution + !! + !! The operator is fully preprocessed and ready for repeated application via + !! `tds_solve`. + !! + !! **Required arguments:** + !! + !! - `n_tds`: System size (number of grid points in the operator direction) + !! - `delta`: Grid spacing + !! - `operation`: Operation type (`'first-deriv'`, `'second-deriv'`, + !! `'interpolate'`, `'stag-deriv'`) + !! - `scheme`: Numerical scheme name (e.g., `'compact6'`, `'compact4'`) + !! - `bc_start`, `bc_end`: Boundary condition flags (`BC_PERIODIC`, + !! `BC_NEUMANN`, `BC_DIRICHLET`) + !! + !! **Optional arguments:** + !! + !! - `stretch`: Stretching coefficients for non-uniform grids + !! - `stretch_correct`: Correction for second derivatives on stretched grids + !! - `n_halo`: Number of halo layers (default from backend) + !! - `from_to`: Staggered grid direction (`'v2p'`, `'p2v'`) + !! 
- `sym`: Field symmetry at Neumann boundaries (`.true.` = symmetric/even, + !! `.false.` = anti-symmetric/odd) + !! - `c_nu`, `nu0_nu`: Hyperviscosity parameters for compact6-hyperviscous + !! second derivatives self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) @@ -322,6 +585,37 @@ end subroutine alloc_tdsops abstract interface subroutine init_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise the backend-specific FFT-based Poisson solver. + !! + !! This deferred procedure creates and configures the Poisson solver object + !! (`self%poisson_fft`) for solving the pressure Poisson equation: + !! \(\nabla^2 \phi = f\) + !! + !! The backend implementation allocates the appropriate solver subtype: + !! + !! - **CUDA backend**: Allocates `cuda_poisson_fft_t` using cuFFT library + !! for GPU-accelerated FFT transforms + !! - **OMP backend**: Allocates `omp_poisson_fft_t` using 2DECOMP&FFT library + !! for CPU FFT transforms with MPI parallelisation + !! + !! The solver requires directional derivative operators (`xdirps`, `ydirps`, + !! `zdirps`) to construct spectral equivalence constants for handling: + !! + !! - Non-uniform grid spacing (stretching) in the Y-direction + !! - Mixed boundary conditions (e.g., periodic in X/Z, Dirichlet in Y) + !! + !! **Arguments:** + !! + !! - `mesh`: Mesh object containing grid dimensions, boundary conditions, + !! and parallel decomposition information + !! - `xdirps`, `ydirps`, `zdirps`: Second-derivative operators in each direction, + !! used to compute spectral equivalence constants for the modified wavenumbers + !! - `lowmem` (optional): Low-memory mode flag. When `.true.`, reduces memory + !! footprint by deallocating temporary arrays after initialisation (CUDA only) + !! + !! **Note:** The Poisson solver is stored in `self%poisson_fft` and accessed + !! 
by the solver during the pressure correction step of the fractional-step + !! method. import :: base_backend_t import :: dirps_t import :: mesh_t diff --git a/src/backend/cuda/allocator.f90 b/src/backend/cuda/allocator.f90 index 1d21de6e3..6cab14145 100644 --- a/src/backend/cuda/allocator.f90 +++ b/src/backend/cuda/allocator.f90 @@ -1,4 +1,18 @@ module m_cuda_allocator + !! GPU memory allocator for CUDA backend. + !! + !! GPU memory (device memory) is physically separate from CPU memory (host). + !! This allocator manages device-side storage, ensuring field data resides + !! in GPU memory for kernel execution. Explicit device allocation avoids + !! expensive implicit host-device transfers that would kill performance. + !! + !! **Design rationale:** + !! + !! - `cuda_field_t` extends `field_t` with device pointers (`p_data_d`, `data_d`) + !! - Maintains both 1D and 3D views of same memory for flexibility + !! - Reference counting prevents premature deallocation + !! - Block-based allocation reduces allocation overhead + !! use m_allocator, only: allocator_t use m_common, only: dp use m_field, only: field_t @@ -7,8 +21,9 @@ module m_cuda_allocator implicit none type, extends(allocator_t) :: cuda_allocator_t + !! GPU memory allocator extending base allocator contains - procedure :: create_block => create_cuda_block + procedure :: create_block => create_cuda_block !! Allocate GPU field block end type cuda_allocator_t interface cuda_allocator_t @@ -16,12 +31,13 @@ module m_cuda_allocator end interface cuda_allocator_t type, extends(field_t) :: cuda_field_t - real(dp), device, pointer, private :: p_data_d(:) - real(dp), device, pointer, contiguous :: data_d(:, :, :) + !! Field residing in GPU device memory + real(dp), device, pointer, private :: p_data_d(:) !! 1D device memory pointer (raw allocation) + real(dp), device, pointer, contiguous :: data_d(:, :, :) !! 
3D device view (for kernel access) contains - procedure :: fill => fill_cuda - procedure :: get_shape => get_shape_cuda - procedure :: set_shape => set_shape_cuda + procedure :: fill => fill_cuda !! Fill with constant value + procedure :: get_shape => get_shape_cuda !! Query 3D dimensions + procedure :: set_shape => set_shape_cuda !! Reshape 3D view end type cuda_field_t interface cuda_field_t @@ -31,9 +47,15 @@ module m_cuda_allocator contains function cuda_field_init(ngrid, next, id) result(f) - integer, intent(in) :: ngrid, id - type(cuda_field_t), pointer, intent(in) :: next - type(cuda_field_t) :: f + !! Initialise GPU field with device memory allocation. + !! + !! Device memory must be explicitly allocated before use. This constructor + !! allocates the 1D device array and sets up metadata for later reshaping + !! to 3D when dimensions are known. + integer, intent(in) :: ngrid !! Total number of grid points + integer, intent(in) :: id !! Unique field identifier + type(cuda_field_t), pointer, intent(in) :: next !! Next field in linked list + type(cuda_field_t) :: f !! Initialised field allocate (f%p_data_d(ngrid)) f%refcount = 0 @@ -42,47 +64,74 @@ function cuda_field_init(ngrid, next, id) result(f) end function cuda_field_init subroutine fill_cuda(self, c) + !! Fill entire field with constant value on GPU. + !! + !! Initialising fields directly on GPU avoids transferring initialisation + !! data from host. Single assignment to device array leverages GPU's + !! memory controllers for efficient broadcast to all elements. implicit none - class(cuda_field_t) :: self - real(dp), intent(in) :: c + class(cuda_field_t) :: self !! Field to fill + real(dp), intent(in) :: c !! Constant value self%p_data_d = c end subroutine fill_cuda function get_shape_cuda(self) result(dims) + !! Query current 3D dimensions of field. + !! + !! Fields are allocated with total size but reshaped dynamically based + !! on decomposition. This query enables algorithms to adapt to actual + !! 
current dimensions without hard-coding sizes. implicit none - class(cuda_field_t) :: self - integer :: dims(3) + class(cuda_field_t) :: self !! Field to query + integer :: dims(3) !! Current dimensions dims = shape(self%data_d) end function get_shape_cuda subroutine set_shape_cuda(self, dims) + !! Reshape 3D view of device memory. + !! + !! Same 1D device allocation is reused for different pencil orientations + !! (X-pencils, Y-pencils, Z-pencils). Reshaping avoids reallocating GPU + !! memory, which is expensive. Fortran pointer remapping is essentially + !! free, just changing metadata not data. implicit none - class(cuda_field_t) :: self - integer, intent(in) :: dims(3) + class(cuda_field_t) :: self !! Field to reshape + integer, intent(in) :: dims(3) !! New dimensions self%data_d(1:dims(1), 1:dims(2), 1:dims(3)) => self%p_data_d end subroutine set_shape_cuda function cuda_allocator_init(dims, sz) result(allocator) - integer, intent(in) :: dims(3), sz - type(cuda_allocator_t) :: allocator + !! Initialise CUDA allocator with grid dimensions. + !! + !! Base allocator handles dimension calculations and block management + !! logic. CUDA allocator only needs to override block creation to use + !! device memory, avoiding code duplication. + integer, intent(in) :: dims(3) !! Grid dimensions + integer, intent(in) :: sz !! Pencil size (SZ) + type(cuda_allocator_t) :: allocator !! Initialised allocator allocator%allocator_t = allocator_t(dims, sz) end function cuda_allocator_init function create_cuda_block(self, next) result(ptr) - class(cuda_allocator_t), intent(inout) :: self - type(cuda_field_t), pointer, intent(in) :: next - type(cuda_field_t), pointer :: newblock - class(field_t), pointer :: ptr + !! Create new field block in GPU memory. + !! + !! Central allocation point ensures consistent initialisation and enables + !! tracking (via IDs) for debugging memory issues. Returning base class + !! pointer maintains polymorphism for generic algorithm code. 
+ class(cuda_allocator_t), intent(inout) :: self !! Allocator instance + type(cuda_field_t), pointer, intent(in) :: next !! Next in linked list + type(cuda_field_t), pointer :: newblock !! Newly allocated block + class(field_t), pointer :: ptr !! Polymorphic return pointer allocate (newblock) self%next_id = self%next_id + 1 newblock = cuda_field_t(self%ngrid, next, id=self%next_id) diff --git a/src/backend/cuda/backend.f90 b/src/backend/cuda/backend.f90 index 8efa5c041..94e09a59f 100644 --- a/src/backend/cuda/backend.f90 +++ b/src/backend/cuda/backend.f90 @@ -1,4 +1,14 @@ module m_cuda_backend + !! CUDA backend implementing GPU-accelerated solver operations. + !! + !! Extends `base_backend_t` with GPU kernel launches and device memory + !! management. Transport equations, tridiagonal solves, FFT operations, + !! and field manipulations execute on GPU. + !! + !! **MPI Communication:** Halo exchange passes device pointers directly to + !! MPI calls. With GPU-aware MPI implementations (OpenMPI with CUDA support, + !! MVAPICH2-GDR), data transfers directly between GPU memories. Without + !! GPU-aware MPI, the implementation stages through host memory automatically. use iso_fortran_env, only: stderr => error_unit use cudafor use mpi @@ -35,6 +45,10 @@ module m_cuda_backend private :: transeq_halo_exchange, transeq_dist_component type, extends(base_backend_t) :: cuda_backend_t + !! GPU backend with device communication buffers and kernel configurations. + !! + !! Extends [[m_base_backend(module):base_backend_t(type)]] with CUDA-specific + !! implementations and device memory buffers for halo exchange. !character(len=*), parameter :: name = 'cuda' real(dp), device, allocatable, dimension(:, :, :) :: & u_recv_s_dev, u_recv_e_dev, u_send_s_dev, u_send_e_dev, & @@ -78,11 +92,16 @@ module m_cuda_backend contains function init(mesh, allocator) result(backend) + !! Initialise CUDA backend with kernel configurations and communication buffers. + !! + !! 
Sets up CUDA thread blocks ([[m_cuda_common(module):SZ(variable)]] threads per + !! warp-aligned block) and allocates device buffers for halo exchange. Buffer size + !! accommodates largest pencil direction to support all three orientations. implicit none - type(mesh_t), target, intent(inout) :: mesh - class(allocator_t), target, intent(inout) :: allocator - type(cuda_backend_t) :: backend + type(mesh_t), target, intent(inout) :: mesh !! Computational mesh + class(allocator_t), target, intent(inout) :: allocator !! GPU memory allocator + type(cuda_backend_t) :: backend !! Initialised CUDA backend type(cuda_poisson_fft_t) :: cuda_poisson_fft integer :: n_groups @@ -140,19 +159,25 @@ subroutine alloc_cuda_tdsops( & self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) + !! Allocate and initialise CUDA tridiagonal operators. + !! + !! Implements [[m_base_backend(module):alloc_tdsops(interface)]] for GPU. + !! Allocates [[m_cuda_tdsops(module):cuda_tdsops_t(type)]] with device-resident + !! coefficient arrays. implicit none class(cuda_backend_t) :: self - class(tdsops_t), allocatable, intent(inout) :: tdsops - integer, intent(in) :: n_tds - real(dp), intent(in) :: delta - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end - real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) - integer, optional, intent(in) :: n_halo - character(*), optional, intent(in) :: from_to - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(tdsops_t), allocatable, intent(inout) :: tdsops !! Output: allocated CUDA operators + integer, intent(in) :: n_tds !! Number of tridiagonal systems + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation !! Operation type (derivative/interpolation) + character(*), intent(in) :: scheme !! Scheme name + integer, intent(in) :: bc_start, bc_end !! 
Boundary condition flags + real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) !! Grid stretching factors + integer, optional, intent(in) :: n_halo !! Halo width for distributed schemes + character(*), optional, intent(in) :: from_to !! Interpolation direction + logical, optional, intent(in) :: sym !! Symmetry flag + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Viscosity parameters allocate (cuda_tdsops_t :: tdsops) @@ -166,13 +191,18 @@ subroutine alloc_cuda_tdsops( & end subroutine alloc_cuda_tdsops subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation in x-direction using CUDA. + !! + !! Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Routes to distributed or Thomas algorithm based on + !! [[m_tdsops(module):dirps_t(type)]] configuration. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators call self%transeq_cuda_dist(du, dv, dw, u, v, w, nu, dirps, & self%xblocks, self%xthreads) @@ -180,13 +210,17 @@ subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_x_cuda subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation in y-direction using CUDA. + !! + !! Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Arguments reordered (v, u, w) to match y-pencil orientation. 
implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators ! u, v, w is reordered so that we pass v, u, w call self%transeq_cuda_dist(dv, du, dw, v, u, w, nu, dirps, & @@ -195,13 +229,17 @@ subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_y_cuda subroutine transeq_z_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation in z-direction using CUDA. + !! + !! Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Arguments reordered (w, u, v) to match z-pencil orientation. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators ! u, v, w is reordered so that we pass w, u, v call self%transeq_cuda_dist(dw, du, dv, w, u, v, nu, dirps, & @@ -212,16 +250,19 @@ end subroutine transeq_z_cuda subroutine transeq_species_cuda(self, dspec, uvw, spec, nu, dirps, sync) !! Compute the convection and diffusion for the given field !! in the given direction. - !! Halo exchange for the given field is necessary - !! When sync is true, halo exchange of momentum is necessary + !! + !! Implements [[m_base_backend(module):transeq_ders_spec(interface)]]. + !! Halo exchange for the given field is necessary. + !! 
When sync is true, halo exchange of momentum is necessary. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: dspec - class(field_t), intent(in) :: uvw, spec - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - logical, intent(in) :: sync + class(field_t), intent(inout) :: dspec !! Output: RHS contribution for species + class(field_t), intent(in) :: uvw !! Input: velocity component in transport direction + class(field_t), intent(in) :: spec !! Input: species concentration field + real(dp), intent(in) :: nu !! Diffusivity (kinematic viscosity) + type(dirps_t), intent(in) :: dirps !! Directional operators + logical, intent(in) :: sync !! If true, also exchange momentum halos integer :: n_groups type(cuda_tdsops_t), pointer :: der1st, der1st_sym, der2nd, der2nd_sym @@ -282,14 +323,19 @@ end subroutine transeq_species_cuda subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, nu, dirps, & blocks, threads) + !! Compute transport equation using distributed compact scheme on GPU. + !! + !! Handles halo exchange with [[m_cuda_sendrecv(module):sendrecv_3fields(interface)]], + !! launches [[m_cuda_exec_dist(module):exec_dist_transeq_3fused(interface)]] kernel, + !! and gathers derivatives. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - type(dim3), intent(in) :: blocks, threads + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators + type(dim3), intent(in) :: blocks, threads !! 
CUDA kernel configuration real(dp), device, pointer, dimension(:, :, :) :: u_dev, v_dev, w_dev, & du_dev, dv_dev, dw_dev @@ -342,9 +388,13 @@ subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, nu, dirps, & end subroutine transeq_cuda_dist subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dir) + !! Exchange velocity field halos using MPI with device pointers. + !! + !! Packs boundary data into communication buffers and exchanges with + !! neighbouring ranks. Uses sendrecv_3fields for batched communication. class(cuda_backend_t) :: self - real(dp), device, dimension(:, :, :), intent(in) :: u_dev, v_dev, w_dev - integer, intent(in) :: dir + real(dp), device, dimension(:, :, :), intent(in) :: u_dev, v_dev, w_dev !! Velocity components on device + integer, intent(in) :: dir !! Direction for halo exchange integer :: n, nproc_dir, pprev, pnext integer :: n_groups @@ -376,20 +426,21 @@ subroutine transeq_dist_component(self, rhs_du_dev, u_dev, conv_dev, nu, & conv_recv_s_dev, conv_recv_e_dev, & tdsops_du, tdsops_dud, tdsops_d2u, & dir, blocks, threads) - !! Computes RHS_x^u following: + !! Compute transport equation RHS component using distributed compact schemes. !! - !! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2 + !! Computes: $\text{rhs} = -\frac{1}{2}(\text{conv} \frac{\partial u}{\partial x} + \frac{\partial (u \cdot \text{conv})}{\partial x}) + \nu \frac{\partial^2 u}{\partial x^2}$ class(cuda_backend_t) :: self - !> The result field, it is also used as temporary storage - real(dp), device, dimension(:, :, :), intent(out) :: rhs_du_dev - real(dp), device, dimension(:, :, :), intent(in) :: u_dev, conv_dev - real(dp), intent(in) :: nu + real(dp), device, dimension(:, :, :), intent(out) :: rhs_du_dev !! Output: transport equation RHS + real(dp), device, dimension(:, :, :), intent(in) :: u_dev !! Input: velocity component field + real(dp), device, dimension(:, :, :), intent(in) :: conv_dev !! 
Input: convecting velocity field + real(dp), intent(in) :: nu !! Kinematic viscosity real(dp), device, dimension(:, :, :), intent(in) :: & - u_recv_s_dev, u_recv_e_dev, & - conv_recv_s_dev, conv_recv_e_dev - class(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - integer, intent(in) :: dir - type(dim3), intent(in) :: blocks, threads + u_recv_s_dev, u_recv_e_dev !! Halo data for u from neighbours + real(dp), device, dimension(:, :, :), intent(in) :: & + conv_recv_s_dev, conv_recv_e_dev !! Halo data for conv from neighbours + class(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for derivatives + integer, intent(in) :: dir !! Direction index + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration class(field_t), pointer :: dud, d2u @@ -425,25 +476,31 @@ subroutine transeq_dist_component(self, rhs_du_dev, u_dev, conv_dev, nu, & end subroutine transeq_dist_component subroutine transeq_cuda_thom(self, du, dv, dw, u, v, w, dirps) - !! Thomas algorithm implementation. So much more easier than the - !! distributed algorithm. It is intended to work only on a single rank - !! so there is no MPI communication. + !! Compute transport equation using Thomas algorithm. + !! + !! Simpler than distributed scheme - no MPI communication, uses + !! [[m_cuda_exec_thom(module):exec_thom_tds_compact(interface)]] kernel. + !! Intended for single-rank execution only. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + type(dirps_t), intent(in) :: dirps !! Directional operators end subroutine transeq_cuda_thom subroutine tds_solve_cuda(self, du, u, tdsops) + !! Solve tridiagonal systems using CUDA kernels. + !! + !! 
Implements [[m_base_backend(module):tds_solve(interface)]]. + !! Dispatches to appropriate CUDA kernel based on pencil direction. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(field_t), intent(inout) :: du !! Output: solution + class(field_t), intent(in) :: u !! Input: RHS + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operators type(dim3) :: blocks, threads @@ -464,13 +521,17 @@ subroutine tds_solve_cuda(self, du, u, tdsops) end subroutine tds_solve_cuda subroutine tds_solve_dist(self, du, u, tdsops, blocks, threads) + !! Solve distributed tridiagonal systems using CUDA kernels and MPI. + !! + !! Performs forward sweep, exchanges boundary data via MPI (using device + !! pointers for potential GPU-aware MPI benefit), then backward substitution. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops - type(dim3), intent(in) :: blocks, threads + class(field_t), intent(inout) :: du !! Output: solution + class(field_t), intent(in) :: u !! Input: RHS + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operators + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration real(dp), device, pointer, dimension(:, :, :) :: du_dev, u_dev @@ -512,12 +573,16 @@ subroutine tds_solve_dist(self, du, u, tdsops, blocks, threads) end subroutine tds_solve_dist subroutine reorder_cuda(self, u_o, u_i, direction) + !! Reorder field data between pencil orientations using CUDA kernels. + !! + !! Implements [[m_base_backend(module):reorder(interface)]]. + !! Calls appropriate [[m_cuda_kernels_reorder(module)]] kernel based on direction. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_o - class(field_t), intent(in) :: u_i - integer, intent(in) :: direction + class(field_t), intent(inout) :: u_o !! 
Output: reordered field + class(field_t), intent(in) :: u_i !! Input: source field + integer, intent(in) :: direction !! Reordering direction (RDR_X2Y, RDR_Y2Z, etc) real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d, u_temp_d class(field_t), pointer :: u_temp @@ -632,9 +697,10 @@ end subroutine reorder_cuda subroutine sum_yintox_cuda(self, u, u_y) + !! Sum y-pencil field into x-pencil using CUDA kernel. implicit none class(cuda_backend_t) :: self + class(field_t), intent(inout) :: u !! Output: x-pencil result + class(field_t), intent(in) :: u_y !! Input: y-pencil field to sum - class(field_t), intent(inout) :: u - class(field_t), intent(in) :: u_y @@ -654,9 +722,10 @@ end subroutine sum_yintox_cuda subroutine sum_zintox_cuda(self, u, u_z) + !! Sum z-pencil field into x-pencil using CUDA kernel. implicit none class(cuda_backend_t) :: self + class(field_t), intent(inout) :: u !! Output: x-pencil result + class(field_t), intent(in) :: u_z !! Input: z-pencil field to sum - class(field_t), intent(inout) :: u - class(field_t), intent(in) :: u_z @@ -676,11 +747,15 @@ end subroutine sum_zintox_cuda subroutine veccopy_cuda(self, dst, src) + !! Copy field data using CUDA kernel. + !! + !! Implements [[m_base_backend(module):veccopy(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):buffer_copy(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: dst - class(field_t), intent(in) :: src + class(field_t), intent(inout) :: dst !! Output: destination field + class(field_t), intent(in) :: src !! Input: source field real(dp), device, pointer, dimension(:, :, :) :: dst_d, src_d type(dim3) :: blocks, threads @@ -697,10 +772,14 @@ end subroutine veccopy_cuda subroutine vecadd_cuda(self, a, x, b, y) + !!
Compute linear combination $y = ax + by$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):vecadd(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):axpby(interface)]] kernel. implicit none class(cuda_backend_t) :: self - real(dp), intent(in) :: a + real(dp), intent(in) :: a !! Scalar coefficient for x class(field_t), intent(in) :: x real(dp), intent(in) :: b class(field_t), intent(inout) :: y @@ -720,10 +799,13 @@ end subroutine vecadd_cuda subroutine vecmult_cuda(self, y, x) - !! [[m_base_backend(module):vecmult(interface)]] + !! Compute element-wise product $y = x \cdot y$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):vecmult(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):pwmul(interface)]] kernel. implicit none class(cuda_backend_t) :: self + class(field_t), intent(inout) :: y !! Input/Output: multiplied in-place + class(field_t), intent(in) :: x !! Input: multiplier - class(field_t), intent(inout) :: y - class(field_t), intent(in) :: x real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d @@ -741,11 +825,14 @@ end subroutine vecmult_cuda real(dp) function scalar_product_cuda(self, x, y) result(s) - !! [[m_base_backend(module):scalar_product(interface)]] + !! Compute global scalar product $\langle x, y \rangle$ using CUDA kernel and MPI reduction. + !! + !! Implements [[m_base_backend(module):scalar_product(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):scalar_product(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: x, y + class(field_t), intent(in) :: x, y !! Input fields real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d real(dp), device, allocatable :: sum_d @@ -791,12 +878,13 @@ end function scalar_product_cuda subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) + !!
Copy boundary data into MPI send buffers using CUDA kernel. implicit none real(dp), device, dimension(:, :, :), intent(out) :: u_send_s_dev, & - u_send_e_dev - real(dp), device, dimension(:, :, :), intent(in) :: u_dev - integer, intent(in) :: n + u_send_e_dev !! Send buffers + real(dp), device, dimension(:, :, :), intent(in) :: u_dev !! Source field + integer, intent(in) :: n !! Grid dimension type(dim3) :: blocks, threads integer :: n_halo = 4 @@ -809,13 +897,16 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) end subroutine copy_into_buffers subroutine field_max_mean_cuda(self, max_val, mean_val, f, enforced_data_loc) - !! [[m_base_backend(module):field_max_mean(interface)]] + !! Compute field maximum and mean using CUDA kernel and MPI reductions. + !! + !! Implements [[m_base_backend(module):field_max_mean(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):field_max_sum(interface)]] kernel. implicit none class(cuda_backend_t) :: self - real(dp), intent(out) :: max_val, mean_val - class(field_t), intent(in) :: f - integer, optional, intent(in) :: enforced_data_loc + real(dp), intent(out) :: max_val, mean_val !! Output: global maximum and mean + class(field_t), intent(in) :: f !! Input field + integer, optional, intent(in) :: enforced_data_loc !! Override field data location real(dp), device, pointer, dimension(:, :, :) :: f_d real(dp), device, allocatable :: max_d, sum_d @@ -871,11 +962,15 @@ subroutine field_max_mean_cuda(self, max_val, mean_val, f, enforced_data_loc) end subroutine field_max_mean_cuda subroutine field_scale_cuda(self, f, a) + !! Scale field by constant $f = a \cdot f$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):field_ops(interface)]] (field_scale binding). + !! Uses [[m_cuda_kernels_fieldops(module):field_scale(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! 
Field to scale in-place + real(dp), intent(in) :: a !! Scaling factor real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -891,11 +986,15 @@ subroutine field_scale_cuda(self, f, a) end subroutine field_scale_cuda subroutine field_shift_cuda(self, f, a) + !! Shift field by constant $f = f + a$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):field_ops(interface)]] (field_shift binding). + !! Uses [[m_cuda_kernels_fieldops(module):field_shift(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! Field to shift in-place + real(dp), intent(in) :: a !! Shift amount real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -911,13 +1010,13 @@ subroutine field_shift_cuda(self, f, a) end subroutine field_shift_cuda subroutine field_set_face_cuda(self, f, c_start, c_end, face) - !! [[m_base_backend(module):field_set_face(subroutine)]] + !! Set boundary face values using CUDA kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: f - real(dp), intent(in) :: c_start, c_end - integer, intent(in) :: face + class(field_t), intent(inout) :: f !! Field to modify + real(dp), intent(in) :: c_start, c_end !! Values for start and end faces + integer, intent(in) :: face !! Face identifier (X_FACE, Y_FACE, Z_FACE) real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -952,11 +1051,14 @@ subroutine field_set_face_cuda(self, f, c_start, c_end, face) end subroutine field_set_face_cuda real(dp) function field_volume_integral_cuda(self, f) result(s) - !! volume integral of a field + !! Compute volume integral using CUDA kernel and MPI reduction. + !! + !! Implements [[m_base_backend(module):field_reduce(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):volume_integral(interface)]] kernel. 
implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! Input field real(dp), device, pointer, dimension(:, :, :) :: f_d real(dp), device, allocatable :: integral_d @@ -991,28 +1093,34 @@ real(dp) function field_volume_integral_cuda(self, f) result(s) end function field_volume_integral_cuda subroutine copy_data_to_f_cuda(self, f, data) + !! Copy host array to device field. class(cuda_backend_t), intent(inout) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(inout) :: data + class(field_t), intent(inout) :: f !! Target device field + real(dp), dimension(:, :, :), intent(inout) :: data !! Source host array select type (f); type is (cuda_field_t); f%data_d = data; end select end subroutine copy_data_to_f_cuda subroutine copy_f_to_data_cuda(self, data, f) + !! Copy device field to host array. class(cuda_backend_t), intent(inout) :: self - real(dp), dimension(:, :, :), intent(out) :: data - class(field_t), intent(in) :: f + real(dp), dimension(:, :, :), intent(out) :: data !! Target host array + class(field_t), intent(in) :: f !! Source device field select type (f); type is (cuda_field_t); data = f%data_d; end select end subroutine copy_f_to_data_cuda subroutine init_cuda_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise CUDA FFT Poisson solver. + !! + !! Implements [[m_base_backend(module):init_poisson_fft(interface)]]. + !! Allocates [[m_cuda_poisson_fft(module):cuda_poisson_fft_t(type)]] instance. implicit none class(cuda_backend_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + type(mesh_t), intent(in) :: mesh !! Computational mesh + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + logical, optional, intent(in) :: lowmem !! 
Low memory mode flag allocate (cuda_poisson_fft_t :: self%poisson_fft) @@ -1024,8 +1132,9 @@ subroutine init_cuda_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) end subroutine init_cuda_poisson_fft subroutine resolve_field_t(u_dev, u) - real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_dev - class(field_t), intent(in) :: u + !! Helper to extract device pointer from cuda_field_t. + real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_dev !! Device pointer + class(field_t), intent(in) :: u !! Field object select type (u) type is (cuda_field_t) diff --git a/src/backend/cuda/common.f90 b/src/backend/cuda/common.f90 index 6165c38a7..d67dc477a 100644 --- a/src/backend/cuda/common.f90 +++ b/src/backend/cuda/common.f90 @@ -1,6 +1,16 @@ module m_cuda_common + !! Common constants for CUDA backend. + !! + !! CUDA GPUs execute threads in groups of 32 called warps. Setting the + !! pencil size to 32 ensures coalesced memory access patterns, where all + !! threads in a warp access consecutive memory locations simultaneously. + !! This is critical for GPU memory bandwidth efficiency. + !! + !! **Performance impact:** Matching the hardware warp size eliminates + !! divergence and maximises memory throughput, typically improving + !! performance by 2-3x compared to non-coalesced access. implicit none - integer, parameter :: SZ = 32 + integer, parameter :: SZ = 32 !! Pencil size matching GPU warp width end module m_cuda_common diff --git a/src/backend/cuda/exec_dist.f90 b/src/backend/cuda/exec_dist.f90 index 5a71bcc76..08048481c 100644 --- a/src/backend/cuda/exec_dist.f90 +++ b/src/backend/cuda/exec_dist.f90 @@ -1,4 +1,9 @@ module m_cuda_exec_dist + !! Distributed compact scheme execution on GPU. + !! + !! Orchestrates CUDA kernel launches and MPI halo exchange for distributed + !! compact finite difference schemes. Handles both generic derivative operations + !! and fused transport equation computation. 
use cudafor use mpi @@ -17,21 +22,25 @@ subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & tdsops, nproc, pprev, pnext, blocks, threads & ) + !! Execute distributed compact scheme derivative $du = d(u)$ on GPU. + !! + !! Calls distributed kernel, exchanges halo data for $2 \times 2$ boundary + !! systems, then applies substitution kernel. implicit none ! du = d(u) - real(dp), device, dimension(:, :, :), intent(out) :: du - real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - ! The ones below are intent(out) just so that we can write data in them, - ! not because we actually need the data they store later where this - ! subroutine is called. We absolutely don't care the data they pass back + real(dp), device, dimension(:, :, :), intent(out) :: du !! Output: derivative + real(dp), device, dimension(:, :, :), intent(in) :: u !! Input: field with local data + real(dp), device, dimension(:, :, :), intent(in) :: u_recv_s, u_recv_e !! Halo data from neighbours + + ! Temporary buffers for halo exchange (overwritten during computation) real(dp), device, dimension(:, :, :), intent(out) :: & du_send_s, du_send_e, du_recv_s, du_recv_e - type(cuda_tdsops_t), intent(in) :: tdsops - integer, intent(in) :: nproc, pprev, pnext - type(dim3), intent(in) :: blocks, threads + type(cuda_tdsops_t), intent(in) :: tdsops !! Tridiagonal operators + integer, intent(in) :: nproc, pprev, pnext !! MPI ranks (total, previous, next) + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration integer :: n_data @@ -64,27 +76,28 @@ subroutine exec_dist_transeq_3fused( & tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, & blocks, threads & ) + !! Execute fused transport equation computation on GPU with distributed compact scheme. + !! + !! Computes $r\_du = -\frac{1}{2}(v \frac{\partial u}{\partial x} + \frac{\partial (uv)}{\partial x}) + \nu \frac{\partial^2 u}{\partial x^2}$ + !!
Launches distributed kernel for three operators (du, dud, d2u), exchanges halo data for all + !! boundary systems in one batch, then applies substitution kernel. implicit none - ! r_du = -1/2*(v*d1(u) + d1(u*v)) + nu*d2(u) - !> The result array, it is also used as temporary storage - real(dp), device, dimension(:, :, :), intent(out) :: r_du - real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - real(dp), device, dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e + real(dp), device, dimension(:, :, :), intent(out) :: r_du !! Output: transport equation RHS + real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Field u with halos + real(dp), device, dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e !! Field v with halos - ! The ones below are intent(out) just so that we can write data in them, - ! not because we actually need the data they store later where this - ! subroutine is called. We absolutely don't care the data they pass back + ! Temporary storage for derivatives and halo exchange buffers real(dp), device, dimension(:, :, :), intent(out) :: dud, d2u real(dp), device, dimension(:, :, :), intent(out) :: & du_send_s, du_send_e, du_recv_s, du_recv_e, & dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e - type(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - real(dp), intent(in) :: nu - integer, intent(in) :: nproc, pprev, pnext - type(dim3), intent(in) :: blocks, threads + type(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for each derivative + real(dp), intent(in) :: nu !! Kinematic viscosity + integer, intent(in) :: nproc, pprev, pnext !! MPI ranks + type(dim3), intent(in) :: blocks, threads !! 
CUDA kernel configuration integer :: n_data diff --git a/src/backend/cuda/exec_thom.f90 b/src/backend/cuda/exec_thom.f90 index 50e757579..3834a20ac 100644 --- a/src/backend/cuda/exec_thom.f90 +++ b/src/backend/cuda/exec_thom.f90 @@ -1,4 +1,8 @@ module m_cuda_exec_thom + !! Thomas algorithm execution on GPU for local tridiagonal systems. + !! + !! Dispatches to periodic or non-periodic Thomas kernels based on + !! boundary conditions. No MPI communication required. use cudafor use m_common, only: dp @@ -10,12 +14,15 @@ module m_cuda_exec_thom contains subroutine exec_thom_tds_compact(du, u, tdsops, blocks, threads) + !! Execute Thomas algorithm for compact scheme derivative $du = d(u)$ on GPU. + !! + !! Selects periodic or non-periodic kernel variant based on operator configuration. implicit none - real(dp), device, dimension(:, :, :), intent(out) :: du - real(dp), device, dimension(:, :, :), intent(in) :: u - type(cuda_tdsops_t), intent(in) :: tdsops - type(dim3), intent(in) :: blocks, threads + real(dp), device, dimension(:, :, :), intent(out) :: du !! Output: derivative + real(dp), device, dimension(:, :, :), intent(in) :: u !! Input: field + type(cuda_tdsops_t), intent(in) :: tdsops !! Tridiagonal operators + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration if (tdsops%periodic) then call der_univ_thom_per<<>>( & !& diff --git a/src/backend/cuda/kernels/distributed.f90 b/src/backend/cuda/kernels/distributed.f90 index 8e7a1ba94..4cf97fc19 100644 --- a/src/backend/cuda/kernels/distributed.f90 +++ b/src/backend/cuda/kernels/distributed.f90 @@ -1,4 +1,9 @@ module m_cuda_kernels_dist + !! CUDA kernels for distributed compact finite difference schemes. + !! + !! GPU kernels implementing forward and backward sweeps for compact schemes + !! across MPI domain boundaries. Handles stencil application using halo data, + !! forward elimination, and backward substitution for distributed tridiagonal systems. 
use cudafor use m_common, only: dp @@ -11,16 +16,20 @@ attributes(global) subroutine der_univ_dist( & du, send_u_s, send_u_e, u, u_s, u_e, & n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, ffr, fbc, faf & ) + !! CUDA kernel for distributed compact scheme forward sweep and boundary setup. + !! + !! Applies compact stencils using local data (u) and halo data (u_s, u_e) from + !! neighbours. Performs forward elimination and prepares boundary data for MPI exchange. implicit none - ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_s, & - send_u_e - real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e - integer, value, intent(in) :: n_tds, n_rhs - real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: derivatives with forward elimination + real(dp), device, intent(out), dimension(:, :, :) :: send_u_s, send_u_e !! Boundary data for MPI exchange + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: local field data + real(dp), device, intent(in), dimension(:, :, :) :: u_s, u_e !! Halo data from start/end neighbours + integer, value, intent(in) :: n_tds, n_rhs !! Grid and RHS dimensions + real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Boundary stencil coefficients + real(dp), device, intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients + real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf !! Forward elimination factors ! Local variables integer :: i, j, b, k, lj @@ -148,17 +157,22 @@ end subroutine der_univ_dist attributes(global) subroutine der_univ_subs(du, recv_u_s, recv_u_e, & n, dist_sa, dist_sc, strch) + !! Backward substitution for distributed compact scheme. + !! + !! Completes the tridiagonal solve using boundary solutions received from + !! neighbouring MPI ranks. 
Applies Sherman-Morrison-like correction for + !! distributed system using Toeplitz matrix symmetry properties. implicit none ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e - real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc, strch - integer, value, intent(in) :: n + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Final derivative solution + real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e !! Boundary solutions from neighbours + real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc, strch !! Distributed coefficients and stretching + integer, value, intent(in) :: n !! Number of local grid points ! Local variables - integer :: i, j, b - real(dp) :: ur, bl, recp, du_s, du_e + integer :: i, j, b !! Thread, loop, and block indices + real(dp) :: ur, bl, recp, du_s, du_e !! Upper-right, bottom-left, reciprocal, boundary solutions i = threadIdx%x b = blockIdx%x @@ -201,39 +215,44 @@ attributes(global) subroutine transeq_3fused_dist( & dud_coeffs_s, dud_coeffs_e, dud_coeffs, dud_fw, dud_bw, dud_af, & d2u_coeffs_s, d2u_coeffs_e, d2u_coeffs, d2u_fw, d2u_bw, d2u_af & ) + !! Distributed forward sweep for 3 fused transport equation derivatives. + !! + !! Computes du, dud (convective), and d2u simultaneously using independent + !! compact stencils. Performs forward elimination and prepares boundary data + !! for MPI exchange. Optimised for transport equation with convective terms. implicit none ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, dud, d2u + real(dp), device, intent(out), dimension(:, :, :) :: du, dud, d2u !! Output: Three derivative fields real(dp), device, intent(out), dimension(:, :, :) :: & - send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e + send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e !! 
Boundary data for MPI exchange real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e, & - v, v_s, v_e - integer, value, intent(in) :: n_tds, n_rhs + v, v_s, v_e !! Input fields and halos + integer, value, intent(in) :: n_tds, n_rhs !! Grid dimensions real(dp), device, intent(in) :: du_coeffs_s(:, :), du_coeffs_e(:, :), & - du_coeffs(:) - real(dp), device, intent(in) :: du_fw(:), du_bw(:), du_af(:) + du_coeffs(:) !! du stencil coefficients + real(dp), device, intent(in) :: du_fw(:), du_bw(:), du_af(:) !! du forward/backward/alpha factors real(dp), device, intent(in) :: dud_coeffs_s(:, :), dud_coeffs_e(:, :), & - dud_coeffs(:) - real(dp), device, intent(in) :: dud_fw(:), dud_bw(:), dud_af(:) + dud_coeffs(:) !! dud stencil coefficients + real(dp), device, intent(in) :: dud_fw(:), dud_bw(:), dud_af(:) !! dud forward/backward/alpha factors real(dp), device, intent(in) :: d2u_coeffs_s(:, :), d2u_coeffs_e(:, :), & - d2u_coeffs(:) - real(dp), device, intent(in) :: d2u_fw(:), d2u_bw(:), d2u_af(:) + d2u_coeffs(:) !! d2u stencil coefficients + real(dp), device, intent(in) :: d2u_fw(:), d2u_bw(:), d2u_af(:) !! d2u forward/backward/alpha factors ! Local variables - integer :: i, j, b + integer :: i, j, b !! Thread, loop, and block indices real(dp) :: du_c_m4, du_c_m3, du_c_m2, du_c_m1, du_c_j, & du_c_p1, du_c_p2, du_c_p3, du_c_p4, & - du_alpha, du_last_r + du_alpha, du_last_r !! du stencil coefficients and factors real(dp) :: dud_c_m4, dud_c_m3, dud_c_m2, dud_c_m1, dud_c_j, & dud_c_p1, dud_c_p2, dud_c_p3, dud_c_p4, & - dud_alpha, dud_last_r + dud_alpha, dud_last_r !! dud stencil coefficients and factors real(dp) :: d2u_c_m4, d2u_c_m3, d2u_c_m2, d2u_c_m1, d2u_c_j, & d2u_c_p1, d2u_c_p2, d2u_c_p3, d2u_c_p4, & - d2u_alpha, d2u_last_r - real(dp) :: temp_du, temp_dud, temp_d2u - real(dp) :: u_m4, u_m3, u_m2, u_m1, u_j, u_p1, u_p2, u_p3, u_p4 + d2u_alpha, d2u_last_r !! d2u stencil coefficients and factors + real(dp) :: temp_du, temp_dud, temp_d2u !! 
Temporary derivative values + real(dp) :: u_m4, u_m3, u_m2, u_m1, u_j, u_p1, u_p2, u_p3, u_p4 !! Reused field values real(dp) :: v_m4, v_m3, v_m2, v_m1, v_j, v_p1, v_p2, v_p3, v_p4 real(dp) :: old_du, old_dud, old_d2u @@ -593,26 +612,31 @@ attributes(global) subroutine transeq_3fused_subs( & n, nu, du_sa, du_sc, du_strch, dud_sa, dud_sc, dud_strch, & d2u_sa, d2u_sc, d2u_strch, d2u_strch_cor & ) + !! Backward substitution for 3 fused transport equation derivatives. + !! + !! Completes distributed tridiagonal solves for du, dud, d2u using boundary + !! solutions from neighbours. Combines results to form RHS of transport equation: + !! r_du = -conv*dud + nu*d2u. Applies Sherman-Morrison corrections for all three fields. implicit none ! Arguments !> The result array, it stores 'du' first then its overwritten - real(dp), device, intent(inout), dimension(:, :, :) :: r_du - real(dp), device, intent(in), dimension(:, :, :) :: conv, dud, d2u + real(dp), device, intent(inout), dimension(:, :, :) :: r_du !! In/out: Stores du then overwritten with RHS + real(dp), device, intent(in), dimension(:, :, :) :: conv, dud, d2u !! Input: Convection velocity and derivatives real(dp), device, intent(in), dimension(:, :, :) :: & - recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e - integer, value, intent(in) :: n - real(dp), value, intent(in) :: nu + recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e !! Boundary solutions from neighbours + integer, value, intent(in) :: n !! Number of local grid points + real(dp), value, intent(in) :: nu !! Kinematic viscosity real(dp), device, intent(in), dimension(:) :: du_sa, du_sc, du_strch, & dud_sa, dud_sc, dud_strch, & d2u_sa, d2u_sc, d2u_strch, & - d2u_strch_cor + d2u_strch_cor !! Distributed coefficients for all three fields ! Local variables - integer :: i, j, b - real(dp) :: ur, bl, recp - real(dp) :: du_temp, dud_temp, d2u_temp - real(dp) :: du_s, du_e, dud_s, dud_e, d2u_s, d2u_e + integer :: i, j, b !! 
Thread, loop, and block indices + real(dp) :: ur, bl, recp !! Upper-right, bottom-left, reciprocal for Sherman-Morrison + real(dp) :: du_temp, dud_temp, d2u_temp !! Temporary derivative values + real(dp) :: du_s, du_e, dud_s, dud_e, d2u_s, d2u_e !! Boundary solutions for all three fields i = threadIdx%x b = blockIdx%x diff --git a/src/backend/cuda/kernels/fieldops.f90 b/src/backend/cuda/kernels/fieldops.f90 index 949bc6ab6..d3147fa5a 100644 --- a/src/backend/cuda/kernels/fieldops.f90 +++ b/src/backend/cuda/kernels/fieldops.f90 @@ -1,4 +1,10 @@ module m_cuda_kernels_fieldops + !! CUDA kernels for field operations (copy, scale, vector arithmetic, reductions). + !! + !! Provides GPU kernels for basic field manipulation: copying, scaling, shifting, + !! linear combinations (AXPBY), pointwise multiplication, scalar products, and + !! reductions (max, sum, volume integral). All kernels use thread-per-pencil-point + !! parallelisation with [[m_cuda_common(module):SZ(variable)]] threads per block. use cudafor use m_common, only: dp @@ -7,13 +13,16 @@ module m_cuda_kernels_fieldops contains attributes(global) subroutine copy(n, dst, src) + !! Copy field data: dst = src. implicit none - integer, value, intent(in) :: n - real(dp), device, intent(out), dimension(:, :, :) :: dst - real(dp), device, intent(in), dimension(:, :, :) :: src + integer, value, intent(in) :: n !! Pencil length + real(dp), device, intent(out), dimension(:, :, :) :: dst !! Destination array + real(dp), device, intent(in), dimension(:, :, :) :: src !! Source array - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -25,14 +34,17 @@ attributes(global) subroutine copy(n, dst, src) end subroutine copy attributes(global) subroutine axpby(n, alpha, x, beta, y) + !! Compute linear combination: y = alpha*x + beta*y. 
implicit none - integer, value, intent(in) :: n - real(dp), value, intent(in) :: alpha, beta - real(dp), device, intent(in), dimension(:, :, :) :: x - real(dp), device, intent(inout), dimension(:, :, :) :: y + integer, value, intent(in) :: n !! Pencil length + real(dp), value, intent(in) :: alpha, beta !! Scalar coefficients + real(dp), device, intent(in), dimension(:, :, :) :: x !! Input array + real(dp), device, intent(inout), dimension(:, :, :) :: y !! Input/Output array - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -44,13 +56,16 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y) end subroutine axpby attributes(global) subroutine pwmul(y, x, n) + !! Pointwise multiplication: y = y * x. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: y - real(dp), device, intent(in), dimension(:, :, :) :: x - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: y !! Input/Output array + real(dp), device, intent(in), dimension(:, :, :) :: x !! Multiplier array + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -62,13 +77,20 @@ attributes(global) subroutine pwmul(y, x, n) end subroutine pwmul attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) + !! Copy halo regions into send buffers. + !! + !! Extracts first and last n_halo planes into separate buffers for MPI communication. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s, u_send_e - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n, n_halo + real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s !! 
Start buffer + real(dp), device, intent(inout), dimension(:, :, :) :: u_send_e !! End buffer + real(dp), device, intent(in), dimension(:, :, :) :: u !! Source field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_halo !! Halo width - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Halo plane index + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -81,13 +103,16 @@ attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) end subroutine buffer_copy attributes(global) subroutine field_scale(f, alpha, n) + !! Scale field by constant: f = alpha * f. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: alpha - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to scale + real(dp), value, intent(in) :: alpha !! Scaling factor + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -99,13 +124,16 @@ attributes(global) subroutine field_scale(f, alpha, n) end subroutine field_scale attributes(global) subroutine field_shift(f, const, n) + !! Shift field by constant: f = f + const. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: const - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to shift + real(dp), value, intent(in) :: const !! Shift constant + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! 
Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -117,14 +145,24 @@ attributes(global) subroutine field_shift(f, const, n) end subroutine field_shift attributes(global) subroutine scalar_product(s, x, y, n, n_i_pad, n_j) + !! Compute scalar product with atomic reduction: s += sum(x * y). + !! + !! Uses atomic addition to accumulate partial sums from each pencil. implicit none - real(dp), device, intent(inout) :: s - real(dp), device, intent(in), dimension(:, :, :) :: x, y - integer, value, intent(in) :: n, n_i_pad, n_j + real(dp), device, intent(inout) :: s !! Accumulated scalar product + real(dp), device, intent(in), dimension(:, :, :) :: x !! First field + real(dp), device, intent(in), dimension(:, :, :) :: y !! Second field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count - real(dp) :: s_pncl !! pencil sum - integer :: i, j, b, b_i, b_j, ierr + real(dp) :: s_pncl !! Pencil sum + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x @@ -142,14 +180,26 @@ attributes(global) subroutine scalar_product(s, x, y, n, n_i_pad, n_j) end subroutine scalar_product attributes(global) subroutine field_max_sum(max_f, sum_f, f, n, n_i_pad, n_j) + !! Compute field maximum and sum with atomic reductions. + !! + !! Uses atomic max and add operations to accumulate pencil-wise results. implicit none - real(dp), device, intent(inout) :: max_f, sum_f - real(dp), device, intent(in), dimension(:, :, :) :: f - integer, value, intent(in) :: n, n_i_pad, n_j - - real(dp) :: max_pncl, sum_pncl, val - integer :: i, j, b, b_i, b_j, ierr + real(dp), device, intent(inout) :: max_f !! Accumulated maximum + real(dp), device, intent(inout) :: sum_f !! 
Accumulated sum + real(dp), device, intent(in), dimension(:, :, :) :: f !! Input field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count + + real(dp) :: max_pncl !! Pencil maximum + real(dp) :: sum_pncl !! Pencil sum + real(dp) :: val !! Absolute value + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x @@ -171,15 +221,21 @@ attributes(global) subroutine field_max_sum(max_f, sum_f, f, n, n_i_pad, n_j) end subroutine field_max_sum attributes(global) subroutine field_set_y_face(f, c_start, c_end, nx, ny, nz) - !! Set domain Y_FACE to a constant - !! c_start at the bottom and c_end at the top + !! Set Y-face boundary values to constants. + !! + !! Sets bottom face (y=0) to c_start and top face (y=L) to c_end. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: c_start, c_end - integer, value, intent(in) :: nx, ny, nz + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to modify + real(dp), value, intent(in) :: c_start !! Bottom boundary value + real(dp), value, intent(in) :: c_end !! Top boundary value + integer, value, intent(in) :: nx, ny, nz !! Grid dimensions - integer :: i, j, b, n_mod, b_end + integer :: i !! Thread index + integer :: j !! X-coordinate + integer :: b !! Z-coordinate block + integer :: n_mod !! Modulo for top boundary indexing + integer :: b_end !! Top boundary block index j = threadIdx%x + (blockIdx%x - 1)*blockDim%x ! from 1 to nx b = blockIdx%y ! from 1 to nz @@ -195,14 +251,23 @@ attributes(global) subroutine field_set_y_face(f, c_start, c_end, nx, ny, nz) end subroutine field_set_y_face attributes(global) subroutine volume_integral(s, f, n, n_i_pad, n_j) + !! 
Compute volume integral with atomic reduction: s += sum(f). + !! + !! Uses atomic addition to accumulate partial sums from each pencil. implicit none - real(dp), device, intent(inout) :: s - real(dp), device, intent(in), dimension(:, :, :) :: f - integer, value, intent(in) :: n, n_i_pad, n_j - - real(dp) :: s_pncl !! pencil sum - integer :: i, j, b, b_i, b_j, ierr + real(dp), device, intent(inout) :: s !! Accumulated integral + real(dp), device, intent(in), dimension(:, :, :) :: f !! Input field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count + + real(dp) :: s_pncl !! Pencil sum + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x diff --git a/src/backend/cuda/kernels/reorder.f90 b/src/backend/cuda/kernels/reorder.f90 index 4065a2595..cd96a029b 100644 --- a/src/backend/cuda/kernels/reorder.f90 +++ b/src/backend/cuda/kernels/reorder.f90 @@ -1,4 +1,10 @@ module m_cuda_kernels_reorder + !! CUDA kernels for pencil reordering and accumulation between X/Y/Z orientations. + !! + !! Provides GPU kernels for rearranging field data between different pencil decompositions + !! (X-pencils, Y-pencils, Z-pencils, and Cartesian). Most kernels use shared memory tiles + !! for coalesced memory access. Thread blocks use [[m_cuda_common(module):SZ(variable)]] + !! configuration (32x1 or 32x32 depending on operation). use cudafor use m_common, only: dp @@ -7,14 +13,18 @@ module m_cuda_kernels_reorder contains attributes(global) subroutine reorder_c2x(u_x, u_c, nz) + !! Reorder from Cartesian to X-pencil orientation. + !! + !! Uses shared memory transpose for efficient reordering. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_c - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_c !! Input: Cartesian data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -42,14 +52,18 @@ attributes(global) subroutine reorder_c2x(u_x, u_c, nz) end subroutine reorder_c2x attributes(global) subroutine reorder_x2c(u_c, u_x, nz) + !! Reorder from X-pencil to Cartesian orientation. + !! + !! Inverse of reorder_c2x. Uses shared memory transpose. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_c - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_c !! Output: Cartesian data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -77,14 +91,18 @@ attributes(global) subroutine reorder_x2c(u_c, u_x, nz) end subroutine reorder_x2c attributes(global) subroutine reorder_x2y(u_y, u_x, nz) + !! Reorder from X-pencil to Y-pencil orientation. + !! + !! Uses shared memory transpose for efficient reordering. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_y - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_y !! Output: Y-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -112,13 +130,19 @@ attributes(global) subroutine reorder_x2y(u_y, u_x, nz) end subroutine reorder_x2y attributes(global) subroutine reorder_x2z(u_z, u_x, nz) + !! Reorder from X-pencil to Z-pencil orientation. + !! + !! No shared memory needed - memory access pattern is already favourable. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_z - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_z !! Output: Z-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i !! Thread index + integer :: j !! Loop index + integer :: b_i, b_j !! Block indices + integer :: nx !! Grid X-dimension i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x @@ -132,14 +156,15 @@ attributes(global) subroutine reorder_x2z(u_z, u_x, nz) end subroutine reorder_x2z attributes(global) subroutine reorder_y2x(u_x, u_y, nz) + !! Reorder from Y-pencil to X-pencil orientation. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j, b_i, b_j, b_k !! Thread and block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -167,14 +192,15 @@ attributes(global) subroutine reorder_y2x(u_x, u_y, nz) end subroutine reorder_y2x attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) + !! Reorder from Y-pencil to Z-pencil orientation. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_z - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nx, nz + real(dp), device, intent(out), dimension(:, :, :) :: u_z !! Output: Z-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data + integer, value, intent(in) :: nx, nz !! Grid dimensions - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j, b_i, b_j, b_k !! Thread and block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -202,13 +228,16 @@ attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) end subroutine reorder_y2z attributes(global) subroutine reorder_z2x(u_x, u_z, nz) + !! Reorder from Z-pencil to X-pencil orientation. + !! + !! No shared memory needed - favourable memory access pattern. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i, j, b_i, b_j, nx !! Thread, loop, block indices and grid size i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x @@ -220,14 +249,17 @@ attributes(global) subroutine reorder_z2x(u_x, u_z, nz) end subroutine reorder_z2x attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) + !! Reorder from Z-pencil to Y-pencil orientation. + !! + !! Uses shared memory tile for coalesced access pattern. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_y - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nx, nz + real(dp), device, intent(out), dimension(:, :, :) :: u_y !! Output: Y-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data + integer, value, intent(in) :: nx, nz !! X and Z dimension sizes - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory tile for transpose + integer :: i, j, b_i, b_j, b_k !! Thread, block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -255,14 +287,18 @@ attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) end subroutine reorder_z2y attributes(global) subroutine sum_yintox(u_x, u_y, nz) + !! Accumulate Y-pencil contributions into X-pencil data. + !! + !! Performs u_x += u_y with reordering. Uses shared memory tile + !! for efficient transpose and coalesced memory access. 
implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz + real(dp), device, intent(inout), dimension(:, :, :) :: u_x !! In/out: X-pencil data to accumulate into + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data to add + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory tile for transpose + integer :: i, j, b_i, b_j, b_k !! Thread, block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -294,14 +330,18 @@ attributes(global) subroutine sum_yintox(u_x, u_y, nz) end subroutine sum_yintox attributes(global) subroutine sum_zintox(u_x, u_z, nz) + !! Accumulate Z-pencil contributions into X-pencil data. + !! + !! Performs u_x += u_z with reordering. No shared memory needed + !! due to favourable memory access pattern. implicit none ! Arguments - real(dp), device, intent(inout), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nz + real(dp), device, intent(inout), dimension(:, :, :) :: u_x !! In/out: X-pencil data to accumulate into + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data to add + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i, j, b_i, b_j, nx !! Thread, loop, block indices and grid size i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x diff --git a/src/backend/cuda/kernels/spectral_processing.f90 b/src/backend/cuda/kernels/spectral_processing.f90 index 54b27b364..0a97f6cee 100644 --- a/src/backend/cuda/kernels/spectral_processing.f90 +++ b/src/backend/cuda/kernels/spectral_processing.f90 @@ -1,4 +1,14 @@ module m_cuda_spectral + !! 
CUDA kernels for spectral space processing and FFT post-processing. + !! + !! This module contains kernels for: + !! + !! - Post-processing spectral transforms (forward/backward) + !! - Solving Poisson equations in spectral space + !! - Enforcing and undoing periodicity in Y-direction + !! + !! Implements spectral equivalence method from JCP 228 (2009), 5989-6015, Sec 4. + !! Handles both periodic (000) and non-periodic (010) boundary conditions. use cudafor use m_common, only: dp @@ -8,14 +18,16 @@ module m_cuda_spectral contains attributes(global) subroutine memcpy3D(dst, src, nx, ny, nz) - !! Copy data between x3d2 padded arrays and cuFFTMp descriptors + !! Copy data between x3d2 padded arrays and cuFFTMp descriptors. + !! + !! Each thread handles one Y-Z plane position, looping over X. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: dst - real(dp), device, intent(in), dimension(:, :, :) :: src - integer, value, intent(in) :: nx, ny, nz + real(dp), device, intent(inout), dimension(:, :, :) :: dst !! Output: Destination array + real(dp), device, intent(in), dimension(:, :, :) :: src !! Input: Source array + integer, value, intent(in) :: nx, ny, nz !! Grid dimensions - integer :: i, j, k + integer :: i, j, k !! Loop and thread indices j = threadIdx%x + (blockIdx%x - 1)*blockDim%x !ny k = blockIdx%y !nz @@ -34,23 +46,19 @@ attributes(global) subroutine process_spectral_000( & !! Post-processes the divergence of velocity in spectral space, including !! scaling w.r.t. grid size. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs forward post-processing, Poisson solve, and backward post-processing + !! using spectral equivalence method. Ref: JCP 228 (2009), 5989-6015, Sec 4. 
implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - complex(dp), device, intent(in), dimension(:, :, :) :: waves - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec - !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st - !> Grid size - integer, value, intent(in) :: nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence of velocity in spectral space + complex(dp), device, intent(in), dimension(:, :, :) :: waves !! Input: Spectral wavenumbers for Poisson solve + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size + integer, value, intent(in) :: y_sp_st !! Y-direction offset in the permuted slabs in spectral space + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz - real(dp) :: tmp_r, tmp_c, div_r, div_c + integer :: i, j, k, ix, iy, iz !! Loop and spectral mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c !! Temporary real/imaginary components j = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -130,26 +138,22 @@ attributes(global) subroutine process_spectral_010( & div_u, waves, nx_spec, ny_spec, y_sp_st, nx, ny, nz, & ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Post-process divergence field and solve Poisson equation in spectral space + !! for non-periodic boundary conditions in Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs forward post-processing with odd/even mode handling, Poisson solve, + !! and backward post-processing. 
Ref: JCP 228 (2009), 5989-6015, Sec 4. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - complex(dp), device, intent(in), dimension(:, :, :) :: waves - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec - !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st - !> Grid size - integer, value, intent(in) :: nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence field / pressure solution + complex(dp), device, intent(in), dimension(:, :, :) :: waves !! Input: Spectral wavenumbers for Poisson solve + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary components for left/right modes i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -288,25 +292,23 @@ end subroutine process_spectral_010 attributes(global) subroutine process_spectral_010_fw( & div_u, nx_spec, ny_spec, y_sp_st, nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Forward post-processing only for non-periodic Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! 
Performs normalisation, post-processing in X and Z, and odd/even mode handling + !! in Y. Used when Poisson solve and backward processing are separate steps. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence field to post-process + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs !> Grid size - integer, value, intent(in) :: nx, ny, nz + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary real/imaginary components i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -368,22 +370,19 @@ end subroutine process_spectral_010_fw attributes(global) subroutine process_spectral_010_poisson( & div_u, a_re, a_im, off, inc, nx_spec, n, nx, ny, nz & ) - !! Solve the Poisson equation at cell centres with non-perioic BC along y + !! Solve Poisson equation for non-periodic Y-direction using pentadiagonal solver. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Handles odd/even mode separation using offset and increment parameters. + !! Modifies pentadiagonal coefficients in-place during forward/backward passes. 
implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(inout), dimension(:, :, :, :) :: a_re, a_im - !> offset and increment. increment is 2 when considering only odd or even - integer, value, intent(in) :: off, inc - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, n, nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: RHS / Solution + real(dp), device, intent(inout), dimension(:, :, :, :) :: a_re, a_im !! In/out: Pentadiagonal coefficients (real/imag) + integer, value, intent(in) :: off, inc !! Offset and increment for odd/even modes + integer, value, intent(in) :: nx_spec, n, nx, ny, nz !! Grid dimensions - integer :: i, j, k, jm, nm - real(dp) :: tmp_r, tmp_c, div_r, div_c, epsilon + integer :: i, j, k, jm, nm !! Loop indices and mapped indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, epsilon !! Temporary variables and tolerance i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -527,25 +526,23 @@ end subroutine process_spectral_010_poisson attributes(global) subroutine process_spectral_010_bw( & div_u, nx_spec, ny_spec, y_sp_st, nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Backward post-processing only for non-periodic Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs odd/even mode recombination and post-processing in X and Z directions. + !! Completes the spectral-to-physical transformation after Poisson solve. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! 
In/out: Solution field to post-process + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs !> Grid size - integer, value, intent(in) :: nx, ny, nz + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary real/imaginary components i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -605,13 +602,17 @@ attributes(global) subroutine process_spectral_010_bw( & end subroutine process_spectral_010_bw attributes(global) subroutine enforce_periodicity_y(f_out, f_in, ny) + !! Enforce Y-direction periodicity by reordering data for non-periodic transforms. + !! + !! Maps full domain [1:ny] to symmetric layout required by non-periodic FFT. + !! First half: odd points, second half: even points in reverse order. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: f_out - real(dp), device, intent(in), dimension(:, :, :) :: f_in - integer, value, intent(in) :: ny + real(dp), device, intent(out), dimension(:, :, :) :: f_out !! Output: Reordered field + real(dp), device, intent(in), dimension(:, :, :) :: f_in !! Input: Original field + integer, value, intent(in) :: ny !! Y-dimension size - integer :: i, j, k + integer :: i, j, k !! 
Thread and loop indices i = threadIdx%x k = blockIdx%x @@ -626,13 +627,17 @@ attributes(global) subroutine enforce_periodicity_y(f_out, f_in, ny) end subroutine enforce_periodicity_y attributes(global) subroutine undo_periodicity_y(f_out, f_in, ny) + !! Undo Y-direction periodicity reordering after non-periodic transforms. + !! + !! Inverse of enforce_periodicity_y: reconstructs original domain layout + !! from symmetric FFT ordering. Restores odd/even point positions. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: f_out - real(dp), device, intent(in), dimension(:, :, :) :: f_in - integer, value, intent(in) :: ny + real(dp), device, intent(out), dimension(:, :, :) :: f_out !! Output: Restored field + real(dp), device, intent(in), dimension(:, :, :) :: f_in !! Input: Reordered field + integer, value, intent(in) :: ny !! Y-dimension size - integer :: i, j, k + integer :: i, j, k !! Thread and loop indices i = threadIdx%x k = blockIdx%x diff --git a/src/backend/cuda/kernels/thomas.f90 b/src/backend/cuda/kernels/thomas.f90 index b5bf81169..af9522f7a 100644 --- a/src/backend/cuda/kernels/thomas.f90 +++ b/src/backend/cuda/kernels/thomas.f90 @@ -1,4 +1,14 @@ module m_cuda_kernels_thom + !! CUDA kernels for Thomas algorithm-based tridiagonal solvers. + !! + !! Implements compact finite difference schemes using Thomas algorithm + !! for both periodic and non-periodic boundary conditions. Each thread + !! handles one pencil line through the domain. + !! + !! Variants: + !! + !! - `der_univ_thom`: Non-periodic boundaries with explicit near-boundary stencils + !! - `der_univ_thom_per`: Periodic boundaries with cyclic reduction use cudafor use m_common, only: dp @@ -11,18 +21,24 @@ attributes(global) subroutine der_univ_thom( & du, u, n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, & thom_f, thom_s, thom_w, strch & ) + !! Compute derivatives using Thomas algorithm with non-periodic boundaries. + !! + !! 
Forward pass: Apply compact stencil and eliminate sub-diagonal. + !! Backward pass: Back-substitution to solve tridiagonal system. + !! Near-boundary points use explicit stencils from coeffs_s/coeffs_e. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n_tds, n_rhs - real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, strch + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Derivative field + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: Field to differentiate + integer, value, intent(in) :: n_tds, n_rhs !! Number of unknowns and RHS points + real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Start/end explicit stencil coefficients + real(dp), device, intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients (9-point) + real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, & + thom_w, strch !! Thomas algorithm coefficients and stretching - integer :: i, j, b + integer :: i, j, b !! Thread, loop, and block indices - real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, temp_du + real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, temp_du !! Stencil coefficients and temporary i = threadIdx%x b = blockIdx%x @@ -120,21 +136,26 @@ end subroutine der_univ_thom attributes(global) subroutine der_univ_thom_per( & du, u, n, coeffs, alpha, thom_f, thom_s, thom_w, thom_p, strch & ) + !! Compute derivatives using Thomas algorithm with periodic boundaries. + !! + !! Forward pass: Apply periodic compact stencil with modulo indexing. + !! Backward pass: Standard back-substitution. + !! Periodic correction: Sherman-Morrison formula for cyclic system. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), value, intent(in) :: alpha + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Derivative field + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: Field to differentiate + integer, value, intent(in) :: n !! Number of points in periodic direction + real(dp), device, intent(in), dimension(:) :: coeffs !! Stencil coefficients (9-point) + real(dp), value, intent(in) :: alpha !! Periodic coupling coefficient real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, & - thom_p, strch + thom_p, strch !! Thomas and periodic correction coefficients - integer :: i, j, b - integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 + integer :: i, j, b !! Thread, loop, and block indices + integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 !! Periodic neighbor indices - real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 - real(dp) :: temp_du, ss + real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 !! Stencil coefficients + real(dp) :: temp_du, ss !! Temporary derivative and Sherman-Morrison correction i = threadIdx%x b = blockIdx%x diff --git a/src/backend/cuda/poisson_fft.f90 b/src/backend/cuda/poisson_fft.f90 index 32a362f10..5b421a3ef 100644 --- a/src/backend/cuda/poisson_fft.f90 +++ b/src/backend/cuda/poisson_fft.f90 @@ -1,4 +1,9 @@ module m_cuda_poisson_fft + !! FFT-based Poisson solver on GPU using cuFFT. + !! + !! Extends `poisson_fft_t` with device-resident spectral data and cuFFT plans. + !! Handles forward/backward transforms, spectral post-processing for different + !! boundary conditions, and periodic extensions. 
use iso_c_binding, only: c_loc, c_ptr, c_f_pointer, c_int, c_float, & c_double_complex, c_float_complex use iso_fortran_env, only: stderr => error_unit @@ -24,7 +29,7 @@ module m_cuda_poisson_fft implicit none type, extends(poisson_fft_t) :: cuda_poisson_fft_t - !! FFT based Poisson solver + !! GPU-accelerated FFT-based Poisson solver with device-resident spectral data. !> Local domain sized array storing the spectral equivalence constants complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev @@ -149,20 +154,28 @@ end subroutine create_fft_plan function init(mesh, xdirps, ydirps, zdirps, lowmem) & result(poisson_fft) + !! Initialise CUDA Poisson FFT solver with cuFFT plans and spectral arrays. + !! + !! Sets up 3D FFT plans, allocates device storage for wave numbers and + !! stretching operators, and configures 1D decomposition (Z in real space, + !! Y in spectral space). implicit none - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + type(mesh_t), intent(in) :: mesh !! Computational mesh + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + logical, optional, intent(in) :: lowmem !! Low memory mode flag - type(cuda_poisson_fft_t) :: poisson_fft + type(cuda_poisson_fft_t) :: poisson_fft !! Initialised solver - integer :: nx, ny, nz + integer :: nx, ny, nz !! Global grid dimensions - integer :: ierr - integer(int_ptr_kind()) :: worksize + integer :: ierr !! Error code + integer(int_ptr_kind()) :: worksize !! cuFFT workspace size - integer :: dims_glob(3), dims_loc(3), n_spec(3), n_sp_st(3) + integer :: dims_glob(3) !! Global domain dimensions + integer :: dims_loc(3) !! Local domain dimensions + integer :: n_spec(3) !! Spectral space dimensions + integer :: n_sp_st(3) !! Spectral space start indices ! 
1D decomposition along Z in real domain, and along Y in spectral space if (mesh%par%nproc_dir(2) /= 1) print *, 'nproc_dir in y-dir must be 1' @@ -282,19 +295,25 @@ function init(mesh, xdirps, ydirps, zdirps, lowmem) & end function init subroutine fft_forward_cuda(self, f) + !! Execute forward 3D FFT on device field. + !! + !! Copies padded field data into cuFFT descriptor storage and performs + !! forward transform using cuFFTMp. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! Input field in real space - real(dp), device, pointer :: padded_dev(:, :, :), d_dev(:, :, :) - real(dp), device, pointer :: f_ptr - type(c_ptr) :: f_c_ptr + real(dp), device, pointer :: padded_dev(:, :, :) !! Padded field data + real(dp), device, pointer :: d_dev(:, :, :) !! cuFFT descriptor data + real(dp), device, pointer :: f_ptr !! Workaround device pointer for cuFFT + type(c_ptr) :: f_c_ptr !! Intermediate C pointer for workaround - type(cudaXtDesc), pointer :: descriptor + type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor - integer :: tsize, ierr - type(dim3) :: blocks, threads + integer :: tsize !! Thread block size + integer :: ierr !! Error code + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f) type is (cuda_field_t) @@ -340,19 +359,25 @@ subroutine fft_forward_cuda(self, f) end subroutine fft_forward_cuda subroutine fft_backward_cuda(self, f) + !! Execute backward 3D FFT and copy result to device field. + !! + !! Performs inverse transform using cuFFTMp and copies result from + !! descriptor storage back to field's device array. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f + class(field_t), intent(inout) :: f !! Output field in real space - real(dp), device, pointer :: padded_dev(:, :, :), d_dev(:, :, :) - real(dp), device, pointer :: f_ptr - type(c_ptr) :: f_c_ptr + real(dp), device, pointer :: padded_dev(:, :, :) !! 
Padded field data + real(dp), device, pointer :: d_dev(:, :, :) !! cuFFT descriptor data + real(dp), device, pointer :: f_ptr !! Workaround device pointer for cuFFT + type(c_ptr) :: f_c_ptr !! Intermediate C pointer for workaround - type(cudaXtDesc), pointer :: descriptor + type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor - integer :: tsize, ierr - type(dim3) :: blocks, threads + integer :: tsize !! Thread block size + integer :: ierr !! Error code + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f) type is (cuda_field_t) @@ -399,15 +424,19 @@ end subroutine fft_backward_cuda subroutine fft_postprocess_000_cuda(self) + !! Post-process spectral data for Periodic-Periodic-Periodic boundaries. + !! + !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with + !! periodic boundaries in all directions. implicit none class(cuda_poisson_fft_t) :: self - type(cudaXtDesc), pointer :: descriptor + type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor - complex(dp), device, dimension(:, :, :), pointer :: c_dev - type(dim3) :: blocks, threads - integer :: tsize + complex(dp), device, dimension(:, :, :), pointer :: c_dev !! Spectral data + type(dim3) :: blocks, threads !! CUDA kernel configuration + integer :: tsize !! Thread block size ! tsize is different than SZ, because here we work on a 3D Cartesian ! data structure, and free to specify any suitable thread/block size. @@ -438,15 +467,22 @@ end subroutine fft_postprocess_000_cuda subroutine fft_postprocess_010_cuda(self) + !! Post-process spectral data for Periodic-Non-periodic-Periodic boundaries. + !! + !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with periodic + !! boundaries in X and Z, non-periodic in Y. Handles stretched meshes with + !! matrix solves in spectral space.
implicit none class(cuda_poisson_fft_t) :: self type(cudaXtDesc), pointer :: descriptor - complex(dp), device, dimension(:, :, :), pointer :: c_dev - type(dim3) :: blocks, threads - integer :: tsize, off, inc + complex(dp), device, dimension(:, :, :), pointer :: c_dev !! Spectral data + type(dim3) :: blocks, threads !! CUDA kernel configuration + integer :: tsize !! Thread block size + integer :: off !! Array offset for odd/even modes + integer :: inc !! Array increment stride ! tsize is different than SZ, because here we work on a 3D Cartesian ! data structure, and free to specify any suitable thread/block size. @@ -542,14 +578,19 @@ subroutine fft_postprocess_010_cuda(self) end subroutine fft_postprocess_010_cuda subroutine enforce_periodicity_y_cuda(self, f_out, f_in) + !! Enforce periodic extension in Y for non-periodic boundaries. + !! + !! Extends field from physical domain size to doubled periodic domain + !! by symmetry (f(y+L) = f(L-y)) for non-periodic boundary FFTs. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(field_t), intent(inout) :: f_out !! Extended periodic field + class(field_t), intent(in) :: f_in !! Original physical field - real(dp), device, pointer, dimension(:, :, :) :: f_out_dev, f_in_dev - type(dim3) :: blocks, threads + real(dp), device, pointer, dimension(:, :, :) :: f_out_dev !! Output device data + real(dp), device, pointer, dimension(:, :, :) :: f_in_dev !! Input device data + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f_out) type is (cuda_field_t) @@ -569,14 +610,19 @@ subroutine enforce_periodicity_y_cuda(self, f_out, f_in) end subroutine enforce_periodicity_y_cuda subroutine undo_periodicity_y_cuda(self, f_out, f_in) + !! Extract physical domain from periodic extension in Y. + !! + !! Reverses enforce_periodicity_y by extracting original domain size + !! from doubled periodic field after inverse FFT. 
implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(field_t), intent(inout) :: f_out !! Physical domain field + class(field_t), intent(in) :: f_in !! Extended periodic field - real(dp), device, pointer, dimension(:, :, :) :: f_out_dev, f_in_dev - type(dim3) :: blocks, threads + real(dp), device, pointer, dimension(:, :, :) :: f_out_dev !! Output device data + real(dp), device, pointer, dimension(:, :, :) :: f_in_dev !! Input device data + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f_out) type is (cuda_field_t) diff --git a/src/backend/cuda/sendrecv.f90 b/src/backend/cuda/sendrecv.f90 index 4df5861d3..37ed2184e 100644 --- a/src/backend/cuda/sendrecv.f90 +++ b/src/backend/cuda/sendrecv.f90 @@ -1,4 +1,18 @@ module m_cuda_sendrecv + !! MPI communication for CUDA backend using device pointers. + !! + !! Passes device pointers directly to MPI calls. With GPU-aware MPI + !! implementations (e.g., OpenMPI with CUDA support, MVAPICH2-GDR), + !! data transfers directly between GPU memories without staging through + !! host, reducing latency and increasing bandwidth. + !! + !! Without GPU-aware MPI, the implementation may stage through host + !! memory automatically, still functional but with additional overhead. + !! + !! - `sendrecv_fields`: Single field halo exchange + !! - `sendrecv_3fields`: Batch exchange for three fields (velocity components + !! or derivatives). Batching amortises MPI overhead and enables better + !! network utilisation. use cudafor use mpi @@ -10,11 +24,21 @@ module m_cuda_sendrecv subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & n_data, nproc, prev, next) + !! Exchange boundary halos using MPI with device pointers. + !! + !! MPI_Isend/Irecv allows all four communications (send to prev/next, + !! receive from prev/next) to proceed concurrently, enabling network + !! pipelining. 
MPI_Waitall synchronises only when results needed. + !! + !! When nproc=1, data copied directly on device without MPI. implicit none - real(dp), device, dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e - real(dp), device, dimension(:, :, :), intent(in) :: f_send_s, f_send_e - integer, intent(in) :: n_data, nproc, prev, next + real(dp), device, dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e !! Device receive buffers + real(dp), device, dimension(:, :, :), intent(in) :: f_send_s, f_send_e !! Device send buffers + integer, intent(in) :: n_data !! Number of data elements + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: prev !! Previous neighbour rank + integer, intent(in) :: next !! Next neighbour rank integer :: req(4), err(4), ierr, tag = 1234 @@ -41,13 +65,22 @@ subroutine sendrecv_3fields( & f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e, & n_data, nproc, prev, next & ) + !! Exchange three fields simultaneously using batched MPI communication. + !! + !! Used for: (1) velocity component halos (u, v, w) before computing transport + !! equation, (2) derivative field halos (du, dud, d2u) in distributed compact + !! schemes. Batching all three fields amortises MPI setup overhead. Single + !! MPI_Waitall for all 12 operations reduces synchronisation points. implicit none real(dp), device, dimension(:, :, :), intent(out) :: & - f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e + f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e !! Device receive buffers real(dp), device, dimension(:, :, :), intent(in) :: & - f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e - integer, intent(in) :: n_data, nproc, prev, next + f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e !! Device send buffers + integer, intent(in) :: n_data !! Number of data elements per field + integer, intent(in) :: nproc !! Number of processes + integer, intent(in) :: prev !! 
Previous neighbour rank + integer, intent(in) :: next !! Next neighbour rank integer :: req(12), err(12), ierr, tag = 1234 diff --git a/src/backend/cuda/tdsops.f90 b/src/backend/cuda/tdsops.f90 index b14cf5614..2ce9e47e8 100644 --- a/src/backend/cuda/tdsops.f90 +++ b/src/backend/cuda/tdsops.f90 @@ -1,4 +1,9 @@ module m_cuda_tdsops + !! GPU-resident tridiagonal operator coefficients. + !! + !! Extends base `tdsops_t` with device memory copies of all coefficient + !! arrays. One-time upload to GPU avoids repeated host-device transfers + !! during kernel execution, critical for performance. use iso_fortran_env, only: stderr => error_unit use m_common, only: dp @@ -7,18 +12,15 @@ module m_cuda_tdsops implicit none type, extends(tdsops_t) :: cuda_tdsops_t - !! CUDA extension of the Tridiagonal Solver Operators class. - !! - !! Regular tdsops_t class is initiated and the coefficient arrays are - !! copied into device arrays so that cuda kernels can use them. + !! Tridiagonal operators with device-resident coefficients. real(dp), device, allocatable :: dist_fw_dev(:), dist_bw_dev(:), & dist_sa_dev(:), dist_sc_dev(:), & - dist_af_dev(:) + dist_af_dev(:) !! Distributed compact scheme coefficients real(dp), device, allocatable :: thom_f_dev(:), thom_s_dev(:), & - thom_w_dev(:), thom_p_dev(:) - real(dp), device, allocatable :: stretch_dev(:), stretch_correct_dev(:) + thom_w_dev(:), thom_p_dev(:) !! Thomas algorithm coefficients + real(dp), device, allocatable :: stretch_dev(:), stretch_correct_dev(:) !! Grid stretching factors real(dp), device, allocatable :: coeffs_dev(:), & - coeffs_s_dev(:, :), coeffs_e_dev(:, :) + coeffs_s_dev(:, :), coeffs_e_dev(:, :) !! Finite difference stencils contains end type cuda_tdsops_t @@ -32,11 +34,13 @@ function cuda_tdsops_init( & n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) result(tdsops) - !! Constructor function for the cuda_tdsops_t class. - !! 
See tdsops_t for details. + !! Initialise tridiagonal operators and upload to GPU. + !! + !! Computes coefficients on CPU via base tdsops_init, then copies + !! to device arrays for kernel access. See tdsops_t for parameters. implicit none - type(cuda_tdsops_t) :: tdsops !! return value of the function + type(cuda_tdsops_t) :: tdsops integer, intent(in) :: n_tds real(dp), intent(in) :: delta diff --git a/src/backend/omp/backend.f90 b/src/backend/omp/backend.f90 index 370108a5a..5a812b16f 100644 --- a/src/backend/omp/backend.f90 +++ b/src/backend/omp/backend.f90 @@ -1,4 +1,39 @@ module m_omp_backend + !! OpenMP/CPU backend implementation for X3D2 solver operations. + !! + !! This module provides the CPU-based backend using OpenMP for shared-memory + !! parallelism and MPI for distributed-memory parallelism. It implements all + !! abstract backend operations defined in `base_backend_t`. + !! + !! **Parallelisation Strategy:** + !! + !! - **MPI**: Domain decomposition across nodes/processes + !! - **OpenMP**: Thread parallelism within each MPI rank + !! - **Hybrid MPI+OpenMP**: Enables efficient use of multi-core clusters + !! + !! **Key Features:** + !! + !! - Compact finite difference operators (tridiagonal solves) + !! - Halo exchange for distributed derivatives + !! - FFT-based Poisson solver integration + !! - Vectorised array operations + !! - Optimised data reordering between decomposition directions + !! + !! **Memory Management:** + !! + !! - Send/receive buffers for MPI halo exchange (`u`, `v`, `w`, `du`, `dud`, `d2u`) + !! - Buffers sized based on largest decomposition direction + !! - Persistent buffers to avoid repeated allocation + !! + !! **Solver Operations:** + !! + !! - `transeq`: Transport equation terms with halo exchange + !! - `tds_solve`: Tridiagonal system solves (Thomas algorithm) + !! - `reorder`: Data layout transformations (`DIR_X`, `DIR_Y`, `DIR_Z`) + !! - Field operations: copy, add, multiply, integrate, etc. + !! + !! 
**Note:** This backend requires 2DECOMP&FFT library for FFT operations + !! when using the spectral Poisson solver. use mpi use m_allocator, only: allocator_t @@ -20,37 +55,53 @@ module m_omp_backend private :: transeq_halo_exchange, transeq_dist_component type, extends(base_backend_t) :: omp_backend_t - !character(len=*), parameter :: name = 'omp' + !! OpenMP/CPU backend for solver operations. + !! + !! Extends `base_backend_t` with CPU-specific implementations using + !! OpenMP for threading and MPI for distributed parallelism. Maintains + !! communication buffers for halo exchange operations. + !! + !! **Communication Buffers:** + !! Arrays sized (SZ, n_halo, n_groups) where: + !! - SZ: maximum pencil size for data reordering + !! - n_halo: halo region depth (typically 4 for compact schemes) + !! - n_groups: maximum number of groups across all directions + !! + !! Buffer naming convention: {field}_{send/recv}_{s/e} + !! - field: u, v, w (velocity), du, dud, d2u (derivatives) + !! - send/recv: data direction + !! - s/e: start/end of domain (neighbouring ranks) + !character(len=*), parameter :: name = 'omp' !! Backend identifier real(dp), allocatable, dimension(:, :, :) :: & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - v_recv_s, v_recv_e, v_send_s, v_send_e, & - w_recv_s, w_recv_e, w_send_s, w_send_e, & - du_send_s, du_send_e, du_recv_s, du_recv_e, & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + u_recv_s, u_recv_e, u_send_s, u_send_e, & !! Velocity u halo buffers + v_recv_s, v_recv_e, v_send_s, v_send_e, & !! Velocity v halo buffers + w_recv_s, w_recv_e, w_send_s, w_send_e, & !! Velocity w halo buffers + du_send_s, du_send_e, du_recv_s, du_recv_e, & !! First derivative buffers + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & !! Mixed derivative buffers + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e !! 
Second derivative buffers contains - procedure :: alloc_tdsops => alloc_omp_tdsops - procedure :: transeq_x => transeq_x_omp - procedure :: transeq_y => transeq_y_omp - procedure :: transeq_z => transeq_z_omp - procedure :: transeq_species => transeq_species_omp - procedure :: tds_solve => tds_solve_omp - procedure :: reorder => reorder_omp - procedure :: sum_yintox => sum_yintox_omp - procedure :: sum_zintox => sum_zintox_omp - procedure :: veccopy => veccopy_omp - procedure :: vecadd => vecadd_omp - procedure :: vecmult => vecmult_omp - procedure :: scalar_product => scalar_product_omp - procedure :: field_max_mean => field_max_mean_omp - procedure :: field_scale => field_scale_omp - procedure :: field_shift => field_shift_omp - procedure :: field_set_face => field_set_face_omp - procedure :: field_volume_integral => field_volume_integral_omp - procedure :: copy_data_to_f => copy_data_to_f_omp - procedure :: copy_f_to_data => copy_f_to_data_omp - procedure :: init_poisson_fft => init_omp_poisson_fft - procedure :: transeq_omp_dist + procedure :: alloc_tdsops => alloc_omp_tdsops !! Allocate tridiagonal operators + procedure :: transeq_x => transeq_x_omp !! Transport equation in X + procedure :: transeq_y => transeq_y_omp !! Transport equation in Y + procedure :: transeq_z => transeq_z_omp !! Transport equation in Z + procedure :: transeq_species => transeq_species_omp !! Transport for species/scalars + procedure :: tds_solve => tds_solve_omp !! Tridiagonal solve + procedure :: reorder => reorder_omp !! Data reordering + procedure :: sum_yintox => sum_yintox_omp !! Sum Y data into X + procedure :: sum_zintox => sum_zintox_omp !! Sum Z data into X + procedure :: veccopy => veccopy_omp !! Vector copy + procedure :: vecadd => vecadd_omp !! Vector add + procedure :: vecmult => vecmult_omp !! Vector multiply + procedure :: scalar_product => scalar_product_omp !! Scalar product + procedure :: field_max_mean => field_max_mean_omp !! 
Compute max and mean + procedure :: field_scale => field_scale_omp !! Scale field + procedure :: field_shift => field_shift_omp !! Shift field values + procedure :: field_set_face => field_set_face_omp !! Set face values + procedure :: field_volume_integral => field_volume_integral_omp !! Volume integral + procedure :: copy_data_to_f => copy_data_to_f_omp !! Copy data to field + procedure :: copy_f_to_data => copy_f_to_data_omp !! Copy field to data + procedure :: init_poisson_fft => init_omp_poisson_fft !! Initialise FFT Poisson + procedure :: transeq_omp_dist !! Distributed transeq (internal) end type omp_backend_t interface omp_backend_t @@ -60,11 +111,21 @@ module m_omp_backend contains function init(mesh, allocator) result(backend) + !! Initialise OpenMP backend with mesh and allocator. + !! + !! Sets up the backend by: + !! 1. Calling base initialisation + !! 2. Linking mesh and allocator + !! 3. Determining maximum number of groups across directions + !! 4. Allocating communication buffers for halo exchange + !! + !! **Buffer Sizing:** Buffers are sized based on the largest decomposition + !! direction to handle all reordering operations efficiently. implicit none - type(mesh_t), target, intent(inout) :: mesh - class(allocator_t), target, intent(inout) :: allocator - type(omp_backend_t) :: backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + class(allocator_t), target, intent(inout) :: allocator !! Memory allocator + type(omp_backend_t) :: backend !! Initialised backend instance integer :: n_groups @@ -113,19 +174,24 @@ subroutine alloc_omp_tdsops( & self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) + !! Allocate and initialise tridiagonal operator for OMP backend. + !! + !! Creates a `tdsops_t` object configured for the specified operation + !! (derivative, interpolation) with chosen compact scheme and boundary + !! conditions. 
Handles grid stretching and viscous corrections. implicit none - class(omp_backend_t) :: self - class(tdsops_t), allocatable, intent(inout) :: tdsops - integer, intent(in) :: n_tds - real(dp), intent(in) :: delta - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end - real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) - integer, optional, intent(in) :: n_halo - character(*), optional, intent(in) :: from_to - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(omp_backend_t) :: self !! Backend instance + class(tdsops_t), allocatable, intent(inout) :: tdsops !! Tridiagonal operator to allocate + integer, intent(in) :: n_tds !! Number of points in direction + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation, scheme !! Operation type and scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary condition codes + real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) !! Grid stretching + integer, optional, intent(in) :: n_halo !! Halo depth + character(*), optional, intent(in) :: from_to !! Data location transition + logical, optional, intent(in) :: sym !! Symmetry flag + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Viscous correction parameters allocate (tdsops_t :: tdsops) @@ -139,26 +205,40 @@ subroutine alloc_omp_tdsops( & end subroutine alloc_omp_tdsops subroutine transeq_x_omp(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation RHS in X direction. + !! + !! Evaluates convection and diffusion terms for momentum equations: + !! \( du/dt = -u \cdot \nabla u + \nu \nabla^2 u \) + !! + !! Delegates to `transeq_omp_dist` which handles halo exchange and + !! distributed compact schemes. 
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: du, dv, dw
- class(field_t), intent(in) :: u, v, w
- real(dp), intent(in) :: nu
- type(dirps_t), intent(in) :: dirps
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: du, dv, dw !! Output: velocity RHS
+ class(field_t), intent(in) :: u, v, w !! Input: velocity fields
+ real(dp), intent(in) :: nu !! Kinematic viscosity
+ type(dirps_t), intent(in) :: dirps !! Directional operators

call self%transeq_omp_dist(du, dv, dw, u, v, w, nu, dirps)

end subroutine transeq_x_omp

subroutine transeq_y_omp(self, du, dv, dw, u, v, w, nu, dirps)
+ !! Compute transport equation RHS in Y direction.
+ !!
+ !! Calculates convective and viscous terms for Y-pencil decomposition.
+ !! Velocity components are reordered (v, u, w) to align primary
+ !! direction with pencil orientation before calling distributed kernel.
+ !!
+ !! See [[transeq_x_omp]] for transport equation formulation.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: du, dv, dw
- class(field_t), intent(in) :: u, v, w
- real(dp), intent(in) :: nu
- type(dirps_t), intent(in) :: dirps
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output)
+ class(field_t), intent(in) :: u, v, w !! Velocity components
+ real(dp), intent(in) :: nu !! Kinematic viscosity
+ type(dirps_t), intent(in) :: dirps !! Directional operators

! u, v, w is reordered so that we pass v, u, w
call self%transeq_omp_dist(dv, du, dw, v, u, w, nu, dirps)
@@ -166,13 +246,20 @@ subroutine transeq_y_omp(self, du, dv, dw, u, v, w, nu, dirps)
end subroutine transeq_y_omp

subroutine transeq_z_omp(self, du, dv, dw, u, v, w, nu, dirps)
+ !! Compute transport equation RHS in Z direction.
+ !!
+ !! Calculates convective and viscous terms for Z-pencil decomposition.
+ !! Velocity components are reordered (w, u, v) to align primary
+ !! 
direction with pencil orientation before calling distributed kernel.
+ !!
+ !! See [[transeq_x_omp]] for transport equation formulation.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: du, dv, dw
- class(field_t), intent(in) :: u, v, w
- real(dp), intent(in) :: nu
- type(dirps_t), intent(in) :: dirps
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output)
+ class(field_t), intent(in) :: u, v, w !! Velocity components
+ real(dp), intent(in) :: nu !! Kinematic viscosity
+ type(dirps_t), intent(in) :: dirps !! Directional operators

! u, v, w is reordered so that we pass w, u, v
call self%transeq_omp_dist(dw, du, dv, w, u, v, nu, dirps)
@@ -180,18 +267,24 @@ subroutine transeq_z_omp(self, du, dv, dw, u, v, w, nu, dirps)
end subroutine transeq_z_omp

subroutine transeq_species_omp(self, dspec, uvw, spec, nu, dirps, sync)
- !! Compute the convection and diffusion for the given field
- !! in the given direction.
- !! Halo exchange for the given field is necessary
- !! When sync is true, halo exchange of momentum is necessary
+ !! Compute transport equation RHS for scalar species.
+ !!
+ !! Calculates convective and diffusive terms for a passive scalar
+ !! (temperature, concentration, etc.) transported by velocity field.
+ !!
+ !! **Equation:** `$\partial\phi/\partial t = -\mathbf{u}\cdot\nabla\phi + \nu\nabla^2\phi$` where $\phi$ is the scalar species.
+ !!
+ !! **Synchronisation:** When `sync=.true.`, performs halo exchange
+ !! for velocity field before computation. Always exchanges scalar halos.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: dspec
- class(field_t), intent(in) :: uvw, spec
- real(dp), intent(in) :: nu
- type(dirps_t), intent(in) :: dirps
- logical, intent(in) :: sync
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: dspec !! 
Time derivative of species (output) + class(field_t), intent(in) :: uvw !! Velocity component in pencil direction + class(field_t), intent(in) :: spec !! Species concentration/temperature + real(dp), intent(in) :: nu !! Diffusivity coefficient + type(dirps_t), intent(in) :: dirps !! Spectral operators + logical, intent(in) :: sync !! Perform velocity halo exchange if true integer :: n_groups @@ -229,13 +322,21 @@ subroutine transeq_species_omp(self, dspec, uvw, spec, nu, dirps, sync) end subroutine transeq_species_omp subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, nu, dirps) + !! Internal: Distributed transport equation implementation. + !! + !! Orchestrates the complete transport equation calculation for + !! all velocity components. First performs halo exchange for + !! distributed compact derivatives, then computes each component's + !! RHS using transeq_dist_component. + !! + !! **Called by:** transeq_x/y/z_omp after velocity reordering implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output) + class(field_t), intent(in) :: u, v, w !! Velocity components (reordered for pencil direction) + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Spectral operators call transeq_halo_exchange(self, u, v, w, dirps%dir) @@ -258,9 +359,17 @@ subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_omp_dist subroutine transeq_halo_exchange(self, u, v, w, dir) - class(omp_backend_t) :: self - class(field_t), intent(in) :: u, v, w - integer, intent(in) :: dir + !! Internal: Perform halo exchange for all velocity components. + !! + !! Exchanges 4-point halos between neighbouring MPI processes for + !! 
distributed compact finite difference stencils. Copies boundary + !! data into send buffers, performs MPI sendrecv, stores in receive + !! buffers for use in derivative calculations. + !! + !! **Operation:** Copy to buffers $\rightarrow$ MPI_Sendrecv $\rightarrow$ Store halos + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: u, v, w !! Velocity components + integer, intent(in) :: dir !! Communication direction integer :: n, nproc_dir, pprev, pnext integer :: n_groups @@ -296,20 +405,27 @@ subroutine transeq_dist_component(self, rhs_du, u, conv, nu, & u_recv_s, u_recv_e, & conv_recv_s, conv_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, dir) - !! Computes RHS_x^u following: + !! Internal: Compute single component of transport equation RHS. + !! + !! Calculates RHS for one velocity component using skew-symmetric form: + !! + !! **Formula:** `rhs = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2` + !! + !! Uses distributed compact FD kernels with halo data from neighbours. + !! Allocates temporary storage for derivatives and releases after use. !! - !! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2 - class(omp_backend_t) :: self + !! **Skew-symmetric:** Reduces aliasing errors in nonlinear convection. + class(omp_backend_t) :: self !! Backend instance !> The result field, it is also used as temporary storage - class(field_t), intent(inout) :: rhs_du - class(field_t), intent(in) :: u, conv - real(dp), intent(in) :: nu + class(field_t), intent(inout) :: rhs_du !! RHS output (also temp storage) + class(field_t), intent(in) :: u, conv !! Velocity component and convecting velocity + real(dp), intent(in) :: nu !! Kinematic viscosity real(dp), dimension(:, :, :), intent(in) :: u_recv_s, u_recv_e, & - conv_recv_s, conv_recv_e - class(tdsops_t), intent(in) :: tdsops_du - class(tdsops_t), intent(in) :: tdsops_dud - class(tdsops_t), intent(in) :: tdsops_d2u - integer, intent(in) :: dir + conv_recv_s, conv_recv_e !! 
Halo data from neighbours + class(tdsops_t), intent(in) :: tdsops_du !! First derivative operator + class(tdsops_t), intent(in) :: tdsops_dud !! Product derivative operator + class(tdsops_t), intent(in) :: tdsops_d2u !! Second derivative operator + integer, intent(in) :: dir !! Direction index class(field_t), pointer :: d2u, dud dud => self%allocator%get_block(dir) @@ -334,12 +450,20 @@ subroutine transeq_dist_component(self, rhs_du, u, conv, nu, & end subroutine transeq_dist_component subroutine tds_solve_omp(self, du, u, tdsops) + !! Solve tridiagonal system for compact finite difference operation. + !! + !! Applies compact scheme operator to field using Thomas algorithm. + !! Handles both local (single-process) and distributed (multi-process) + !! solves depending on decomposition configuration. + !! + !! **Data Location:** Updates output data location based on operator's + !! `move` specification (e.g., CELL to VERT for interpolation). implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du !! Output field + class(field_t), intent(in) :: u !! Input field + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator ! Check if direction matches for both in/out fields if (u%dir /= du%dir) then @@ -355,12 +479,22 @@ subroutine tds_solve_omp(self, du, u, tdsops) end subroutine tds_solve_omp subroutine tds_solve_dist(self, du, u, tdsops) + !! Internal: Distributed tridiagonal solve with halo exchange. + !! + !! Solves compact finite difference system across multiple MPI processes. + !! Performs halo exchange before calling distributed Thomas algorithm + !! kernel. Used when domain decomposition splits the pencil direction. + !! + !! **Algorithm:** + !! 1. Copy boundary data into send buffers + !! 2. MPI_Sendrecv for halo exchange + !! 3. 
Distributed Thomas algorithm with boundary coupling
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: du
- class(field_t), intent(in) :: u
- class(tdsops_t), intent(in) :: tdsops
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: du !! Solution field (output)
+ class(field_t), intent(in) :: u !! RHS field
+ class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator

integer :: n_groups, dir

dir = u%dir
@@ -387,12 +521,24 @@ subroutine tds_solve_dist(self, du, u, tdsops)
end subroutine tds_solve_dist

subroutine reorder_omp(self, u_, u, direction)
+ !! Reorder field data between different pencil decompositions.
+ !!
+ !! Transforms field layout from one decomposition direction to another
+ !! (e.g., X-pencils to Y-pencils). Data is rearranged locally via
+ !! index mapping between the two pencil layouts.
+ !!
+ !! **Directions:** DIR_X, DIR_Y, DIR_Z specify pencil orientations.
+ !! Each pencil is contiguous along its direction and distributed in
+ !! the other two dimensions.
+ !!
+ !! **Performance:** Critical operation for multi-dimensional algorithms.
+ !! Uses `get_index_reordering` for efficient cache-friendly reordering.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: u_
- class(field_t), intent(in) :: u
- integer, intent(in) :: direction
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: u_ !! Output field (reordered)
+ class(field_t), intent(in) :: u !! Input field
+ integer, intent(in) :: direction !! Reordering direction code

integer, dimension(3) :: dims, cart_padded
integer :: i, j, k
integer :: out_i, out_j, out_k
@@ -420,33 +566,50 @@ subroutine reorder_omp(self, u_, u, direction)
end subroutine reorder_omp

subroutine sum_yintox_omp(self, u, u_)
+ !! Sum Y-pencils into X-pencils through reordering.
+ !!
+ !! Performs directional reduction by reordering from Y to X pencils
+ !! 
and summing the result into the destination field. Used to
+ !! accumulate contributions computed in the Y-pencil layout.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: u
- class(field_t), intent(in) :: u_
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: u !! Destination field (X-pencils, accumulates result)
+ class(field_t), intent(in) :: u_ !! Source field (Y-pencils)

call sum_intox_omp(self, u, u_, DIR_Y)

end subroutine sum_yintox_omp

subroutine sum_zintox_omp(self, u, u_)
+ !! Sum Z-pencils into X-pencils through reordering.
+ !!
+ !! Performs directional reduction by reordering from Z to X pencils
+ !! and summing the result into the destination field. Used to
+ !! accumulate contributions computed in the Z-pencil layout.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: u
- class(field_t), intent(in) :: u_
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: u !! Destination field (X-pencils, accumulates result)
+ class(field_t), intent(in) :: u_ !! Source field (Z-pencils)

call sum_intox_omp(self, u, u_, DIR_Z)

end subroutine sum_zintox_omp

subroutine sum_intox_omp(self, u, u_, dir_to)
+ !! Internal helper: Sum reordered field into X-pencils.
+ !!
+ !! Reorders the source field from the specified direction into
+ !! X-pencils, then accumulates into the destination field. Called by
+ !! sum_yintox_omp and sum_zintox_omp to accumulate contributions.
+ !!
+ !! **Algorithm:** Reorder with index mapping, accumulate with +=
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: u
- class(field_t), intent(in) :: u_
- integer, intent(in) :: dir_to
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: u !! Destination field (accumulates result)
+ class(field_t), intent(in) :: u_ !! Source field
+ integer, intent(in) :: dir_to !! 
Target direction (DIR_Y or DIR_Z) integer :: dir_from integer, dimension(3) :: dims, cart_padded @@ -473,12 +636,16 @@ subroutine sum_intox_omp(self, u, u_, dir_to) end subroutine sum_intox_omp subroutine veccopy_omp(self, dst, src) + !! Copy field data from source to destination. + !! + !! Element-wise copy with OpenMP parallelisation. Both fields + !! must have the same decomposition direction and dimensions. implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: dst - class(field_t), intent(in) :: src - integer :: i, j, k + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: dst !! Destination field + class(field_t), intent(in) :: src !! Source field + integer :: i, j, k !! Loop indices if (src%dir /= dst%dir) then error stop "Called vector copy with incompatible fields" @@ -501,13 +668,19 @@ subroutine veccopy_omp(self, dst, src) end subroutine veccopy_omp subroutine vecadd_omp(self, a, x, b, y) + !! Vector addition: y = a*x + b*y (in-place AXPBY). + !! + !! Scaled in-place vector addition with OpenMP parallelisation + !! and SIMD vectorisation. Implements the BLAS AXPBY operation. + !! + !! **Formula:** `y := a*x + b*y` where a, b are scalars. implicit none - class(omp_backend_t) :: self - real(dp), intent(in) :: a - class(field_t), intent(in) :: x - real(dp), intent(in) :: b - class(field_t), intent(inout) :: y + class(omp_backend_t) :: self !! Backend instance + real(dp), intent(in) :: a !! Scalar multiplier for x + class(field_t), intent(in) :: x !! First input field + real(dp), intent(in) :: b !! Scalar multiplier for y + class(field_t), intent(inout) :: y !! Second input field (overwritten with result) integer :: i, j, k if (x%dir /= y%dir) then @@ -531,13 +704,18 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp subroutine vecmult_omp(self, y, x) + !! Element-wise multiplication: y = y * x (in-place). + !! + !! In-place element-wise multiplication with OpenMP parallelisation + !! 
and SIMD vectorisation. Often used for applying masks or + !! multiplying solution components. !! [[m_base_backend(module):vecmult(interface)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: y - class(field_t), intent(in) :: x - integer :: i, j, k + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: y !! Field to multiply and store result + class(field_t), intent(in) :: x !! Multiplier field + integer :: i, j, k !! Loop indices if (x%dir /= y%dir) then error stop "Called vector multiply with incompatible fields" @@ -560,16 +738,24 @@ subroutine vecmult_omp(self, y, x) end subroutine vecmult_omp real(dp) function scalar_product_omp(self, x, y) result(s) + !! Compute global scalar product (dot product) of two fields. + !! + !! Calculates the dot product $\sum(x_i \times y_i)$ across all grid points + !! and all MPI processes. Uses OpenMP parallelisation with reduction + !! and MPI_Allreduce for global sum. + !! + !! **Algorithm:** Local parallel reduction $\rightarrow$ MPI_Allreduce + !! **Data location:** Both fields must be at the same location (CELL/VERT). !! [[m_base_backend(module):scalar_product(interface)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: x, y - class(field_t), pointer :: x_, y_ - integer, dimension(3) :: dims - integer :: i, j, k, ii - integer :: nvec, remstart - integer :: ierr + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: x, y !! Input fields + class(field_t), pointer :: x_, y_ !! Pointers for data access + integer, dimension(3) :: dims !! Field dimensions + integer :: i, j, k, ii !! Loop indices + integer :: nvec, remstart !! Vectorisation variables + integer :: ierr !! MPI error code if ((x%data_loc == NULL_LOC) .or. 
(y%data_loc == NULL_LOC)) then error stop "You must set the data_loc before calling scalar product" @@ -623,14 +809,22 @@ real(dp) function scalar_product_omp(self, x, y) result(s) end function scalar_product_omp subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_groups) + !! Internal helper: Copy halo data into send buffers. + !! + !! Extracts 4-point halos from start and end of domain for + !! MPI communication. Used in transeq_halo_exchange to prepare + !! boundary data for neighbour processes. + !! + !! **Buffer layout:** (SZ, 4, n_groups) for cache efficiency implicit none - real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e - real(dp), dimension(:, :, :), intent(in) :: u - integer, intent(in) :: n - integer, intent(in) :: n_groups - integer :: i, j, k - integer :: n_halo = 4 + real(dp), dimension(:, :, :), intent(out) :: u_send_s !! Send buffer for start boundary + real(dp), dimension(:, :, :), intent(out) :: u_send_e !! Send buffer for end boundary + real(dp), dimension(:, :, :), intent(in) :: u !! Field data + integer, intent(in) :: n !! Domain size in communication direction + integer, intent(in) :: n_groups !! Number of pencil groups + integer :: i, j, k !! Loop indices + integer :: n_halo = 4 !! Halo width (compact scheme stencil) !$omp parallel do do k = 1, n_groups @@ -648,13 +842,25 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_groups) end subroutine copy_into_buffers subroutine field_max_mean_omp(self, max_val, mean_val, f, enforced_data_loc) + !! Compute global maximum and mean of a field. + !! + !! Calculates maximum and mean values across all grid points and + !! MPI processes. Uses data location (CELL/VERT) to determine + !! valid domain extents, excluding padding and ghost cells. + !! + !! **Algorithm:** + !! 1. Local parallel max/sum reduction with OpenMP + !! 2. MPI_Allreduce for global max/sum + !! 3. Mean = global_sum / global_count + !! + !! **Data location:** Can be enforced or read from field metadata. !! 
[[m_base_backend(module):field_max_mean(interface)]] implicit none - class(omp_backend_t) :: self - real(dp), intent(out) :: max_val, mean_val - class(field_t), intent(in) :: f - integer, optional, intent(in) :: enforced_data_loc + class(omp_backend_t) :: self !! Backend instance + real(dp), intent(out) :: max_val, mean_val !! Global maximum and mean values + class(field_t), intent(in) :: f !! Input field + integer, optional, intent(in) :: enforced_data_loc !! Override data location if provided real(dp) :: val, max_p, sum_p, max_pncl, sum_pncl integer :: data_loc, dims(3), dims_padded(3), n, n_i, n_i_pad, n_j @@ -721,33 +927,48 @@ subroutine field_max_mean_omp(self, max_val, mean_val, f, enforced_data_loc) end subroutine field_max_mean_omp subroutine field_scale_omp(self, f, a) + !! Scale field by constant: f = a * f. + !! + !! Multiplies all field values by scalar a in-place. + !! Uses Fortran array syntax for simplicity. implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to scale (modified in-place) + real(dp), intent(in) :: a !! Scaling factor f%data = a*f%data end subroutine field_scale_omp subroutine field_shift_omp(self, f, a) + !! Shift field by constant: f = f + a. + !! + !! Adds scalar a to all field values in-place. + !! Uses Fortran array syntax for simplicity. implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to shift (modified in-place) + real(dp), intent(in) :: a !! Shift amount f%data = f%data + a end subroutine field_shift_omp subroutine field_set_face_omp(self, f, c_start, c_end, face) + !! Set boundary face values to specified constants. + !! + !! Sets values on a specified domain face (X/Y/Z start/end) + !! to given constants. 
Used for boundary condition enforcement. + !! + !! **Faces:** VERT_START_FACE, VERT_END_FACE, etc. !! [[m_base_backend(module):field_set_face(subroutine)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: f - real(dp), intent(in) :: c_start, c_end - integer, intent(in) :: face + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: f !! Field to modify + real(dp), intent(in) :: c_start !! Value for start side of face + real(dp), intent(in) :: c_end !! Value for end side of face + integer, intent(in) :: face !! Face identifier constant integer :: dims(3), k, j, i_mod, k_end @@ -784,11 +1005,18 @@ subroutine field_set_face_omp(self, f, c_start, c_end, face) end subroutine field_set_face_omp real(dp) function field_volume_integral_omp(self, f) result(s) - !! volume integral of a field + !! Compute volume integral of field over domain. + !! + !! Calculates $\int f \,dV$ by summing all field values (at cell centres) + !! and multiplying by grid cell volumes. Uses MPI_Allreduce for + !! global sum across all processes. + !! + !! **Formula:** $\int f \,dV = \sum(f_i \times \Delta V_i)$ where $\Delta V$ from mesh + !! **Assumption:** Field at cell centres (data_loc = CELL) implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to integrate real(dp) :: sum_p, sum_pncl integer :: dims(3), stacked, i, j, k, k_i, k_j, ierr @@ -829,32 +1057,48 @@ real(dp) function field_volume_integral_omp(self, f) result(s) end function field_volume_integral_omp subroutine copy_data_to_f_omp(self, f, data) - class(omp_backend_t), intent(inout) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(in) :: data + !! Copy raw array into field structure. + !! + !! Simple wrapper for field initialisation from external data. + !! Uses Fortran array assignment for efficiency. 
+ class(omp_backend_t), intent(inout) :: self !! Backend instance + class(field_t), intent(inout) :: f !! Target field + real(dp), dimension(:, :, :), intent(in) :: data !! Source data array f%data = data end subroutine copy_data_to_f_omp subroutine copy_f_to_data_omp(self, data, f) - class(omp_backend_t), intent(inout) :: self - real(dp), dimension(:, :, :), intent(out) :: data - class(field_t), intent(in) :: f + !! Copy field structure into raw array. + !! + !! Simple wrapper for field extraction to external data. + !! Uses Fortran array assignment for efficiency. + class(omp_backend_t), intent(inout) :: self !! Backend instance + real(dp), dimension(:, :, :), intent(out) :: data !! Destination data array + class(field_t), intent(in) :: f !! Source field data = f%data end subroutine copy_f_to_data_omp subroutine init_omp_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise FFT-based Poisson solver for OMP backend. + !! + !! Creates and configures omp_poisson_fft_t solver for pressure + !! correction step. Uses 2DECOMP&FFT library for parallel FFTs + !! in pencil decomposition. + !! + !! **Requirement:** WITH_2DECOMPFFT must be defined at compile time. + !! **Low-memory mode:** Optional flag to reduce memory footprint. #ifdef WITH_2DECOMPFFT use m_omp_poisson_fft, only: omp_poisson_fft_t #endif implicit none - class(omp_backend_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + class(omp_backend_t) :: self !! Backend instance + type(mesh_t), intent(in) :: mesh !! Mesh with grid spacing + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Spectral operators for each direction + logical, optional, intent(in) :: lowmem !! 
Enable low-memory mode #ifdef WITH_2DECOMPFFT allocate (omp_poisson_fft_t :: self%poisson_fft) diff --git a/src/backend/omp/common.f90 b/src/backend/omp/common.f90 index 6d3df179a..b687782d8 100644 --- a/src/backend/omp/common.f90 +++ b/src/backend/omp/common.f90 @@ -1,6 +1,15 @@ module m_omp_common + !! Common constants for OpenMP backend implementation. + !! + !! Defines compile-time constants used throughout the OMP backend + !! for performance tuning and buffer sizing. + !! + !! **SZ (pencil size):** Maximum pencil dimension for data reordering + !! operations. Set to 16 for optimal cache utilisation and vectorisation + !! on typical CPU architectures. Larger values may improve performance + !! for very large problems but increase memory overhead. implicit none - integer, parameter :: SZ = 16 + integer, parameter :: SZ = 16 !! Maximum pencil size for reordering buffers end module m_omp_common diff --git a/src/backend/omp/exec_dist.f90 b/src/backend/omp/exec_dist.f90 index d1334512e..e56c3a61f 100644 --- a/src/backend/omp/exec_dist.f90 +++ b/src/backend/omp/exec_dist.f90 @@ -1,4 +1,23 @@ module m_omp_exec_dist + !! Distributed compact finite difference execution for OMP backend. + !! + !! Orchestrates parallel execution of distributed compact schemes across + !! MPI processes. Manages OpenMP threading, halo exchanges, forward/backward + !! sweeps, and boundary system solves for multi-process compact operators. + !! + !! **Key features:** + !! + !! - Forward/backward elimination with boundary coupling + !! - Non-blocking MPI communication for 2x2 boundary systems + !! - OpenMP parallelisation over pencil groups + !! - Fused kernels for transport equation efficiency + !! + !! **Distributed algorithm:** + !! + !! 1. Forward/backward sweep on local domain \(\rightarrow\) generate boundary systems + !! 2. MPI exchange boundary data between neighbours + !! 3. Solve coupled 2x2 systems at process interfaces + !! 4. 
Substitution sweep to complete solution use mpi use m_common, only: dp @@ -15,21 +34,34 @@ module m_omp_exec_dist subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & tdsops, nproc, pprev, pnext, n_groups) + !! Execute distributed compact finite difference operation. + !! + !! Applies compact scheme operator across multiple MPI processes using + !! distributed Thomas algorithm. Performs forward/backward elimination, + !! exchanges boundary systems via MPI, then completes with substitution. + !! + !! **Algorithm:** + !! 1. `der_univ_dist`: Forward/backward sweep $\rightarrow$ boundary 2x2 systems + !! 2. `sendrecv_fields`: MPI exchange boundary data with neighbours + !! 3. `der_univ_subs`: Solve boundaries $\rightarrow$ back-substitution + !! + !! **Parallelisation:** OpenMP over pencil groups, MPI across processes implicit none ! du = d(u) - real(dp), dimension(:, :, :), intent(out) :: du - real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e + real(dp), dimension(:, :, :), intent(out) :: du !! Derivative output + real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Field and halos ! The ones below are intent(out) just so that we can write data in them, ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care about the data they pass back real(dp), dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e !! Boundary system buffers (scratch) - type(tdsops_t), intent(in) :: tdsops - integer, intent(in) :: nproc, pprev, pnext - integer, intent(in) :: n_groups + type(tdsops_t), intent(in) :: tdsops !! Compact scheme operator + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: pprev, pnext !! Previous/next neighbour ranks + integer, intent(in) :: n_groups !! 
Number of pencil groups integer :: n_data integer :: k @@ -71,31 +103,49 @@ subroutine exec_dist_transeq_compact( & u, u_recv_s, u_recv_e, & v, v_recv_s, v_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, n_groups) + !! Execute distributed transport equation RHS calculation. + !! + !! Computes three compact derivative operations required for transport + !! equation in skew-symmetric form, then fuses final RHS assembly. + !! All three derivatives (du, d(u*v), d2u) computed in parallel with + !! single halo exchange pass. + !! + !! **Derivatives computed:** + !! - `du`: First derivative of u + !! - `dud`: First derivative of u*v (product computed locally with halos) + !! - `d2u`: Second derivative of u (viscous term) + !! + !! **Fused assembly:** Final RHS combines all three derivatives with + !! viscosity scaling in single kernel (der_univ_fused_subs). + !! + !! **Optimisation:** Product u*v computed on-the-fly to avoid storing + !! extra field. Reduces memory footprint. implicit none !> The result array, it is also used as temporary storage - real(dp), dimension(:, :, :), intent(out) :: rhs_du + real(dp), dimension(:, :, :), intent(out) :: rhs_du !! Transport equation RHS output !> Temporary storage arrays - real(dp), dimension(:, :, :), intent(out) :: dud, d2u + real(dp), dimension(:, :, :), intent(out) :: dud, d2u !! Product derivative and second derivative ! The ones below are intent(out) just so that we can write data in them, ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care about the data they pass back real(dp), dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e !! Boundary buffers for du (scratch) real(dp), dimension(:, :, :), intent(out) :: & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e !! 
Boundary buffers for dud (scratch) real(dp), dimension(:, :, :), intent(out) :: & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e !! Boundary buffers for d2u (scratch) - real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e + real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Velocity component and halos + real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e !! Convecting velocity and halos - type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - real(dp), intent(in) :: nu - integer, intent(in) :: nproc, pprev, pnext - integer, intent(in) :: n_groups + type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for each derivative + real(dp), intent(in) :: nu !! Kinematic viscosity + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: pprev, pnext !! Previous/next neighbour ranks + integer, intent(in) :: n_groups !! Number of pencil groups real(dp), dimension(:, :), allocatable :: ud, ud_recv_s, ud_recv_e diff --git a/src/backend/omp/exec_thom.f90 b/src/backend/omp/exec_thom.f90 index b1f0c6028..575da1f8b 100644 --- a/src/backend/omp/exec_thom.f90 +++ b/src/backend/omp/exec_thom.f90 @@ -1,4 +1,19 @@ module m_exec_thom + !! Local Thomas algorithm execution for OMP backend. + !! + !! Provides parallel execution of compact finite difference schemes using + !! standard Thomas algorithm (tridiagonal solver). Used when domain is not + !! decomposed in the derivative direction (all data local to process). + !! + !! **Two variants:** + !! + !! - **Non-periodic:** Standard Thomas with arbitrary boundary conditions + !! - **Periodic:** Modified Thomas for cyclic tridiagonal systems + !! + !! **Parallelisation:** OpenMP over pencil groups (no MPI needed) + !! + !! **Contrast with distributed:** exec_dist handles multi-process case, + !! 
this module handles single-process-per-direction case. use m_common, only: dp use m_tdsops, only: tdsops_t @@ -13,11 +28,22 @@ module m_exec_thom contains subroutine exec_thom_tds_compact(du, u, tdsops, n_groups) + !! Execute local Thomas algorithm for compact scheme. + !! + !! Applies compact finite difference operator using tridiagonal solver. + !! Chooses periodic or non-periodic variant based on operator configuration. + !! All computation local to process (no MPI communication). + !! + !! **Algorithm selection:** + !! - `periodic=.true.`: Sherman-Morrison formula for cyclic system + !! - `periodic=.false.`: Standard forward/backward Thomas algorithm + !! + !! **Parallelisation:** OpenMP parallel loop over pencil groups - real(dp), dimension(:, :, :), intent(out) :: du - real(dp), dimension(:, :, :), intent(in) :: u - type(tdsops_t), intent(in) :: tdsops - integer, intent(in) :: n_groups + real(dp), dimension(:, :, :), intent(out) :: du !! Derivative output + real(dp), dimension(:, :, :), intent(in) :: u !! Input field + type(tdsops_t), intent(in) :: tdsops !! Compact scheme operator + integer, intent(in) :: n_groups !! Number of pencil groups integer :: k diff --git a/src/backend/omp/kernels/distributed.f90 b/src/backend/omp/kernels/distributed.f90 index a02f39f8d..8d0b85866 100644 --- a/src/backend/omp/kernels/distributed.f90 +++ b/src/backend/omp/kernels/distributed.f90 @@ -1,4 +1,31 @@ module m_omp_kernels_dist + !! Distributed compact finite difference kernels for OpenMP backend. + !! + !! This module implements high-performance kernels for distributed compact + !! finite difference operators. These operators require halo exchange across + !! MPI ranks to compute derivatives near subdomain boundaries. + !! + !! **Key Features:** + !! + !! - 9-point stencil compact schemes (4th-6th order accuracy) + !! - Explicit vectorisation with OpenMP SIMD directives + !! - Near and far boundary treatments for non-periodic domains + !! 
- Forward and backward elimination phases for distributed solves + !! + !! **Kernels:** + !! + !! - `der_univ_dist`: Universal derivative (1st/2nd) with halo exchange + !! - `interpl_dist`: Interpolation from cell to vertices or vice versa + !! + !! **Distributed Algorithm:** + !! Compact schemes couple neighbouring points via implicit systems. + !! In distributed memory: + !! + !! 1. Near-boundary points use special coefficients incorporating halo data + !! 2. Interior points use standard bulk coefficients + !! 3. Modified Thomas algorithm handles cross-process dependencies + !! + !! **Performance:** Explicitly vectorized inner loops for SIMD execution. use omp_lib use m_common, only: dp @@ -12,15 +39,24 @@ subroutine der_univ_dist( & du, send_u_s, send_u_e, u, u_s, u_e, & n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, ffr, fbc, faf & ) + !! Compute distributed compact derivative (1st or 2nd order). + !! + !! Evaluates derivative using compact finite difference scheme across + !! distributed domain. Handles boundary points with halo data and applies + !! appropriate scaling factors. + !! + !! **Stencil:** 9-point compact scheme requiring 4-point halo on each side. + !! Near boundaries (first/last 4 points): use boundary-specific coefficients. + !! Interior: use uniform bulk coefficients for efficiency. implicit none ! Arguments - real(dp), intent(out), dimension(:, :) :: du, send_u_s, send_u_e - real(dp), intent(in), dimension(:, :) :: u, u_s, u_e - integer, intent(in) :: n_tds, n_rhs - real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e ! start/end - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in), dimension(:) :: ffr, fbc, faf + real(dp), intent(out), dimension(:, :) :: du, send_u_s, send_u_e !! Output derivative and send buffers + real(dp), intent(in), dimension(:, :) :: u, u_s, u_e !! Input field and halo data (start/end) + integer, intent(in) :: n_tds, n_rhs !! 
System sizes + real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Boundary coefficients + real(dp), intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients + real(dp), intent(in), dimension(:) :: ffr, fbc, faf !! Scaling factors ! Local variables integer :: i, j diff --git a/src/backend/omp/kernels/spectral_processing.f90 b/src/backend/omp/kernels/spectral_processing.f90 index 75d3bbd1a..fbc188928 100644 --- a/src/backend/omp/kernels/spectral_processing.f90 +++ b/src/backend/omp/kernels/spectral_processing.f90 @@ -1,4 +1,20 @@ module m_omp_spectral + !! Spectral space processing for FFT-based Poisson solver. + !! + !! Provides kernels for solving Poisson equation in Fourier space with + !! spectral equivalence transformations. Handles different boundary + !! condition combinations: fully periodic (000) and non-periodic in Y (010). + !! + !! **Spectral equivalence:** Modified wavenumbers for finite-difference + !! grid (Lele 1992). Ensures spectral solver matches compact FD schemes. + !! + !! **Reference:** JCP 228 (2009), 5989-6015, Section 4 + !! + !! **Processing steps:** + !! + !! 1. Forward spectral equivalence transform (physical \(\rightarrow\) modified wavenumbers) + !! 2. Solve: \(\hat{\phi}_k = -\hat{f}_k / k^2\) + !! 3. Backward spectral equivalence transform (modified wavenumbers \(\rightarrow\) physical) use m_common, only: dp implicit none @@ -8,22 +24,34 @@ subroutine process_spectral_000( & div_u, waves, nx_spec, ny_spec, nz_spec, x_sp_st, y_sp_st, z_sp_st, & nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-process div U* in spectral space for all periodic BCs. + !! Solve Poisson in spectral space for (0,0,0) boundary conditions. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Processes fully periodic case. Applies spectral equivalence transforms + !! in all three directions, divides by squared wavenumber, then applies + !! inverse transforms. + !! + !! **Algorithm:** + !! 1. Normalise by grid size (FFT convention) + !! 2. 
Forward spectral equivalence: physical $\rightarrow$ modified waves (Z, Y, X order) + !! 3. Solve: $\phi_k = -f_k / k^2$ (handle zero mode specially) + !! 4. Backward spectral equivalence: modified waves $\rightarrow$ physical + !! + !! **Special case:** Zero wavenumber (k=0) set to zero to remove constant mode. + !! + !! **Ref.** JCP 228 (2009), 5989–6015, Sec 4 implicit none !> Divergence of velocity in spectral space - complex(dp), intent(inout), dimension(:, :, :) :: div_u + complex(dp), intent(inout), dimension(:, :, :) :: div_u !! In: RHS, Out: Solution !> Spectral equivalence constants - complex(dp), intent(in), dimension(:, :, :) :: waves - real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), intent(in), dimension(:, :, :) :: waves !! Modified wavenumbers squared + real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Spectral equivalence coefficients !> Grid size in spectral space - integer, intent(in) :: nx_spec, ny_spec, nz_spec + integer, intent(in) :: nx_spec, ny_spec, nz_spec !! Local spectral dimensions !> Offsets in the permuted pencils in spectral space - integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st + integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st !! Global offsets !> Global cell size - integer, intent(in) :: nx, ny, nz + integer, intent(in) :: nx, ny, nz !! Global grid dimensions integer :: i, j, k, ix, iy, iz real(dp) :: tmp_r, tmp_c, div_r, div_c @@ -109,22 +137,37 @@ subroutine process_spectral_010( & div_u, waves, nx_spec, ny_spec, nz_spec, x_sp_st, y_sp_st, z_sp_st, & nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-process div U* in spectral space, for non-periodic BC in y-dir. + !! Solve Poisson in spectral space for (0,1,0) boundary conditions. + !! + !! Processes non-periodic in Y, periodic in X and Z. Uses sine series + !! in Y-direction (symmetry/antisymmetry transform) combined with + !! Fourier in X and Z. + !! + !! **Algorithm:** + !! 1. Normalise by grid size + !! 2. 
Forward spectral equivalence in Z and X (not Y, handled separately) + !! 3. Apply Y symmetry transform (combine left/right halves) + !! 4. Solve: $\phi_k = -f_k / k^2$ + !! 5. Inverse Y symmetry transform + !! 6. Backward spectral equivalence in X and Z + !! + !! **Y-direction:** Sine series requires special symmetric processing + !! to maintain real-valued solution with non-periodic BCs. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! **Ref.** JCP 228 (2009), 5989–6015, Sec 4 implicit none !> Divergence of velocity in spectral space - complex(dp), intent(inout), dimension(:, :, :) :: div_u + complex(dp), intent(inout), dimension(:, :, :) :: div_u !! In: RHS, Out: Solution !> Spectral equivalence constants - complex(dp), intent(in), dimension(:, :, :) :: waves - real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), intent(in), dimension(:, :, :) :: waves !! Modified wavenumbers squared + real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Spectral equivalence coefficients !> Grid size in spectral space - integer, intent(in) :: nx_spec, ny_spec, nz_spec + integer, intent(in) :: nx_spec, ny_spec, nz_spec !! Local spectral dimensions !> Offsets in the permuted pencils in spectral space - integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st + integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st !! Global offsets !> Global cell size - integer, intent(in) :: nx, ny, nz + integer, intent(in) :: nx, ny, nz !! Global grid dimensions integer :: i, j, k, ix, iy, iz, iy_r real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c diff --git a/src/backend/omp/kernels/thomas.f90 b/src/backend/omp/kernels/thomas.f90 index 88ec8d771..bc0d5c69f 100644 --- a/src/backend/omp/kernels/thomas.f90 +++ b/src/backend/omp/kernels/thomas.f90 @@ -1,4 +1,18 @@ module m_omp_kernels_thom + !! Thomas algorithm kernels for local compact finite differences. + !! + !! Implements tridiagonal solvers for compact schemes when domain is + !! 
not decomposed in derivative direction. Provides both standard + !! (non-periodic) and cyclic (periodic) Thomas algorithm variants. + !! + !! **Thomas algorithm:** Standard forward elimination and backward + !! substitution for tridiagonal systems, O(n) complexity. + !! + !! **Periodic Thomas:** Sherman-Morrison formula to handle cyclic + !! tridiagonal systems arising from periodic boundary conditions. + !! + !! **Vectorisation:** Explicit SIMD directives for SZ-wide vectors, + !! processing multiple pencils simultaneously. use m_common, only: dp use m_omp_common, only: SZ @@ -8,14 +22,32 @@ module m_omp_kernels_thom subroutine der_univ_thom(du, u, n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, & thom_f, thom_s, thom_w, strch) + !! Thomas algorithm for non-periodic compact finite differences. + !! + !! Solves tridiagonal system arising from compact scheme with arbitrary + !! boundary conditions. Uses standard forward elimination followed by + !! backward substitution. + !! + !! **Algorithm:** + !! 1. Forward pass: Eliminate lower diagonal, form modified RHS + !! 2. Backward pass: Back-substitution with grid stretching correction + !! + !! **Boundary treatment:** Special stencils at start (j=1..4) and + !! end (j=n-3..n) to handle non-periodic boundaries. + !! + !! **Stretching:** Applied during backward pass via `strch` array. implicit none - real(dp), dimension(:, :), intent(out) :: du - real(dp), dimension(:, :), intent(in) :: u - integer, intent(in) :: n_tds, n_rhs - real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e ! start/end - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in), dimension(:) :: thom_f, thom_s, thom_w, strch + real(dp), dimension(:, :), intent(out) :: du !! Solution (derivative) + real(dp), dimension(:, :), intent(in) :: u !! Input field + integer, intent(in) :: n_tds !! Number of unknowns (tridiagonal size) + integer, intent(in) :: n_rhs !! 
Number of RHS points (stencil size) + real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Start/end stencil coefficients + real(dp), intent(in), dimension(:) :: coeffs !! Interior stencil coefficients (9-point) + real(dp), intent(in), dimension(:) :: thom_f !! Forward elimination factors + real(dp), intent(in), dimension(:) :: thom_s !! Subdiagonal elimination factors + real(dp), intent(in), dimension(:) :: thom_w !! Diagonal weights for back-substitution + real(dp), intent(in), dimension(:) :: strch !! Grid stretching correction factors integer :: i, j real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 @@ -132,14 +164,34 @@ end subroutine der_univ_thom subroutine der_univ_thom_per( & du, u, n, coeffs, alpha, thom_f, thom_s, thom_w, thom_p, strch & ) + !! Periodic Thomas algorithm for cyclic tridiagonal systems. + !! + !! Solves compact scheme with periodic boundary conditions using + !! Sherman-Morrison formula. Handles wraparound coupling between + !! first and last grid points. + !! + !! **Algorithm:** + !! 1. Forward pass: Standard elimination with periodic indexing + !! 2. Backward pass: Standard back-substitution + !! 3. Periodic correction: Sherman-Morrison adjustment for cyclic coupling + !! + !! **Periodic indexing:** Uses modulo arithmetic for stencil access + !! to handle wraparound at domain boundaries. + !! + !! **Sherman-Morrison:** Adds rank-1 correction to handle tridiagonal + !! system modified by periodic coupling terms. implicit none - real(dp), dimension(:, :), intent(out) :: du - real(dp), dimension(:, :), intent(in) :: u - integer, intent(in) :: n - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in) :: alpha - real(dp), intent(in), dimension(:) :: thom_f, thom_s, thom_w, thom_p, strch + real(dp), dimension(:, :), intent(out) :: du !! Solution (derivative) + real(dp), dimension(:, :), intent(in) :: u !! Input field + integer, intent(in) :: n !! 
Number of grid points + real(dp), intent(in), dimension(:) :: coeffs !! Stencil coefficients (9-point) + real(dp), intent(in) :: alpha !! Tridiagonal sub/super-diagonal value + real(dp), intent(in), dimension(:) :: thom_f !! Forward elimination factors + real(dp), intent(in), dimension(:) :: thom_s !! Subdiagonal elimination factors + real(dp), intent(in), dimension(:) :: thom_w !! Diagonal weights + real(dp), intent(in), dimension(:) :: thom_p !! Periodic correction vector + real(dp), intent(in), dimension(:) :: strch !! Grid stretching correction factors integer :: i, j integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 diff --git a/src/backend/omp/poisson_fft.f90 b/src/backend/omp/poisson_fft.f90 index 72a94521c..db5241803 100644 --- a/src/backend/omp/poisson_fft.f90 +++ b/src/backend/omp/poisson_fft.f90 @@ -1,4 +1,24 @@ module m_omp_poisson_fft + !! FFT-based Poisson solver for OMP backend. + !! + !! Solves \(\nabla^2 \phi = f\) using spectral methods with 2DECOMP&FFT library. + !! Transforms to Fourier space, solves diagonal system in spectral space, + !! then transforms back to physical space. + !! + !! **Algorithm:** + !! + !! 1. Forward FFT: physical \(\rightarrow\) spectral space + !! 2. Spectral solve: \(\hat{\phi}_k = \hat{f}_k / k^2\) (with modifications for boundary conditions) + !! 3. Backward FFT: spectral \(\rightarrow\) physical space + !! + !! **Boundary conditions:** + !! + !! - (0,0,0): Periodic in all directions + !! - (0,1,0): Non-periodic in Y, periodic in X/Z (uses symmetry transform) + !! + !! **Parallelisation:** MPI via 2DECOMP&FFT pencil decomposition + !! + !! **Limitation:** Does not support Y-direction grid stretching use decomp_2d_constants, only: PHYSICAL_IN_X use decomp_2d_fft, only: decomp_2d_fft_init, decomp_2d_fft_3d, & @@ -16,14 +36,14 @@ module m_omp_poisson_fft type, extends(poisson_fft_t) :: omp_poisson_fft_t !! 
FFT based Poisson solver - complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z + complex(dp), allocatable, dimension(:, :, :) :: c_x !! Spectral space buffer (X-pencil oriented) contains - procedure :: fft_forward => fft_forward_omp - procedure :: fft_backward => fft_backward_omp - procedure :: fft_postprocess_000 => fft_postprocess_000_omp - procedure :: fft_postprocess_010 => fft_postprocess_010_omp - procedure :: enforce_periodicity_y => enforce_periodicity_y_omp - procedure :: undo_periodicity_y => undo_periodicity_y_omp + procedure :: fft_forward => fft_forward_omp !! Transform to spectral space + procedure :: fft_backward => fft_backward_omp !! Transform to physical space + procedure :: fft_postprocess_000 => fft_postprocess_000_omp !! Spectral solve for (0,0,0) BCs + procedure :: fft_postprocess_010 => fft_postprocess_010_omp !! Spectral solve for (0,1,0) BCs + procedure :: enforce_periodicity_y => enforce_periodicity_y_omp !! Symmetry transform for Y non-periodic + procedure :: undo_periodicity_y => undo_periodicity_y_omp !! Inverse symmetry transform end type omp_poisson_fft_t interface omp_poisson_fft_t @@ -35,15 +55,22 @@ module m_omp_poisson_fft contains function init(mesh, xdirps, ydirps, zdirps, lowmem) result(poisson_fft) + !! Initialise FFT-based Poisson solver. + !! + !! Sets up 2DECOMP&FFT library and allocates spectral space buffers. + !! Computes wavenumbers and coefficients for spectral solve. + !! + !! **Error checking:** Fails if Y-direction grid stretching requested + !! (not supported by FFT method). implicit none - type(mesh_t), intent(in) :: mesh - class(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem - integer, dimension(3) :: istart, iend, isize - integer :: dims(3) + type(mesh_t), intent(in) :: mesh !! Mesh with grid spacing + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Spectral operators + logical, optional, intent(in) :: lowmem !! 
Low-memory flag (ignored for OMP) + integer, dimension(3) :: istart, iend, isize !! Local spectral dimensions + integer :: dims(3) !! Global grid dimensions - type(omp_poisson_fft_t) :: poisson_fft + type(omp_poisson_fft_t) :: poisson_fft !! Initialised solver if (mesh%par%is_root()) then print *, "Initialising 2decomp&fft" @@ -75,29 +102,43 @@ function init(mesh, xdirps, ydirps, zdirps, lowmem) result(poisson_fft) end function init subroutine fft_forward_omp(self, f_in) + !! Forward FFT: physical space to spectral space. + !! + !! Transforms input field from physical (real) to spectral (complex) + !! representation using 2DECOMP&FFT. Result stored in `self%c_x`. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(in) :: f_in !! Physical space field (RHS) call decomp_2d_fft_3d(f_in%data, self%c_x) end subroutine fft_forward_omp subroutine fft_backward_omp(self, f_out) + !! Backward FFT: spectral space to physical space. + !! + !! Transforms spectral solution back to physical (real) space using + !! inverse FFT. Reads from `self%c_x`, writes to output field. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Physical space solution call decomp_2d_fft_3d(self%c_x, f_out%data) end subroutine fft_backward_omp subroutine fft_postprocess_000_omp(self) + !! Spectral solve for (0,0,0) boundary conditions. + !! + !! Solves Poisson equation in spectral space for fully periodic domain. + !! Divides each Fourier mode by its corresponding $k^2$ eigenvalue. + !! + !! **Formula:** $\hat{\phi}_k = \hat{f}_k / (k_x^2 + k_y^2 + k_z^2)$ implicit none - class(omp_poisson_fft_t) :: self + class(omp_poisson_fft_t) :: self !! 
Solver instance call process_spectral_000( & self%c_x, self%waves, self%nx_spec, self%ny_spec, self%nz_spec, & @@ -109,9 +150,16 @@ subroutine fft_postprocess_000_omp(self) end subroutine fft_postprocess_000_omp subroutine fft_postprocess_010_omp(self) + !! Spectral solve for (0,1,0) boundary conditions. + !! + !! Solves Poisson equation with non-periodic BCs in Y-direction, + !! periodic in X and Z. Uses modified wavenumbers accounting for + !! symmetry transformation (sine series in Y). + !! + !! **Formula:** Modified $k_y$ for sine series representation implicit none - class(omp_poisson_fft_t) :: self + class(omp_poisson_fft_t) :: self !! Solver instance call process_spectral_010( & self%c_x, self%waves, self%nx_spec, self%ny_spec, self%nz_spec, & @@ -123,11 +171,18 @@ subroutine fft_postprocess_010_omp(self) end subroutine fft_postprocess_010_omp subroutine enforce_periodicity_y_omp(self, f_out, f_in) + !! Apply symmetry transform for Y non-periodic boundary conditions. + !! + !! Converts physical field to symmetric/antisymmetric representation + !! suitable for sine series FFT. Used before forward FFT when Y-direction + !! has non-periodic BCs. + !! + !! **Transformation:** Maps domain to symmetric extension for sine basis. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Transformed field + class(field_t), intent(in) :: f_in !! Original field integer :: i, j, k @@ -149,11 +204,17 @@ subroutine enforce_periodicity_y_omp(self, f_out, f_in) end subroutine enforce_periodicity_y_omp subroutine undo_periodicity_y_omp(self, f_out, f_in) + !! Inverse symmetry transform for Y non-periodic boundary conditions. + !! + !! Converts symmetric/antisymmetric representation back to physical + !! field. Used after backward FFT when Y-direction has non-periodic BCs. + !! + !! 
**Transformation:** Extracts physical domain from symmetric extension. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Physical field + class(field_t), intent(in) :: f_in !! Transformed field integer :: i, j, k diff --git a/src/backend/omp/sendrecv.f90 b/src/backend/omp/sendrecv.f90 index 3aba6e82c..39d8c6caa 100644 --- a/src/backend/omp/sendrecv.f90 +++ b/src/backend/omp/sendrecv.f90 @@ -1,4 +1,14 @@ module m_omp_sendrecv + !! MPI halo exchange utilities for OMP backend. + !! + !! Provides non-blocking point-to-point communication for exchanging + !! boundary halos between neighbouring MPI processes. Used in distributed + !! compact finite difference schemes that require off-process data. + !! + !! **Communication pattern:** Bidirectional simultaneous send/recv with + !! neighbours in one decomposition direction. + !! + !! **Single-process optimisation:** Direct copy when no MPI communication needed. use mpi use m_common, only: dp, MPI_X3D2_DP @@ -9,11 +19,31 @@ module m_omp_sendrecv subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & n_data, nproc, prev, next) + !! Exchange boundary halos with neighbouring MPI processes. + !! + !! Performs bidirectional halo exchange using non-blocking MPI + !! communication (MPI_Isend/MPI_Irecv). Sends data to both neighbours + !! simultaneously and receives from both, then waits for all operations + !! to complete. + !! + !! **Special case:** Single-process (nproc=1) uses direct memory copy + !! for periodic boundaries without MPI overhead. + !! + !! **Communication pattern:** + !! - Send start halo to previous process + !! - Receive end halo from next process + !! - Send end halo to next process + !! - Receive start halo from previous process + !! + !! **Non-blocking:** All 4 operations initiated before waiting for completion. 
implicit none - real(dp), dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e - real(dp), dimension(:, :, :), intent(in) :: f_send_s, f_send_e - integer, intent(in) :: n_data, nproc, prev, next + real(dp), dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e !! Receive buffers (start/end halos) + real(dp), dimension(:, :, :), intent(in) :: f_send_s, f_send_e !! Send buffers (start/end halos) + integer, intent(in) :: n_data !! Number of data elements to transfer + integer, intent(in) :: nproc !! Number of processes in this direction + integer, intent(in) :: prev !! Rank of previous neighbour + integer, intent(in) :: next !! Rank of next neighbour integer :: req(4), err(4), ierr, tag = 1234 diff --git a/src/case/base_case.f90 b/src/case/base_case.f90 index e7a673d81..8efadcaf8 100644 --- a/src/case/base_case.f90 +++ b/src/case/base_case.f90 @@ -1,8 +1,39 @@ module m_base_case - !! Provides the base case for running a simulation. New cases are - !! implemented by extending this to specify the initial and boundary - !! conditions, forcing terms and case-specific postprocessing and analysis. - + !! Base class for flow simulation cases. + !! + !! This abstract base class provides the framework for implementing specific + !! flow cases (channel, TGV, generic, etc.). New cases extend this class and + !! override deferred procedures to specify: + !! + !! - **Initial conditions**: Set velocity and other field initial states + !! - **Boundary conditions**: Apply physical boundary conditions each timestep + !! - **Forcing terms**: Add body forces or model-specific source terms + !! - **Pre-correction**: Modify velocity before pressure correction (e.g., IBM) + !! - **Postprocessing**: Compute statistics, output diagnostics, etc. + !! + !! **Simulation Workflow:** + !! + !! The `run()` method orchestrates the time integration loop: + !! + !! 1. Apply boundary conditions + !! 2. Advance solution one timestep via solver%step() + !! 3. 
Write checkpoints/snapshots (via checkpoint_mgr) + !! 4. Perform case-specific postprocessing + !! 5. Repeat until final time reached + !! + !! **Time Integration:** + !! + !! Each timestep involves multiple stages (for RK) or steps (for AB): + !! + !! - Transport equation (transeq) computes velocity derivatives + !! - Forcing terms applied after transeq + !! - Pre-correction modifies velocity (e.g., for immersed boundaries) + !! - Pressure correction enforces incompressibility + !! + !! **Restart Capability:** + !! + !! The checkpoint manager handles restart from saved states automatically + !! if a restart file is detected. use m_allocator, only: allocator_t use m_base_backend, only: base_backend_t use m_common, only: dp, DIR_X, DIR_Z, DIR_C, VERT @@ -15,82 +46,109 @@ module m_base_case implicit none type, abstract :: base_case_t - class(solver_t), allocatable :: solver - type(io_manager_t) :: checkpoint_mgr + !! Abstract base type for flow cases. + !! + !! Derived types must implement all deferred procedures to define + !! case-specific behaviour. + class(solver_t), allocatable :: solver !! Incompressible Navier-Stokes solver + type(io_manager_t) :: checkpoint_mgr !! Checkpoint and snapshot manager contains - procedure(boundary_conditions), deferred :: boundary_conditions - procedure(initial_conditions), deferred :: initial_conditions - procedure(forcings), deferred :: forcings - procedure(pre_correction), deferred :: pre_correction - procedure(postprocess), deferred :: postprocess - procedure :: case_init - procedure :: case_finalise - procedure :: set_init - procedure :: run - procedure :: print_enstrophy - procedure :: print_div_max_mean + procedure(boundary_conditions), deferred :: boundary_conditions !! Apply BCs (deferred) + procedure(initial_conditions), deferred :: initial_conditions !! Set ICs (deferred) + procedure(forcings), deferred :: forcings !! Add forcing terms (deferred) + procedure(pre_correction), deferred :: pre_correction !! 
Pre-pressure correction (deferred) + procedure(postprocess), deferred :: postprocess !! Case-specific analysis (deferred) + procedure :: case_init !! Initialise case and solver + procedure :: case_finalise !! Clean up and finalise + procedure :: set_init !! Set initial conditions and prepare for run + procedure :: run !! Main time integration loop + procedure :: print_enstrophy !! Print enstrophy diagnostic + procedure :: print_div_max_mean !! Print divergence diagnostics end type base_case_t abstract interface subroutine boundary_conditions(self) - !! Applies case-specific boundary coinditions + !! Abstract interface for applying boundary conditions. + !! + !! Called each timestep before computing derivatives. Implementations + !! should set velocity and scalar values at domain boundaries according + !! to the physical boundary conditions (Dirichlet, Neumann, periodic, etc.). import :: base_case_t implicit none - class(base_case_t) :: self + class(base_case_t) :: self !! Case instance end subroutine boundary_conditions subroutine initial_conditions(self) - !! Sets case-specific initial conditions + !! Abstract interface for setting initial conditions. + !! + !! Called once during initialisation to set the initial state of velocity + !! and scalar fields. Implementations should populate \(u, v, w\) (and species + !! if present) with case-appropriate initial values. import :: base_case_t implicit none - class(base_case_t) :: self + class(base_case_t) :: self !! Case instance end subroutine initial_conditions subroutine forcings(self, du, dv, dw, iter) - !! Applies case-specific or model realated forcings after transeq + !! Abstract interface for applying forcing terms. + !! + !! Called after transport equation (transeq) but before pressure correction. + !! Add body forces, source terms, or model-specific forcings (e.g., mean + !! pressure gradient for channel flow, immersed boundary forces, etc.). 
import :: base_case_t import :: field_t implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: du, dv, dw - integer, intent(in) :: iter + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: du, dv, dw !! Velocity derivatives to modify + integer, intent(in) :: iter !! Current iteration number end subroutine forcings subroutine pre_correction(self, u, v, w) - !! Applies case-specific pre-correction to the velocity fields before - !! pressure correction + !! Abstract interface for pre-pressure correction modifications. + !! + !! Called after forcings but before pressure correction. Used for operations + !! that need to modify the velocity field before enforcing incompressibility, + !! such as immersed boundary method (IBM) velocity corrections. import :: base_case_t import :: field_t implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: u, v, w + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: u, v, w !! Velocity fields to modify end subroutine pre_correction subroutine postprocess(self, iter, t) - !! Triggers case-specific postprocessings at user specified intervals + !! Abstract interface for case-specific postprocessing. + !! + !! Called at user-specified intervals during time integration. Implement + !! this to compute statistics, output diagnostics, write custom data files, + !! or perform any case-specific analysis. import :: base_case_t import :: dp implicit none - class(base_case_t) :: self - integer, intent(in) :: iter - real(dp), intent(in) :: t + class(base_case_t) :: self !! Case instance + integer, intent(in) :: iter !! Current iteration number + real(dp), intent(in) :: t !! Current simulation time end subroutine postprocess end interface contains subroutine case_init(self, backend, mesh, host_allocator) + !! Initialise case with solver and checkpoint manager. + !! + !! Creates the solver instance and initialises the checkpoint/snapshot + !! manager. 
If a restart file is detected, loads the saved state. implicit none - class(base_case_t) :: self - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator + class(base_case_t) :: self !! Case instance + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator self%solver = init(backend, mesh, host_allocator) @@ -104,7 +162,14 @@ subroutine case_init(self, backend, mesh, host_allocator) end subroutine case_init subroutine case_finalise(self) - class(base_case_t) :: self + !! Finalise the case and clean up resources. + !! + !! Performs cleanup operations at the end of a simulation run: + !! - Finalises the checkpoint manager (closes files, flushes buffers) + !! - Prints completion message on root process + !! + !! This should be called after the main time integration loop completes. + class(base_case_t) :: self !! Case instance to finalise if (self%solver%mesh%par%is_root()) print *, 'run end' @@ -112,18 +177,35 @@ subroutine case_finalise(self) end subroutine case_finalise subroutine set_init(self, field, field_func) + !! Initialise a field using an analytical function. + !! + !! This utility subroutine sets a field's values by evaluating a + !! user-provided pure function at each grid point. The function + !! is evaluated on the host, then transferred to the backend device + !! (if using GPU backend). + !! + !! **Usage Example:** + !! ```fortran + !! call self%set_init(self%solver%u, u_initial) + !! ``` + !! where `u_initial` is a pure function taking coordinates [x,y,z] + !! and returning the initial velocity value. + !! + !! This is commonly used in `initial_conditions()` implementations + !! to set velocity or scalar fields from analytical expressions. 
implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: field + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: field !! Field to initialise interface pure function field_func(coords) result(r) + !! Pure function defining field values at each point. import dp implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Spatial coordinates [x, y, z] + real(dp) :: r !! Field value at this location end function field_func end interface @@ -151,11 +233,24 @@ end function field_func end subroutine set_init subroutine print_enstrophy(self, u, v, w) - !! Reports the enstrophy + !! Compute and print the volume-averaged enstrophy. + !! + !! Enstrophy is a measure of the rotational kinetic energy density: + !! \[ E = \frac{1}{2V} \int_V |\omega|^2 \, dV = \frac{1}{2V} \int_V |\nabla \times \mathbf{u}|^2 \, dV \] + !! + !! where \( \omega = \nabla \times \mathbf{u} \) is the vorticity. + !! + !! This diagnostic is useful for monitoring: + !! - Flow transition to turbulence (enstrophy increases) + !! - Energy cascade to small scales + !! - Numerical stability (sudden spikes indicate problems) + !! - Comparison with theoretical predictions (e.g., TGV decay) + !! + !! Only the root MPI rank prints the result. implicit none - class(base_case_t), intent(in) :: self - class(field_t), intent(in) :: u, v, w + class(base_case_t), intent(in) :: self !! Case instance + class(field_t), intent(in) :: u, v, w !! Velocity components class(field_t), pointer :: du, dv, dw real(dp) :: enstrophy @@ -180,11 +275,31 @@ subroutine print_enstrophy(self, u, v, w) end subroutine print_enstrophy subroutine print_div_max_mean(self, u, v, w) - !! Reports the div(u) at cell centres + !! Compute and print maximum and mean divergence. + !! + !! For incompressible flow, the velocity divergence should be zero: + !! \[ \nabla \cdot \mathbf{u} = 0 \] + !! + !! This diagnostic reports: + !! + !! 
- **Maximum divergence**: Largest local violation of incompressibility + !! - **Mean divergence**: Volume-averaged divergence (should be near machine zero) + !! + !! **Purpose:** + !! + !! - Monitor quality of pressure correction (divergence should be ~ 1e-10 or smaller) + !! - Detect numerical issues (large divergence indicates solver problems) + !! - Verify proper boundary condition implementation + !! - Check convergence of iterative Poisson solvers + !! + !! Divergence is computed at cell centres from vertex velocities using + !! staggered derivatives and interpolation. + !! + !! Only the root MPI rank prints the result. implicit none - class(base_case_t), intent(in) :: self - class(field_t), intent(in) :: u, v, w + class(base_case_t), intent(in) :: self !! Case instance + class(field_t), intent(in) :: u, v, w !! Velocity components class(field_t), pointer :: div_u real(dp) :: div_u_max, div_u_mean @@ -202,11 +317,33 @@ subroutine print_div_max_mean(self, u, v, w) end subroutine print_div_max_mean subroutine run(self) - !! Runs the solver forwards in time from t=t_0 to t=T, performing - !! postprocessing/IO and reporting diagnostics. + !! Main time integration loop for the simulation. + !! + !! Advances the solution from initial time t=t_0 to final time t=T, + !! orchestrating all aspects of the simulation: + !! + !! **Each Timestep:** + !! + !! 1. Apply boundary conditions + !! 2. Compute derivatives and advance via time_integrator%step() + !! 3. Handle checkpointing and snapshot output (via checkpoint_mgr) + !! 4. Perform case-specific postprocessing + !! 5. Print diagnostics (divergence, enstrophy) + !! + !! **Time Integration Stages:** + !! + !! For multi-stage methods (RK), each timestep involves multiple stages. + !! The solver%step() method handles the stage-by-stage advancement, + !! calling transeq, forcings, pre_correction, and pressure_correction + !! at appropriate points. + !! + !! **Restart Support:** + !! + !! 
If a restart file is detected, continues from the saved iteration + !! and time rather than starting from t=0. implicit none - class(base_case_t), intent(inout) :: self + class(base_case_t), intent(inout) :: self !! Case instance type(flist_t), allocatable :: curr(:) type(flist_t), allocatable :: deriv(:) diff --git a/src/case/channel.f90 b/src/case/channel.f90 index 846dcf93e..b20222234 100644 --- a/src/case/channel.f90 +++ b/src/case/channel.f90 @@ -1,4 +1,32 @@ module m_case_channel + !! Turbulent channel flow case with optional rotation. + !! + !! This module implements a turbulent channel flow simulation between + !! two parallel walls. The flow is driven by a mean pressure gradient + !! to maintain a target bulk velocity. + !! + !! **Flow Configuration:** + !! + !! - Domain: Periodic in \(X\) and \(Z\), wall-bounded in \(Y\) + !! - Walls at \(y = 0\) and \(y = L_y\) with no-slip boundary conditions + !! - Mean pressure gradient maintains constant bulk velocity + !! - Optional rotation forcing (Coriolis-like terms) for rotating channel + !! + !! **Initial Conditions:** + !! + !! - Parabolic base profile: \( u = 1 - y^2 \) + !! - Random perturbations with configurable amplitude (noise parameter) + !! - Perturbations concentrated near centreline for faster transition + !! + !! **Boundary Conditions:** + !! + !! - No-slip walls: \( u = v = w = 0 \) at \( y = 0 \) and \( y = L_y \) + !! - Enforces mean bulk velocity via volume shift (simulates pressure gradient) + !! + !! **Forcing:** + !! + !! - Mean pressure gradient (constant in time, via bulk velocity constraint) + !! - Optional Coriolis forcing for rotating channel flows use iso_fortran_env, only: stderr => error_unit use mpi @@ -13,13 +41,14 @@ module m_case_channel implicit none type, extends(base_case_t) :: case_channel_t - type(channel_config_t) :: channel_cfg + !! Channel flow case with optional rotation forcing. + type(channel_config_t) :: channel_cfg !! 
Channel-specific configuration contains - procedure :: boundary_conditions => boundary_conditions_channel - procedure :: initial_conditions => initial_conditions_channel - procedure :: forcings => forcings_channel - procedure :: pre_correction => pre_correction_channel - procedure :: postprocess => postprocess_channel + procedure :: boundary_conditions => boundary_conditions_channel !! Apply bulk velocity constraint + procedure :: initial_conditions => initial_conditions_channel !! Set perturbed parabolic profile + procedure :: forcings => forcings_channel !! Apply rotation forcing (if enabled) + procedure :: pre_correction => pre_correction_channel !! Enforce wall boundary conditions + procedure :: postprocess => postprocess_channel !! Compute statistics end type case_channel_t interface case_channel_t @@ -29,12 +58,15 @@ module m_case_channel contains function case_channel_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise channel flow case. + !! + !! Reads channel-specific configuration and initialises the base case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_channel_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_channel_t) :: flow_case !! Initialised channel case call flow_case%channel_cfg%read(nml_file=get_argument(1)) @@ -43,9 +75,14 @@ function case_channel_init(backend, mesh, host_allocator) result(flow_case) end function case_channel_init subroutine boundary_conditions_channel(self) + !! Apply boundary conditions to enforce target bulk velocity. + !! + !! Computes the current bulk (volume-averaged) velocity and applies + !! 
a uniform shift to maintain the target value of 2/3. This simulates + !! the effect of a mean pressure gradient driving the flow. implicit none - class(case_channel_t) :: self + class(case_channel_t) :: self !! Channel case instance real(dp) :: can, ub integer :: ierr @@ -63,9 +100,18 @@ subroutine boundary_conditions_channel(self) end subroutine boundary_conditions_channel subroutine initial_conditions_channel(self) + !! Set initial velocity field with perturbed parabolic profile. + !! + !! Creates a laminar parabolic profile \( u = 1 - y^2 \) and adds random + !! perturbations scaled by the noise parameter. Perturbations are + !! amplitude-modulated with a Gaussian centred at the channel centreline + !! to concentrate disturbances where they are most effective for + !! triggering turbulent transition. + !! + !! No-slip conditions (u = v = w = 0) are enforced at walls (y=0, y=L_y). implicit none - class(case_channel_t) :: self + class(case_channel_t) :: self !! Channel case instance class(field_t), pointer :: u_init, v_init, w_init @@ -119,13 +165,32 @@ subroutine initial_conditions_channel(self) end subroutine initial_conditions_channel subroutine forcings_channel(self, du, dv, dw, iter) + !! Apply rotation forcing (Coriolis-like terms) if enabled. + !! + !! For rotating channel flows, adds Coriolis-like forcing terms that + !! couple the streamwise (u) and wall-normal (v) velocities: + !! + !! \[ \frac{du}{dt} = \ldots - \Omega v \] + !! \[ \frac{dv}{dt} = \ldots + \Omega u \] + !! + !! where \( \Omega \) is the rotation rate (omega_rot). + !! + !! **Configuration:** + !! - Activated via `channel_cfg%rotation = .true.` + !! - Rotation rate set by `channel_cfg%omega_rot` + !! - Applied only for first `n_rotate` iterations to allow spin-up + !! + !! **Physical Interpretation:** + !! Mimics effects of system rotation (e.g., rotating reference frame) + !! without explicitly implementing Coriolis force. Useful for studying + !! 
rotation effects on turbulent channel flows. implicit none - class(case_channel_t) :: self - class(field_t), intent(inout) :: du, dv, dw - integer, intent(in) :: iter + class(case_channel_t) :: self !! Channel case instance + class(field_t), intent(inout) :: du, dv, dw !! Velocity derivatives to modify + integer, intent(in) :: iter !! Current iteration number - real(dp) :: rot + real(dp) :: rot !! Rotation rate for current forcing application if (self%channel_cfg%rotation .and. iter < self%channel_cfg%n_rotate) then rot = self%channel_cfg%omega_rot @@ -136,10 +201,30 @@ subroutine forcings_channel(self, du, dv, dw, iter) end subroutine forcings_channel subroutine pre_correction_channel(self, u, v, w) + !! Enforce no-slip boundary conditions at channel walls. + !! + !! Sets all velocity components to zero at the wall boundaries (Y-faces): + !! - Lower wall: y = 0 + !! - Upper wall: y = L_y + !! + !! This implements the no-slip condition: + !! \[ u = v = w = 0 \quad \text{at walls} \] + !! + !! **Implementation:** + !! Uses `field_set_face` to directly set values on Y-direction faces + !! (boundaries perpendicular to Y-axis). This is applied after the + !! time integration step but before pressure correction, ensuring that + !! the corrected velocity field satisfies both incompressibility and + !! no-slip boundary conditions. + !! + !! **Note:** + !! This is the standard approach for wall-bounded flows. For periodic + !! or other boundary conditions, this subroutine would be modified or + !! left empty. implicit none - class(case_channel_t) :: self - class(field_t), intent(inout) :: u, v, w + class(case_channel_t) :: self !! Channel case instance + class(field_t), intent(inout) :: u, v, w !! 
Velocity components to correct call self%solver%backend%field_set_face(u, 0._dp, 0._dp, Y_FACE) call self%solver%backend%field_set_face(v, 0._dp, 0._dp, Y_FACE) diff --git a/src/case/generic.f90 b/src/case/generic.f90 index 193767999..a1659361a 100644 --- a/src/case/generic.f90 +++ b/src/case/generic.f90 @@ -1,6 +1,26 @@ module m_case_generic - !! An example case set up to run and sustain a freestream flow. - !! This is a good place to start for adding a new flow case. + !! Generic freestream flow case for general-purpose simulations. + !! + !! This module provides a minimal template for setting up custom flow + !! cases. It implements a simple uniform freestream flow (\(u=1, v=0, w=0\)) + !! with no forcing or boundary corrections. + !! + !! **Use Cases:** + !! - Starting point for implementing new flow cases + !! - Testing solver functionality with simple initial conditions + !! - Freestream simulations with immersed boundaries (add IBM via forcings) + !! - Custom flow setups requiring minimal default behaviour + !! + !! **Default Configuration:** + !! - Initial condition: Uniform flow \(u=1, v=0, w=0\) + !! - No boundary condition corrections + !! - No forcing terms + !! - No pre-correction + !! - Minimal postprocessing + !! + !! **Customisation:** + !! Users can extend this case or modify the procedures directly to implement + !! specific flow physics, boundary conditions, or forcing terms. use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -14,12 +34,13 @@ module m_case_generic implicit none type, extends(base_case_t) :: case_generic_t + !! Generic case with minimal default behaviour. 
contains - procedure :: boundary_conditions => boundary_conditions_generic - procedure :: initial_conditions => initial_conditions_generic - procedure :: forcings => forcings_generic - procedure :: pre_correction => pre_correction_generic - procedure :: postprocess => postprocess_generic + procedure :: boundary_conditions => boundary_conditions_generic !! No action (use domain BCs) + procedure :: initial_conditions => initial_conditions_generic !! Uniform freestream + procedure :: forcings => forcings_generic !! No forcing + procedure :: pre_correction => pre_correction_generic !! No correction + procedure :: postprocess => postprocess_generic !! Minimal diagnostics end type case_generic_t interface case_generic_t @@ -29,12 +50,13 @@ module m_case_generic contains function case_generic_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise generic flow case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_generic_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_generic_t) :: flow_case !! Initialised generic case call flow_case%case_init(backend, mesh, host_allocator) @@ -48,9 +70,19 @@ subroutine boundary_conditions_generic(self) end subroutine boundary_conditions_generic subroutine initial_conditions_generic(self) + !! Set initial velocity field for generic freestream case. + !! + !! Initialises a uniform flow field with: + !! - \( u = 1 \) (streamwise velocity) + !! - \( v = 0 \) (cross-stream velocity) + !! - \( w = 0 \) (spanwise velocity) + !! + !! All velocity components are located at vertices (VERT). + !! This simple uniform flow serves as a starting point that users + !! 
can modify for their specific applications. implicit none - class(case_generic_t) :: self + class(case_generic_t) :: self !! Generic case instance call self%solver%u%fill(1._dp) call self%solver%v%fill(0._dp) diff --git a/src/case/tgv.f90 b/src/case/tgv.f90 index 971ee29fb..bd094717a 100644 --- a/src/case/tgv.f90 +++ b/src/case/tgv.f90 @@ -1,4 +1,40 @@ module m_case_tgv + !! Taylor-Green vortex (TGV) case for validation and benchmarking. + !! + !! The Taylor-Green vortex is a canonical test case for incompressible + !! Navier-Stokes solvers. It features an analytically-defined initial + !! condition that transitions from laminar to turbulent flow, providing + !! a rigorous test of: + !! + !! - Spatial discretisation accuracy + !! - Time integration stability + !! - Energy conservation properties + !! - Transition to turbulence physics + !! + !! **Initial Conditions:** + !! + !! \[ u = \sin(x) \cos(y) \cos(z) \] + !! \[ v = -\cos(x) \sin(y) \cos(z) \] + !! \[ w = 0 \] + !! + !! This satisfies incompressibility (\( \nabla \cdot \mathbf{u} = 0 \)) exactly and is periodic + !! in all three directions. + !! + !! **Domain:** + !! + !! Typically \( [0, 2\pi]^3 \) with periodic boundary conditions in all directions. + !! + !! **Validation Metrics:** + !! + !! - Kinetic energy decay rate + !! - Enstrophy evolution + !! - Dissipation rate + !! - Vorticity dynamics + !! + !! **Reference:** + !! + !! Taylor, G. I., & Green, A. E. (1937). Mechanism of the production of + !! small eddies from large ones. Proc. R. Soc. Lond. A, 158(895), 499-521. use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -12,12 +48,13 @@ module m_case_tgv implicit none type, extends(base_case_t) :: case_tgv_t + !! Taylor-Green vortex case (no additional state needed beyond base). 
contains - procedure :: boundary_conditions => boundary_conditions_tgv - procedure :: initial_conditions => initial_conditions_tgv - procedure :: forcings => forcings_tgv - procedure :: pre_correction => pre_correction_tgv - procedure :: postprocess => postprocess_tgv + procedure :: boundary_conditions => boundary_conditions_tgv !! No action (periodic BCs) + procedure :: initial_conditions => initial_conditions_tgv !! Set TGV velocity field + procedure :: forcings => forcings_tgv !! No forcing + procedure :: pre_correction => pre_correction_tgv !! No correction + procedure :: postprocess => postprocess_tgv !! Compute diagnostics end type case_tgv_t interface case_tgv_t @@ -27,21 +64,27 @@ module m_case_tgv contains function case_tgv_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise Taylor-Green vortex case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_tgv_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_tgv_t) :: flow_case !! Initialised TGV case call flow_case%case_init(backend, mesh, host_allocator) end function case_tgv_init subroutine initial_conditions_tgv(self) + !! Set Taylor-Green vortex initial velocity field. + !! + !! Initialises the three velocity components according to the TGV + !! analytical solution. The field is exactly divergence-free and + !! periodic, making it ideal for testing solver accuracy. implicit none - class(case_tgv_t) :: self + class(case_tgv_t) :: self !! 
TGV case instance call self%set_init(self%solver%u, u_func) call self%set_init(self%solver%v, v_func) @@ -54,19 +97,25 @@ subroutine initial_conditions_tgv(self) end subroutine initial_conditions_tgv pure function u_func(coords) result(r) + !! Compute x-velocity component of TGV at given coordinates. + !! + !! \[ u = \sin(x) \cos(y) \cos(z) \] implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Position [x, y, z] + real(dp) :: r !! Velocity component u r = sin(coords(1))*cos(coords(2))*cos(coords(3)) end function u_func pure function v_func(coords) result(r) + !! Compute y-velocity component of TGV at given coordinates. + !! + !! \[ v = -\cos(x) \sin(y) \cos(z) \] implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Position [x, y, z] + real(dp) :: r !! Velocity component v r = -cos(coords(1))*sin(coords(2))*cos(coords(3)) end function v_func diff --git a/src/common.f90 b/src/common.f90 index 8eba41c6f..3c5c98ff0 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -1,42 +1,62 @@ module m_common + !! Common module containing global constants, parameters, and utility functions. + !! + !! This module provides: + !! + !! - Precision definitions (single or double precision based on compilation flags) + !! - Mathematical constants (e.g., \(\pi\)) + !! - Direction and reordering constants for domain decomposition + !! - Data location flags (vertex, cell, face, edge centered) + !! - Boundary condition type constants + !! - Utility functions for argument parsing and data manipulation use mpi implicit none #ifdef SINGLE_PREC - integer, parameter :: dp = kind(0.0e0) - integer, parameter :: nbytes = 4 - integer, parameter :: MPI_X3D2_DP = MPI_REAL - logical, parameter :: is_sp = .true. + integer, parameter :: dp = kind(0.0e0) !! Working-precision real kind (single precision in this build) + integer, parameter :: nbytes = 4 !! 
Number of bytes for real numbers + integer, parameter :: MPI_X3D2_DP = MPI_REAL !! MPI datatype for real numbers + logical, parameter :: is_sp = .true. !! Flag indicating single precision #else - integer, parameter :: dp = kind(0.0d0) - integer, parameter :: nbytes = 8 - integer, parameter :: MPI_X3D2_DP = MPI_DOUBLE_PRECISION - logical, parameter :: is_sp = .false. + integer, parameter :: dp = kind(0.0d0) !! Double precision kind parameter (double precision) + integer, parameter :: nbytes = 8 !! Number of bytes for real numbers + integer, parameter :: MPI_X3D2_DP = MPI_DOUBLE_PRECISION !! MPI datatype for real numbers + logical, parameter :: is_sp = .false. !! Flag indicating double precision #endif - integer, parameter :: sp = kind(0.0e0) - integer, parameter :: i8 = selected_int_kind(18) + integer, parameter :: sp = kind(0.0e0) !! Single precision kind parameter + integer, parameter :: i8 = selected_int_kind(18) !! Integer kind for 64-bit integers - real(dp), parameter :: pi = 4*atan(1.0_dp) + real(dp), parameter :: pi = 4*atan(1.0_dp) !! Mathematical constant \(\pi\) + !> Reordering constants for data layout transformations between directions. + !! Format: RDR_2 where directions are X, Y, Z, or C (complete/cell-centered) integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, & RDR_C2X = 41, RDR_C2Y = 42, RDR_C2Z = 43, & RDR_X2C = 14, RDR_Y2C = 24, RDR_Z2C = 34 - integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4 - integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 - integer, parameter :: VERT = 0000, & ! Vertex centered data - CELL = 1110, & ! Cell centered data - X_FACE = 1100, & ! Data on faces normal to X - Y_FACE = 1010, & ! Data on faces normal to Y - Z_FACE = 0110, & ! Data on faces normal to Z - X_EDGE = 0010, & ! Data on edges along X - Y_EDGE = 0100, & ! Data on edges along Y - Z_EDGE = 1000, & ! Data on edges along Z - NULL_LOC = -0001 ! 
The location of data isn't specified - integer, parameter :: BC_PERIODIC = 0, BC_NEUMANN = 1, BC_DIRICHLET = 2, & - BC_HALO = -1 + integer, parameter :: DIR_X = 1 !! X direction index + integer, parameter :: DIR_Y = 2 !! Y direction index + integer, parameter :: DIR_Z = 3 !! Z direction index + integer, parameter :: DIR_C = 4 !! Complete/cell-centered direction index + integer, parameter :: POISSON_SOLVER_FFT = 0 !! FFT-based Poisson solver + integer, parameter :: POISSON_SOLVER_CG = 1 !! Conjugate gradient Poisson solver + integer, parameter :: VERT = 0000, & !! Vertex centered data + CELL = 1110, & !! Cell centered data + X_FACE = 1100, & !! Data on faces normal to X + Y_FACE = 1010, & !! Data on faces normal to Y + Z_FACE = 0110, & !! Data on faces normal to Z + X_EDGE = 0010, & !! Data on edges along X + Y_EDGE = 0100, & !! Data on edges along Y + Z_EDGE = 1000, & !! Data on edges along Z + NULL_LOC = -0001 !! The location of data isn't specified + integer, parameter :: BC_PERIODIC = 0 !! Periodic boundary condition + integer, parameter :: BC_NEUMANN = 1 !! Neumann boundary condition + integer, parameter :: BC_DIRICHLET = 2 !! Dirichlet boundary condition + integer, parameter :: BC_HALO = -1 !! Halo/ghost cell boundary condition + !> Reordering map matrix for direction transformations. + !! Maps from direction (row) to direction (column), yielding the reordering constant. integer, protected :: & rdr_map(4, 4) = reshape([0, RDR_Y2X, RDR_Z2X, RDR_C2X, & RDR_X2Y, 0, RDR_Z2Y, RDR_C2Y, & @@ -46,8 +66,13 @@ module m_common contains pure subroutine get_dirs_from_rdr(dir_from, dir_to, rdr_dir) - integer, intent(out) :: dir_from, dir_to - integer, intent(in) :: rdr_dir + !! Extract source and destination directions from a reordering constant. + !! + !! Given a reordering constant (e.g., RDR_X2Y), this subroutine determines + !! the source direction and destination direction. + integer, intent(out) :: dir_from !! 
Source direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(out) :: dir_to !! Destination direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(in) :: rdr_dir !! Reordering constant (e.g., RDR_X2Y) integer, dimension(2) :: dirs dirs = findloc(rdr_map, rdr_dir) @@ -57,15 +82,23 @@ pure subroutine get_dirs_from_rdr(dir_from, dir_to, rdr_dir) end subroutine pure integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir) - !! Returns RDR_?2? value based on two direction inputs - integer, intent(in) :: dir_from, dir_to + !! Returns reordering constant based on two direction inputs. + !! + !! Given a source and destination direction, this function returns the + !! corresponding reordering constant (e.g., RDR_X2Y for X to Y). + integer, intent(in) :: dir_from !! Source direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(in) :: dir_to !! Destination direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) rdr_dir = rdr_map(dir_from, dir_to) end function get_rdr_from_dirs function get_argument(pos) result(arg) - integer, intent(in) :: pos - character(:), allocatable :: arg + !! Retrieve a command-line argument at the specified position. + !! + !! This function wraps the intrinsic get_command_argument with error checking + !! and automatic string trimming. + integer, intent(in) :: pos !! Position of the command-line argument (1-indexed) + character(:), allocatable :: arg !! The retrieved command-line argument character(len=200) :: temp integer :: stat @@ -82,7 +115,14 @@ function get_argument(pos) result(arg) end function get_argument integer function move_data_loc(in_data_loc, dir, move) result(out_data_loc) - integer, intent(in) :: in_data_loc, dir, move + !! Update data location by shifting along a specified direction. + !! + !! This function modifies a data location flag by moving it along one direction + !! (X, Y, or Z) by a specified amount. The data location encoding uses powers of 10 + !! to represent positions in each direction. 
+ integer, intent(in) :: in_data_loc !! Input data location flag + integer, intent(in) :: dir !! Direction to move (DIR_X, DIR_Y, or DIR_Z) + integer, intent(in) :: move !! Amount to move (typically -1, 0, or 1) out_data_loc = in_data_loc + move*(10**dir) end function move_data_loc diff --git a/src/config.f90 b/src/config.f90 index 076de2232..ce28d6e98 100644 --- a/src/config.f90 +++ b/src/config.f90 @@ -6,54 +6,86 @@ module m_config implicit none - integer, parameter :: n_species_max = 99 + integer, parameter :: n_species_max = 99 !! Maximum number of transported species type, abstract :: base_config_t - !! All config types have a method read to initialise their data + !! Base abstract type for all configuration types. + !! + !! All config types have a deferred read method to initialise their data + !! from either a namelist file or a namelist string. contains procedure(read), deferred :: read end type base_config_t type, extends(base_config_t) :: domain_config_t - character(len=30) :: flow_case_name - real(dp) :: L_global(3) - integer :: dims_global(3), nproc_dir(3) - character(len=20) :: BC_x(2), BC_y(2), BC_z(2) - character(len=20) :: stretching(3) - real(dp) :: beta(3) + !! Domain configuration type containing mesh and decomposition settings. + !! + !! This type stores all parameters related to the computational domain, + !! including global dimensions, boundary conditions, mesh stretching, + !! and MPI decomposition. + character(len=30) :: flow_case_name !! Name of the flow case (e.g., 'channel', 'tgv', 'generic') + real(dp) :: L_global(3) !! Global domain lengths in each direction + integer :: dims_global(3) !! Global number of grid points in each direction + integer :: nproc_dir(3) !! Number of processors in each direction + character(len=20) :: BC_x(2) !! Boundary conditions in x-direction (lower, upper) + character(len=20) :: BC_y(2) !! Boundary conditions in y-direction (lower, upper) + character(len=20) :: BC_z(2) !! 
Boundary conditions in z-direction (lower, upper) + character(len=20) :: stretching(3) !! Mesh stretching type in each direction + real(dp) :: beta(3) !! Stretching parameters in each direction contains procedure :: read => read_domain_nml end type domain_config_t type, extends(base_config_t) :: solver_config_t - real(dp) :: Re, dt - logical :: ibm_on - real(dp), dimension(:), allocatable :: pr_species - integer :: n_iters, n_output, n_species - logical :: lowmem_transeq, lowmem_fft - character(3) :: poisson_solver_type, time_intg - character(30) :: der1st_scheme, der2nd_scheme, & - interpl_scheme, stagder_scheme + !! Solver configuration type containing numerical and physical parameters. + !! + !! This type stores parameters related to the numerical solver including + !! Reynolds number, time step, iteration counts, discretisation schemes, + !! and solver options. + real(dp) :: Re !! Reynolds number + real(dp) :: dt !! Time step size + logical :: ibm_on !! Flag to enable immersed boundary method + real(dp), dimension(:), allocatable :: pr_species !! Prandtl numbers for each species + integer :: n_iters !! Total number of iterations + integer :: n_output !! Output frequency (every n_output iterations) + integer :: n_species !! Number of transported scalar species + logical :: lowmem_transeq !! Use low-memory implementation for transport equation + logical :: lowmem_fft !! Use low-memory implementation for FFT + character(3) :: poisson_solver_type !! Poisson solver type ('FFT' or 'CG') + character(3) :: time_intg !! Time integration scheme (e.g., 'RK3', 'AB2') + character(30) :: der1st_scheme !! First derivative scheme (e.g., 'compact6') + character(30) :: der2nd_scheme !! Second derivative scheme (e.g., 'compact6') + character(30) :: interpl_scheme !! Interpolation scheme (e.g., 'classic') + character(30) :: stagder_scheme !! 
Staggered derivative scheme (e.g., 'compact6') contains procedure :: read => read_solver_nml end type solver_config_t type, extends(base_config_t) :: channel_config_t - real(dp) :: noise, omega_rot - logical :: rotation - integer :: n_rotate + !! Channel flow configuration type. + !! + !! This type contains parameters specific to channel flow simulations, + !! including initial perturbations and rotation effects. + real(dp) :: noise !! Initial noise amplitude for perturbations + real(dp) :: omega_rot !! Rotation rate for rotating channel flow + logical :: rotation !! Flag to enable rotation + integer :: n_rotate !! Number of directions to rotate contains procedure :: read => read_channel_nml end type channel_config_t type, extends(base_config_t) :: checkpoint_config_t + !! Checkpoint and snapshot configuration type. + !! + !! This type manages simulation restart and output settings including + !! checkpoint frequency, snapshot frequency, and file naming conventions. integer :: checkpoint_freq = 0 !! Frequency of checkpointing (0 = off) integer :: snapshot_freq = 0 !! Frequency of snapshots (0 = off) logical :: keep_checkpoint = .true. !! If false, only keep latest checkpoint - character(len=256) :: checkpoint_prefix = "checkpoint" - character(len=256) :: snapshot_prefix = "snapshot" - logical :: restart_from_checkpoint = .false. - character(len=256) :: restart_file = "" + character(len=256) :: checkpoint_prefix = "checkpoint" !! Filename prefix for checkpoint files + character(len=256) :: snapshot_prefix = "snapshot" !! Filename prefix for snapshot files + logical :: restart_from_checkpoint = .false. !! Flag to restart from a checkpoint + character(len=256) :: restart_file = "" !! Path to checkpoint file for restart integer, dimension(3) :: output_stride = [2, 2, 2] !! Spatial stride for snapshot output logical :: snapshot_sp = .false. !! 
if true, snapshot in single precision contains @@ -64,9 +96,9 @@ module m_config subroutine read(self, nml_file, nml_string) !& !! Assigns the member variables either from a file or text source. !! - !! nml_file can be an absolute or relative path - !! nml_string is a character string that contains the namelist. - !! For example, nml_string="&foobar_nml foo=0, bar='this'/" + !! `nml_file` can be an absolute or relative path + !! `nml_string` is a character string that contains the namelist. + !! For example, `nml_string="&foobar_nml foo=0, bar='this'/"` import :: base_config_t class(base_config_t) :: self @@ -78,11 +110,16 @@ end subroutine read contains subroutine read_domain_nml(self, nml_file, nml_string) + !! Read domain configuration from a namelist file or string. + !! + !! This subroutine reads the domain_settings namelist containing mesh + !! and domain decomposition parameters. Exactly one of nml_file or + !! nml_string must be provided. implicit none - class(domain_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(domain_config_t) :: self !! Domain configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit @@ -96,6 +133,8 @@ subroutine read_domain_nml(self, nml_file, nml_string) namelist /domain_settings/ flow_case_name, L_global, dims_global, & nproc_dir, BC_x, BC_y, BC_z, stretching, beta + !! Specifies the computational domain geometry, mesh resolution, boundary conditions, + !! and MPI decomposition for the simulation. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading domain config failed! & @@ -124,11 +163,16 @@ subroutine read_domain_nml(self, nml_file, nml_string) end subroutine read_domain_nml subroutine read_solver_nml(self, nml_file, nml_string) + !! Read solver configuration from a namelist file or string. 
+ !! + !! This subroutine reads the solver_params namelist containing numerical + !! and physical parameters for the solver. Exactly one of nml_file or + !! nml_string must be provided. implicit none - class(solver_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(solver_config_t) :: self !! Solver configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit @@ -147,6 +191,8 @@ subroutine read_solver_nml(self, nml_file, nml_string) n_species, pr_species, lowmem_transeq, lowmem_fft, & time_intg, der1st_scheme, der2nd_scheme, interpl_scheme, & stagder_scheme, ibm_on + !! Specifies numerical solver settings including Reynolds number, time integration, + !! discretisation schemes, and solver options for the Navier-Stokes equations. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading solver config failed! & @@ -181,11 +227,16 @@ subroutine read_solver_nml(self, nml_file, nml_string) end subroutine read_solver_nml subroutine read_channel_nml(self, nml_file, nml_string) + !! Read channel flow configuration from a namelist file or string. + !! + !! This subroutine reads the channel_nml namelist containing parameters + !! specific to channel flow simulations. Exactly one of nml_file or + !! nml_string must be provided. implicit none - class(channel_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(channel_config_t) :: self !! Channel configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! 
Namelist as a string integer :: unit @@ -194,6 +245,8 @@ subroutine read_channel_nml(self, nml_file, nml_string) integer :: n_rotate namelist /channel_nml/ noise, rotation, omega_rot, n_rotate + !! Specifies parameters specific to turbulent channel flow simulations, + !! including initial perturbations and optional rotation effects. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading channel config failed! & @@ -217,11 +270,16 @@ subroutine read_channel_nml(self, nml_file, nml_string) end subroutine read_channel_nml subroutine read_checkpoint_nml(self, nml_file, nml_string) + !! Read checkpoint/snapshot configuration from a namelist file or string. + !! + !! This subroutine reads the checkpoint_params namelist containing settings + !! for checkpointing and snapshot output. Exactly one of nml_file or + !! nml_string must be provided. Uses default values if namelist is missing. implicit none - class(checkpoint_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(checkpoint_config_t) :: self !! Checkpoint configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit, ierr @@ -238,6 +296,8 @@ subroutine read_checkpoint_nml(self, nml_file, nml_string) namelist /checkpoint_params/ checkpoint_freq, snapshot_freq, & keep_checkpoint, checkpoint_prefix, snapshot_prefix, & restart_from_checkpoint, restart_file, output_stride, snapshot_sp + !! Specifies checkpoint and snapshot settings for simulation output and restart, + !! including file naming, frequency, and spatial output stride. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading checkpoint config failed! & &Provide only a file name or source, not both.' 
diff --git a/src/field.f90 b/src/field.f90 index 878b5d247..7d2e69c4a 100644 --- a/src/field.f90 +++ b/src/field.f90 @@ -1,25 +1,32 @@ module m_field + !! Field data structure module for managing computational grid data. + !! + !! This module provides the `field_t` type for storing 3D scalar fields + !! on the computational grid. Fields can be organised in linked lists + !! for memory management and support different data orientations + !! (x-pencil, y-pencil, z-pencil). use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C type :: field_t - !! Memory block type holding both a data field and a pointer - !! to the next block. The `field_t` type also holds a integer - !! `refcount` that counts the number of references to this - !! field. User code is currently responsible for incrementing - !! the reference count. - class(field_t), pointer :: next - real(dp), pointer, private :: p_data(:) - real(dp), pointer, contiguous :: data(:, :, :) - integer :: dir - integer :: data_loc - integer :: refcount = 0 - integer :: id !! An integer identifying the memory block. + !! Memory block type holding a 3D scalar field with metadata. + !! + !! The field_t type stores both a data field and a pointer to the next + !! block, enabling linked list structures for memory management. The type + !! tracks a reference count (currently managed by user code), data + !! orientation (x-, y-, or z-pencil), and data location on the staggered grid. + class(field_t), pointer :: next !! Pointer to next field in linked list + real(dp), pointer, private :: p_data(:) !! 1D array storage for data + real(dp), pointer, contiguous :: data(:, :, :) !! 3D view of data array + integer :: dir !! Data direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer :: data_loc !! Data location flag (VERT, CELL, etc.) + integer :: refcount = 0 !! Reference count for memory management + integer :: id !! 
Unique identifier for this memory block contains - procedure :: fill - procedure :: get_shape - procedure :: set_shape - procedure :: set_data_loc + procedure :: fill !! Fill field with a constant value + procedure :: get_shape !! Get 3D dimensions of data array + procedure :: set_shape !! Set 3D dimensions by reshaping p_data + procedure :: set_data_loc !! Set data location flag end type field_t interface field_t @@ -27,16 +34,25 @@ module m_field end interface field_t type :: flist_t - !! Use for creating a list of field pointers - class(field_t), pointer :: ptr + !! Wrapper type for creating arrays of field pointers. + !! + !! This type is used to create lists or arrays of field pointers, + !! useful for managing multiple fields such as velocity components + !! or transported scalar species. + class(field_t), pointer :: ptr !! Pointer to a field end type flist_t contains function field_init(ngrid, next, id) result(f) - integer, intent(in) :: ngrid, id - type(field_t), pointer, intent(in) :: next - type(field_t) :: f + !! Initialise a new field with allocated memory. + !! + !! Creates a new field_t instance with allocated storage for ngrid points. + !! The field is linked to the next field in the list and assigned a unique ID. + integer, intent(in) :: ngrid !! Total number of grid points to allocate + type(field_t), pointer, intent(in) :: next !! Pointer to next field in linked list + integer, intent(in) :: id !! Unique identifier for this field + type(field_t) :: f !! Initialised field allocate (f%p_data(ngrid)) f%refcount = 0 @@ -45,38 +61,52 @@ function field_init(ngrid, next, id) result(f) end function field_init subroutine fill(self, c) + !! Fill the entire field with a constant value. + !! + !! Sets all grid points in the field to the specified constant value. implicit none - class(field_t) :: self - real(dp), intent(in) :: c + class(field_t) :: self !! Field to fill + real(dp), intent(in) :: c !! 
Constant value to fill with self%p_data(:) = c end subroutine fill subroutine set_data_loc(self, data_loc) - class(field_t) :: self - integer, intent(in) :: data_loc + !! Set the data location flag for this field. + !! + !! The data location specifies where on the staggered grid the data + !! is located (e.g., VERT, CELL, X_FACE, etc.). + class(field_t) :: self !! Field to modify + integer, intent(in) :: data_loc !! Data location flag self%data_loc = data_loc end subroutine function get_shape(self) result(dims) + !! Get the 3D dimensions of the field data. + !! + !! Returns the current shape of the 3D data array. implicit none - class(field_t) :: self - integer :: dims(3) + class(field_t) :: self !! Field to query + integer :: dims(3) !! Array dimensions [nx, ny, nz] dims = shape(self%data) end function get_shape subroutine set_shape(self, dims) + !! Reshape the field data to specified 3D dimensions. + !! + !! Maps the 1D storage array (p_data) to a 3D view with the specified + !! dimensions. The total size must match the allocated storage. implicit none - class(field_t) :: self - integer, intent(in) :: dims(3) + class(field_t) :: self !! Field to reshape + integer, intent(in) :: dims(3) !! Target dimensions [nx, ny, nz] self%data(1:dims(1), 1:dims(2), 1:dims(3)) => self%p_data diff --git a/src/io/adios2/io.f90 b/src/io/adios2/io.f90 index 26976b98d..613db079f 100644 --- a/src/io/adios2/io.f90 +++ b/src/io/adios2/io.f90 @@ -1,24 +1,40 @@ module m_io_backend -!! @brief Provides ADIOS2-specific implementation of the I/O backend interface -!! -!! @details This module contains the concrete backend implementation for ADIOS2 -!! (ADaptive Input Output System v2) library. It acts as a translation layer -!! converting generic I/O calls from the session interface into specific calls -!! to the ADIOS2 API. -!! -!! The `adios2_reader_t` and `adios2_writer_t` types defined here extend the -!! abstract base types from `m_io_base` and implement required procedures -!! -!! 
This backend leverages several key features of the underlying ADIOS2 library -!! - engine abstraction - the same API can be used for different transport -!! methods (e.g. BP4, BP5, HDF5) -!! - Asynchronous I/O - by default ADIOS2 uses a deferred transport mode -!! which can improve performance by overlapping computation and I/O -!! - MPI integration - it is designed for large-scale paralle I/O and -!! integrates with MPI, though serial operation is also supported -!! -!! @note This is an internal backend module and should never be used directly. -!! All user interaction must go through `m_io_session`. + !! ADIOS2-specific implementation of the I/O backend interface. + !! + !! This module provides the concrete backend implementation for ADIOS2 + !! (Adaptable Input Output System v2), a high-performance parallel I/O + !! library. It acts as a translation layer converting generic I/O calls + !! from the session interface into specific ADIOS2 API calls. + !! + !! **Architecture:** + !! + !! - Extends abstract base types from `m_io_base` + !! - Implements all required I/O procedures (init, open, read, write, etc.) + !! - Manages ADIOS2-specific objects (adios, io, engine) + !! - Handles step-based I/O for time-series data + !! + !! **ADIOS2 Features Leveraged:** + !! + !! - **Engine Abstraction**: Same API for different formats (BP4, BP5, HDF5) + !! - **Asynchronous I/O**: Deferred transport mode overlaps computation and I/O + !! - **MPI Integration**: Designed for large-scale parallel I/O + !! - **Variable/Attribute Management**: Efficient metadata handling + !! - **Hyperslab Selection**: Parallel distributed array I/O + !! + !! **Type Hierarchy:** + !! + !! ``` + !! io_base (abstract) + !! |-- io_reader_t (abstract) + !! | |-- io_adios2_reader_t (concrete) + !! |-- io_writer_t (abstract) + !! | |-- io_adios2_writer_t (concrete) + !! |-- io_file_t (abstract) + !! |-- io_adios2_file_t (concrete) + !! ``` + !! + !! 
**Note:** This is an internal backend module and should never be used + !! directly. All user interaction must go through `m_io_session`. use adios2, only: adios2_adios, adios2_io, adios2_engine, & adios2_variable, adios2_attribute, & adios2_mode_sync, adios2_mode_write, & @@ -45,56 +61,71 @@ module m_io_backend public :: allocate_io_reader, allocate_io_writer public :: get_default_backend, IO_BACKEND_DUMMY, IO_BACKEND_ADIOS2 - integer, parameter :: IO_BACKEND_DUMMY = 0 - integer, parameter :: IO_BACKEND_ADIOS2 = 1 + integer, parameter :: IO_BACKEND_DUMMY = 0 !! Dummy backend identifier + integer, parameter :: IO_BACKEND_ADIOS2 = 1 !! ADIOS2 backend identifier type, extends(io_reader_t) :: io_adios2_reader_t + !! ADIOS2 reader implementation for reading data from files. + !! + !! Manages ADIOS2 objects required for reading operations including + !! the global ADIOS handler, I/O object, and tracks step state for + !! time-series data reading. private type(adios2_adios) :: adios !! ADIOS2 global handler - type(adios2_io) :: io_handle !! ADIOS2 IO object for managing I/O - logical :: is_step_active = .false. !! Flag to track if a step is active - integer :: comm = MPI_COMM_NULL !! MPI communicator + type(adios2_io) :: io_handle !! ADIOS2 I/O object for managing variables + logical :: is_step_active = .false. !! Flag tracking if a step is active + integer :: comm = MPI_COMM_NULL !! MPI communicator for parallel I/O contains - procedure :: init => reader_init_adios2 - procedure :: open => reader_open_adios2 - procedure :: read_data_i8 => read_data_i8_adios2 - procedure :: read_data_integer => read_data_integer_adios2 - procedure :: read_data_real => read_data_real_adios2 - procedure :: read_data_array_3d => read_data_array_3d_adios2 - procedure :: finalise => finalise_reader_adios2 - procedure, private :: handle_error => handle_error_reader + procedure :: init => reader_init_adios2 !! Initialise reader + procedure :: open => reader_open_adios2 !! 
Open file for reading + procedure :: read_data_i8 => read_data_i8_adios2 !! Read 64-bit integer + procedure :: read_data_integer => read_data_integer_adios2 !! Read default integer + procedure :: read_data_real => read_data_real_adios2 !! Read double precision real + procedure :: read_data_array_3d => read_data_array_3d_adios2 !! Read 3D array with hyperslab + procedure :: finalise => finalise_reader_adios2 !! Finalise and clean up + procedure, private :: handle_error => handle_error_reader !! Error handling (internal) end type io_adios2_reader_t type, extends(io_writer_t) :: io_adios2_writer_t + !! ADIOS2 writer implementation for writing data to files. + !! + !! Manages ADIOS2 objects required for writing operations including + !! the global ADIOS handler, I/O object, and tracks step state for + !! time-series data writing. private type(adios2_adios) :: adios !! ADIOS2 global handler - type(adios2_io) :: io_handle !! ADIOS2 IO object for managing I/O - logical :: is_step_active = .false. !! Flag to track if a step is active - integer :: comm = MPI_COMM_NULL !! MPI communicator + type(adios2_io) :: io_handle !! ADIOS2 I/O object for managing variables + logical :: is_step_active = .false. !! Flag tracking if a step is active + integer :: comm = MPI_COMM_NULL !! MPI communicator for parallel I/O contains - procedure :: init => writer_init_adios2 - procedure :: open => writer_open_adios2 - procedure :: write_data_i8 => write_data_i8_adios2 - procedure :: write_data_integer => write_data_integer_adios2 - procedure :: write_data_real => write_data_real_adios2 - procedure :: write_data_array_3d => write_data_array_3d_adios2 - procedure :: write_attribute_string => write_attribute_string_adios2 + procedure :: init => writer_init_adios2 !! Initialise writer + procedure :: open => writer_open_adios2 !! Open file for writing + procedure :: write_data_i8 => write_data_i8_adios2 !! Write 64-bit integer + procedure :: write_data_integer => write_data_integer_adios2 !! 
Write default integer + procedure :: write_data_real => write_data_real_adios2 !! Write double precision real + procedure :: write_data_array_3d => write_data_array_3d_adios2 !! Write 3D array with hyperslab + procedure :: write_attribute_string => write_attribute_string_adios2 !! Write string attribute procedure :: write_attribute_array_1d_real => & - write_attribute_array_1d_real_adios2 - procedure :: finalise => finalise_writer_adios2 - procedure, private :: handle_error => handle_error_writer + write_attribute_array_1d_real_adios2 !! Write 1D real array attribute + procedure :: finalise => finalise_writer_adios2 !! Finalise and clean up + procedure, private :: handle_error => handle_error_writer !! Error handling (internal) end type io_adios2_writer_t type, extends(io_file_t) :: io_adios2_file_t + !! ADIOS2 file handle for open file operations. + !! + !! Wraps the ADIOS2 engine object and manages step-based I/O for + !! time-series data. Tracks whether file is opened for reading or + !! writing and current step state. private - type(adios2_engine) :: engine !! ADIOS2 engine for data reading/writing - logical :: is_step_active = .false. !! Flag to track if a step is active - logical :: is_writer = .false. !! Flag to track if this is for writing + type(adios2_engine) :: engine !! ADIOS2 engine for data transport + logical :: is_step_active = .false. !! Flag tracking if a step is active + logical :: is_writer = .false. !! True if file opened for writing contains - procedure :: close => file_close_adios2 - procedure :: begin_step => file_begin_step_adios2 - procedure :: end_step => file_end_step_adios2 - procedure, private :: handle_error => handle_error_file + procedure :: close => file_close_adios2 !! Close file and engine + procedure :: begin_step => file_begin_step_adios2 !! Begin new I/O step + procedure :: end_step => file_end_step_adios2 !! End current I/O step + procedure, private :: handle_error => handle_error_file !! 
Error handling (internal) end type io_adios2_file_t contains diff --git a/src/io/checkpoint_manager.f90 b/src/io/checkpoint_manager.f90 index 8bd2ed97f..205f08d0e 100644 --- a/src/io/checkpoint_manager.f90 +++ b/src/io/checkpoint_manager.f90 @@ -1,20 +1,36 @@ module m_checkpoint_manager -! @brief Manages the creation and restoration of simulation checkpoints -!! for restart capabilities. -!! -!! @details This module is responsible for periodically saving the full, unstrided -!! simulation state to a file. This allows a simulation to be stopped and resumed -!! from the exact state it was in. -!! -!! Key features include: -!! - Reading all checkpoint settings from a configuration file -!! - Periodically writing the full-resolution simulation state -!! - Handling the full logic for restarting a simulation from -!! a specified checkpoint file. -!! - A safe-write strategy that writes to a temporary file first, -!! then atomically renames it to the final filename to -!! prevent corrupted checkpoints. -!! - Optional cleanup of old checkpoint files to conserve disk space. + !! Manages creation and restoration of simulation checkpoints for restart. + !! + !! This module is responsible for periodically saving the full simulation + !! state to checkpoint files and restoring from them for restarts. This + !! allows simulations to be stopped and resumed from the exact state. + !! + !! **Key Features:** + !! + !! - Configuration via namelist (checkpoint frequency, prefix, etc.) + !! - Periodic writing of full-resolution simulation state + !! - Complete restart logic from specified checkpoint file + !! - Safe-write strategy: temporary file then atomic rename + !! - Optional cleanup of old checkpoints to conserve disk space + !! - Stores velocity fields (\(u, v, w\)), timestep, and simulation time + !! + !! **Safe-Write Strategy:** + !! + !! To prevent corrupted checkpoints from crashes during write: + !! + !! 1. Write to temporary file (e.g., `checkpoint_0001000.tmp.bp`) + !! 2. 
Atomic rename to final name (`checkpoint_0001000.bp`) + !! 3. Optionally delete previous checkpoint if `keep_checkpoint=false` + !! + !! **Configuration:** + !! + !! Controlled via `checkpoint_config_t` read from input namelist: + !! + !! - `checkpoint_freq`: write interval (iterations) + !! - `keep_checkpoint`: retain all checkpoints vs overwrite old ones + !! - `checkpoint_prefix`: filename prefix + !! - `restart_from_checkpoint`: enable restart + !! - `restart_file`: checkpoint file to restart from use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank, MPI_Abort use m_common, only: dp, i8, DIR_X, get_argument use m_field, only: field_t @@ -30,38 +46,48 @@ module m_checkpoint_manager implicit none type :: raw_old_field_buffer_t - real(dp), allocatable :: data(:, :, :) + !! Temporary buffer for field data (used internally). + real(dp), allocatable :: data(:, :, :) !! 3D array storage end type raw_old_field_buffer_t private public :: checkpoint_manager_t type :: checkpoint_manager_t - type(checkpoint_config_t) :: config - integer :: last_checkpoint_step = -1 - integer, dimension(3) :: full_resolution = [1, 1, 1] - type(field_buffer_map_t), allocatable :: field_buffers(:) - integer(i8), dimension(3) :: last_shape_dims = 0 - integer, dimension(3) :: last_stride_factors = 0 - integer(i8), dimension(3) :: last_output_shape = 0 + !! Manager for checkpoint file operations (writing and reading). + !! + !! Handles all aspects of checkpoint I/O including periodic writes + !! during simulation and restoration during restart. Maintains state + !! needed for consistent checkpoint operations across multiple writes. + type(checkpoint_config_t) :: config !! Checkpoint configuration settings + integer :: last_checkpoint_step = -1 !! Timestep of last checkpoint written + integer, dimension(3) :: full_resolution = [1, 1, 1] !! Global domain resolution [nx, ny, nz] + type(field_buffer_map_t), allocatable :: field_buffers(:) !! 
Buffers for field data I/O + integer(i8), dimension(3) :: last_shape_dims = 0 !! Shape dimensions from last write + integer, dimension(3) :: last_stride_factors = 0 !! Stride factors from last write + integer(i8), dimension(3) :: last_output_shape = 0 !! Output shape from last write contains - procedure :: init - procedure :: handle_restart - procedure :: handle_checkpoint_step - procedure :: is_restart - procedure :: finalise - procedure, private :: write_checkpoint - procedure, private :: restart_checkpoint - procedure, private :: write_fields - procedure, private :: cleanup_output_buffers + procedure :: init !! Initialise checkpoint manager + procedure :: handle_restart !! Restore from checkpoint file + procedure :: handle_checkpoint_step !! Write checkpoint if needed at timestep + procedure :: is_restart !! Check if this is a restart run + procedure :: finalise !! Clean up and finalise + procedure, private :: write_checkpoint !! Write checkpoint file (internal) + procedure, private :: restart_checkpoint !! Read checkpoint file (internal) + procedure, private :: write_fields !! Write field data to file (internal) + procedure, private :: cleanup_output_buffers !! Free output buffers (internal) end type checkpoint_manager_t contains subroutine init(self, comm) - !! Initialise checkpoint manager - class(checkpoint_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise checkpoint manager from configuration. + !! + !! Reads checkpoint settings from input namelist and configures + !! output if checkpoint frequency is positive. Prints checkpoint + !! settings on root process. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + integer, intent(in) :: comm !! MPI communicator self%config = checkpoint_config_t() call self%config%read(nml_file=get_argument(1)) @@ -72,10 +98,13 @@ subroutine init(self, comm) end subroutine init subroutine configure_output(self, comm) - !! Configure checkpoint output settings + !! 
Configure and print checkpoint output settings. + !! + !! Displays checkpoint configuration on root process including + !! frequency, retention policy, and file prefix. use m_io_backend, only: get_default_backend, IO_BACKEND_DUMMY - class(checkpoint_manager_t), intent(inout) :: self - integer, intent(in) :: comm + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + integer, intent(in) :: comm !! MPI communicator integer :: myrank, ierr @@ -89,18 +118,25 @@ subroutine configure_output(self, comm) end subroutine configure_output function is_restart(self) result(restart) - !! Check if this is a restart run - class(checkpoint_manager_t), intent(in) :: self - logical :: restart + !! Check if this is a restart run. + !! + !! Queries configuration to determine if simulation should restart + !! from an existing checkpoint file. + class(checkpoint_manager_t), intent(in) :: self !! Checkpoint manager instance + logical :: restart !! True if restarting from checkpoint restart = self%config%restart_from_checkpoint end function is_restart subroutine handle_restart(self, solver, comm) - !! Handle restart from checkpoint - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - integer, intent(in), optional :: comm + !! Restore solver state from checkpoint file. + !! + !! Reads velocity fields, timestep, and time from the checkpoint file + !! specified in configuration. Updates solver's current iteration counter. + !! Prints restart information on root process. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(inout) :: solver !! Solver to restore state into + integer, intent(in), optional :: comm !! 
MPI communicator (optional) character(len=256) :: restart_file integer :: restart_timestep @@ -123,11 +159,15 @@ subroutine handle_restart(self, solver, comm) end subroutine handle_restart subroutine handle_checkpoint_step(self, solver, timestep, comm) - !! Handle checkpoint writing at a given timestep - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Write checkpoint if frequency condition is met. + !! + !! Checks if current timestep is a checkpoint interval (divisible by + !! checkpoint_freq) and writes checkpoint file if so. Called each + !! timestep from main simulation loop. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) integer :: comm_to_use @@ -138,11 +178,26 @@ subroutine handle_checkpoint_step(self, solver, timestep, comm) end subroutine handle_checkpoint_step subroutine write_checkpoint(self, solver, timestep, comm) - !! Write a checkpoint file for simulation restart - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in) :: comm + !! Write checkpoint file using safe-write strategy (internal). + !! + !! Implements the checkpoint writing logic with atomic file operations + !! to prevent corruption. The procedure: + !! 1. Check if checkpoint is due (frequency condition) + !! 2. Write to temporary file (_temp.bp) + !! 3. Write metadata (timestep, time, dt, data location) + !! 4. Write velocity fields (u, v, w) via write_fields + !! 5. Write time integrator state (AB scheme coefficients if applicable) + !! 6. Close temporary file + !! 7. Atomic rename: temp file to final name + !! 8. 
Optionally delete previous checkpoint if keep_checkpoint=false + !! + !! **Safe-Write Strategy:** Writing to a temporary file and then renaming + !! ensures that if a crash occurs during write, the previous valid + !! checkpoint remains intact. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(in) :: solver !! Solver with state to save + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in) :: comm !! MPI communicator character(len=256) :: filename, temp_filename, old_filename integer :: ierr, myrank @@ -307,13 +362,29 @@ end subroutine write_checkpoint subroutine restart_checkpoint( & self, solver, filename, timestep, restart_time, comm & ) - !! Restart simulation state from checkpoint file - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - character(len=*), intent(in) :: filename - integer, intent(out) :: timestep - real(dp), intent(out) :: restart_time - integer, intent(in) :: comm + !! Restore simulation state from checkpoint file (internal). + !! + !! Reads all data from checkpoint file and restores solver state: + !! 1. Verify checkpoint file exists (abort if missing) + !! 2. Open checkpoint file for reading + !! 3. Read metadata (timestep, time, dt, data location) + !! 4. Read time integrator state (AB coefficients, order, step counters) + !! 5. Read velocity fields (u, v, w) with correct dimensions + !! 6. Restore time integrator state including history (olds arrays) + !! 7. Set solver data location to match checkpoint + !! + !! **Data Location:** Checkpoint records whether fields were stored at + !! vertices (VERT) or cell centers (CELL), and restoration preserves this. + !! + !! **Time Integrator State:** For Adams-Bashforth schemes, restores the + !! history of old field values (du_olds, dv_olds, dw_olds) needed for + !! multi-step time integration. + class(checkpoint_manager_t), intent(inout) :: self !! 
Checkpoint manager instance + class(solver_t), intent(inout) :: solver !! Solver to restore state into + character(len=*), intent(in) :: filename !! Checkpoint file path + integer, intent(out) :: timestep !! Timestep from checkpoint + real(dp), intent(out) :: restart_time !! Simulation time from checkpoint + integer, intent(in) :: comm !! MPI communicator type(reader_session_t) :: reader_session integer :: ierr, myrank, data_loc @@ -456,13 +527,28 @@ end subroutine restart_checkpoint subroutine write_fields( & self, field_names, host_fields, solver, writer_session, data_loc & ) - !! Write field data for checkpoints (no striding) - class(checkpoint_manager_t), intent(inout) :: self - character(len=*), dimension(:), intent(in) :: field_names - class(field_ptr_t), dimension(:), target, intent(in) :: host_fields - class(solver_t), intent(in) :: solver - type(writer_session_t), intent(inout) :: writer_session - integer, intent(in) :: data_loc + !! Write velocity field data to checkpoint file (internal). + !! + !! Writes field data at full resolution (no striding for checkpoints). + !! The procedure: + !! 1. Prepare field buffers for full resolution output + !! 2. Calculate output dimensions and hyperslab selection + !! 3. For each field (u, v, w): + !! - Copy field data to output buffer + !! - Write buffer to file with proper hyperslab parameters + !! + !! **Full Resolution:** Unlike snapshots (which can be strided), + !! checkpoints always write full-resolution data to enable exact restart. + !! + !! **Parallel I/O:** Each MPI rank writes its local subdomain using + !! hyperslab selection (output_start, output_count) to assemble the + !! global field in the file. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + character(len=*), dimension(:), intent(in) :: field_names !! Field names ["u", "v", "w"] + class(field_ptr_t), dimension(:), target, intent(in) :: host_fields !! Field pointers + class(solver_t), intent(in) :: solver !! 
Solver containing mesh info + type(writer_session_t), intent(inout) :: writer_session !! I/O writer session + integer, intent(in) :: data_loc !! Data location (VERT or CELL) integer :: i_field integer(i8), dimension(3) :: output_start, output_count @@ -505,15 +591,22 @@ subroutine write_fields( & end subroutine write_fields subroutine cleanup_output_buffers(self) - !! Clean up dynamic field buffers - class(checkpoint_manager_t), intent(inout) :: self + !! Clean up dynamically allocated field buffers (internal). + !! + !! Frees memory allocated for field I/O buffers. Called during + !! finalisation to prevent memory leaks. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance call cleanup_field_buffers(self%field_buffers) end subroutine cleanup_output_buffers subroutine finalise(self) - !! Clean up checkpoint manager - class(checkpoint_manager_t), intent(inout) :: self + !! Finalise checkpoint manager and free resources. + !! + !! Cleans up all dynamically allocated buffers. Should be called + !! at the end of simulation or when checkpoint manager is no longer + !! needed. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance call self%cleanup_output_buffers() end subroutine finalise diff --git a/src/io/dummy/io.f90 b/src/io/dummy/io.f90 index 69dbb3c73..0e282fbbe 100644 --- a/src/io/dummy/io.f90 +++ b/src/io/dummy/io.f90 @@ -1,20 +1,31 @@ module m_io_backend -!! @brief Provides a dummy, non-functional I/O backend for when an I/O backend -!! is not available -!! -!! @details This module provides a fallback implementation of the I/O backend -!! interface. It is used when no real I/O backend (e.g. ADIOS2) is enabled at -!! compile time. -!! -!! The primary purpose of this dummy backend is to allow the full program to -!! compile and link against the session interface (`m_io_session`) without -!! requiring a functional I/O library. -!! -!! @warning This is a non-functional stub. 
Calling any of its I/O procedures -!! will immediately terminate the program with an error message. -!! -!! @note If you require file I/O, you must recompile the code with a functional -!! backend + !! Dummy (non-functional) I/O backend for when no real backend is available. + !! + !! This module provides a fallback implementation of the I/O backend + !! interface used when no real I/O backend (e.g., ADIOS2) is enabled at + !! compile time. It allows the code to compile and link without a functional + !! I/O library. + !! + !! **Purpose:** + !! + !! - Enables compilation without external I/O library dependencies + !! - Provides informative error messages when I/O operations are attempted + !! - Allows code structure to remain consistent regardless of I/O backend + !! + !! **Behaviour:** + !! + !! - Write operations are silently ignored (no-op) + !! - Read operations terminate with error message directing user to recompile + !! - File open/close operations are tracked but perform no actual I/O + !! + !! **Use Cases:** + !! + !! - Testing/debugging without I/O overhead + !! - Systems where ADIOS2 is unavailable + !! - Dry runs to validate simulation setup + !! + !! **Warning:** This is a non-functional stub. If you require actual file I/O, + !! recompile with `-DWITH_ADIOS2=ON` to enable the ADIOS2 backend. use iso_fortran_env, only: stderr => error_unit use m_io_base, only: io_reader_t, io_writer_t, io_file_t, io_mode_read, & io_mode_write @@ -26,45 +37,48 @@ module m_io_backend public :: allocate_io_reader, allocate_io_writer public :: get_default_backend, IO_BACKEND_DUMMY, IO_BACKEND_ADIOS2 - logical, save :: write_warning_shown = .false. + logical, save :: write_warning_shown = .false. !! Track if warning has been displayed - integer, parameter :: IO_BACKEND_DUMMY = 0 - integer, parameter :: IO_BACKEND_ADIOS2 = 1 + integer, parameter :: IO_BACKEND_DUMMY = 0 !! Dummy backend identifier + integer, parameter :: IO_BACKEND_ADIOS2 = 1 !! 
ADIOS2 backend identifier type, extends(io_file_t) :: io_dummy_file_t - logical :: is_open = .false. + !! Dummy file handle (tracks state but performs no I/O). + logical :: is_open = .false. !! File open state flag contains - procedure :: close => file_close_dummy - procedure :: begin_step => file_begin_step_dummy - procedure :: end_step => file_end_step_dummy - procedure :: is_file_functional => is_file_functional_dummy + procedure :: close => file_close_dummy !! Close file (no-op) + procedure :: begin_step => file_begin_step_dummy !! Begin step (no-op) + procedure :: end_step => file_end_step_dummy !! End step (no-op) + procedure :: is_file_functional => is_file_functional_dummy !! Check if functional end type io_dummy_file_t type, extends(io_reader_t) :: io_dummy_reader_t - logical :: initialised = .false. + !! Dummy reader (errors on read attempts). + logical :: initialised = .false. !! Initialisation state flag contains - procedure :: init => reader_init_dummy - procedure :: open => reader_open_dummy - procedure :: finalise => reader_finalise_dummy - procedure :: read_data_i8 => read_data_i8_dummy - procedure :: read_data_integer => read_data_integer_dummy - procedure :: read_data_real => read_data_real_dummy - procedure :: read_data_array_3d => read_data_array_3d_dummy + procedure :: init => reader_init_dummy !! Initialise reader + procedure :: open => reader_open_dummy !! Open file (returns non-functional handle) + procedure :: finalise => reader_finalise_dummy !! Finalise (no-op) + procedure :: read_data_i8 => read_data_i8_dummy !! Read i8 (errors) + procedure :: read_data_integer => read_data_integer_dummy !! Read integer (errors) + procedure :: read_data_real => read_data_real_dummy !! Read real (errors) + procedure :: read_data_array_3d => read_data_array_3d_dummy !! Read 3D array (errors) end type io_dummy_reader_t type, extends(io_writer_t) :: io_dummy_writer_t - logical :: initialised = .false. + !! Dummy writer (silently ignores write operations). 
+ logical :: initialised = .false. !! Initialisation state flag contains - procedure :: init => writer_init_dummy - procedure :: open => writer_open_dummy - procedure :: finalise => writer_finalise_dummy - procedure :: write_data_i8 => write_data_i8_dummy - procedure :: write_data_integer => write_data_integer_dummy - procedure :: write_data_real => write_data_real_dummy - procedure :: write_data_array_3d => write_data_array_3d_dummy - procedure :: write_attribute_string => write_attribute_string_dummy + procedure :: init => writer_init_dummy !! Initialise writer + procedure :: open => writer_open_dummy !! Open file (returns non-functional handle) + procedure :: finalise => writer_finalise_dummy !! Finalise (no-op) + procedure :: write_data_i8 => write_data_i8_dummy !! Write i8 (no-op) + procedure :: write_data_integer => write_data_integer_dummy !! Write integer (no-op) + procedure :: write_data_real => write_data_real_dummy !! Write real (no-op) + procedure :: write_data_array_3d => write_data_array_3d_dummy !! Write 3D array (no-op) + procedure :: write_attribute_string => write_attribute_string_dummy !! Write string attribute (no-op) procedure :: write_attribute_array_1d_real => & - write_attribute_array_1d_real_dummy + write_attribute_array_1d_real_dummy !! Write 1D real array attribute (no-op) end type io_dummy_writer_t contains diff --git a/src/io/io_base.f90 b/src/io/io_base.f90 index c6860409f..84537be06 100644 --- a/src/io/io_base.f90 +++ b/src/io/io_base.f90 @@ -1,32 +1,31 @@ module m_io_base -!! @brief Provides the abstract base types and interfaces for the session-based -!! I/O architecture. -!! -!! @details This internal module defines the fundamental building blocks of -!! the I/O system. It establishes a polymorphic layer that allows the -!! high-level user session to interact with various I/O backends through a -!! consistent interface. -!! -!! The architecture is designed in distinct layers: -!! User code -!! 
- interacts only with the Session layer -!! -!! Session layer (`m_io_session`) -!! - manages all I/O complexity (file handles, state, etc.) -!! - instantiates the I/O backend selected at compile-time -!! - provides `reader_session_t` and `writer_session_t` for users -!! -!! Backend layer (`m_io_backend`) -!! - concrete implementation of an I/O backed (e.g., ADIOS2) -!! - extends the abstract base types defined in this module -!! -!! Base layer (`m_io_base`, this module) -!! - provides abstract `reader_base_t` and `writer_base_t` types -!! - enforces a consistent interface for all backends -!! -!! @note This is an internal module and should not be used directly by users. -!! The sole public interface for I/O is the high-level session API provided in -!! `m_io_session`. + !! Abstract base types and interfaces for session-based I/O architecture. + !! + !! This internal module defines the fundamental building blocks of the I/O + !! system. It establishes a polymorphic layer that allows the high-level + !! user session to interact with various I/O backends (e.g., ADIOS2, dummy) + !! through a consistent interface. + !! + !! **Architecture Layers:** + !! + !! 1. **User Code** - interacts only with the Session layer + !! + !! 2. **Session Layer** (`m_io_session`) + !! - Manages all I/O complexity (file handles, state, etc.) + !! - Instantiates the I/O backend selected at compile-time + !! - Provides `reader_session_t` and `writer_session_t` for users + !! + !! 3. **Backend Layer** (`m_io_backend`) + !! - Concrete implementation of an I/O backend (e.g., ADIOS2) + !! - Extends the abstract base types defined in this module + !! + !! 4. **Base Layer** (`m_io_base`, this module) + !! - Provides abstract `io_reader_t` and `io_writer_t` types + !! - Enforces a consistent interface for all backends + !! + !! **Note:** This is an internal module and should not be used directly by + !! users. The sole public interface for I/O is the high-level session API + !! 
provided in `m_io_session`. use m_common, only: dp, i8 @@ -36,50 +35,64 @@ module m_io_base public :: io_reader_t, io_writer_t, io_file_t public :: io_mode_read, io_mode_write - integer, parameter :: io_mode_read = 1 - integer, parameter :: io_mode_write = 2 + integer, parameter :: io_mode_read = 1 !! Read mode flag for opening files + integer, parameter :: io_mode_write = 2 !! Write mode flag for opening files - !> Base file handle for I/O operations type :: io_file_t + !! Base file handle for I/O operations. + !! + !! This abstract type represents an open file handle. Concrete backends + !! extend this type to implement backend-specific file operations. + !! Provides step-based I/O for time-series data. contains - procedure :: close => base_close - procedure :: begin_step => base_begin_step - procedure :: end_step => base_end_step - procedure :: is_file_functional => base_is_file_functional + procedure :: close => base_close !! Close the file + procedure :: begin_step => base_begin_step !! Begin a new I/O step + procedure :: end_step => base_end_step !! End current I/O step + procedure :: is_file_functional => base_is_file_functional !! Check if file is operational end type io_file_t - !> Base I/O reader type for polymorphic usage type :: io_reader_t + !! Base I/O reader type for polymorphic usage. + !! + !! This abstract type provides the interface for reading data from files. + !! Concrete backends (e.g., ADIOS2) extend this type to implement + !! backend-specific reading operations. Supports reading scalars and + !! 3D arrays with optional hyperslab selection. contains - procedure :: init => base_reader_init - procedure :: open => base_reader_open - procedure :: finalise => base_reader_finalise + procedure :: init => base_reader_init !! Initialise reader + procedure :: open => base_reader_open !! Open file for reading + procedure :: finalise => base_reader_finalise !! Finalise and clean up ! 
Generic interfaces for session usage generic :: read_data => read_data_i8, read_data_integer, read_data_real, & - read_data_array_3d - procedure :: read_data_i8 - procedure :: read_data_integer - procedure :: read_data_real - procedure :: read_data_array_3d + read_data_array_3d !! Read data (generic interface) + procedure :: read_data_i8 !! Read 64-bit integer + procedure :: read_data_integer !! Read default integer + procedure :: read_data_real !! Read double precision real + procedure :: read_data_array_3d !! Read 3D array end type io_reader_t - !> Base I/O writer type for polymorphic usage type :: io_writer_t + !! Base I/O writer type for polymorphic usage. + !! + !! This abstract type provides the interface for writing data to files. + !! Concrete backends (e.g., ADIOS2) extend this type to implement + !! backend-specific writing operations. Supports writing scalars, + !! 3D arrays, and attributes. contains - procedure :: init => base_writer_init - procedure :: open => base_writer_open - procedure :: finalise => base_writer_finalise + procedure :: init => base_writer_init !! Initialise writer + procedure :: open => base_writer_open !! Open file for writing + procedure :: finalise => base_writer_finalise !! Finalise and clean up generic :: write_data => write_data_i8, write_data_integer, & write_data_real, & - write_data_array_3d - procedure :: write_data_i8 - procedure :: write_data_integer - procedure :: write_data_real - procedure :: write_data_array_3d + write_data_array_3d !! Write data (generic interface) + procedure :: write_data_i8 !! Write 64-bit integer + procedure :: write_data_integer !! Write default integer + procedure :: write_data_real !! Write double precision real + procedure :: write_data_array_3d !! Write 3D array generic :: write_attribute => write_attribute_string, & - write_attribute_array_1d_real - procedure :: write_attribute_string - procedure :: write_attribute_array_1d_real + write_attribute_array_1d_real !! 
Write attribute (generic interface) + procedure :: write_attribute_string !! Write string attribute + procedure :: write_attribute_array_1d_real !! Write 1D real array attribute end type io_writer_t contains diff --git a/src/io/io_field_utils.f90 b/src/io/io_field_utils.f90 index be8911a7b..8ebb3dc17 100644 --- a/src/io/io_field_utils.f90 +++ b/src/io/io_field_utils.f90 @@ -1,10 +1,11 @@ module m_io_field_utils -!! @brief Provides common utilities and helper routines for field I/O -!! operations +!! Common utilities and helper routines for field I/O operations. +!! +!! This module contains a collection of procedures and derived types that +!! handle the low-level tasks required for writing field data. +!! +!! **Primary functionalities:** !! -!! @details This module contains a collection of procedures and derived -!! types that handle the low-level tasks required for writing field data -!! Its primary functionalities include: !! - Data sub-sampling (striding) - applying a stride to data to reduce the !! size of the output files !! - Parallel I/O calculations - determining correct global shapes, @@ -27,14 +28,56 @@ module m_io_field_utils cleanup_field_buffers type :: field_buffer_map_t - ! Race-free field buffer mapping for async I/O operations. - ! Each field gets its own dedicated buffer to prevent data races - ! when multiple async write operations are in flight. + !! Named buffer for thread-safe asynchronous I/O operations. + !! + !! This type maps a field name to its dedicated memory buffer, preventing + !! data races when multiple asynchronous write operations are in flight + !! simultaneously. + !! + !! **Purpose:** + !! + !! During asynchronous I/O, fields are copied into persistent buffers that + !! remain valid while I/O operations execute in the background. Each field + !! gets its own buffer identified by name, ensuring: + !! + !! - **Thread safety**: No conflicts between concurrent writes + !! 
- **Data integrity**: Field data remains stable during async operations + !! - **Flexibility**: Supports strided/downsampled data for visualization + !! + !! **Workflow:** + !! + !! 1. `prepare_field_buffers`: Allocate buffers for all fields + !! 2. `write_single_field_to_buffer`: Copy field data into named buffer + !! 3. ADIOS2 writes from buffer (async, non-blocking) + !! 4. `cleanup_field_buffers`: Deallocate buffers when done + !! + !! **Components:** + !! + !! - `field_name`: Identifier for buffer lookup (e.g., "u", "v", "w", "p") + !! - `buffer`: 3D array holding field data (possibly strided) character(len=32) :: field_name real(dp), dimension(:, :, :), allocatable :: buffer end type field_buffer_map_t type :: field_ptr_t + !! Wrapper type for storing polymorphic field pointers in arrays. + !! + !! Fortran does not allow allocatable arrays of polymorphic pointers directly + !! (e.g., `class(field_t), pointer :: fields(:)`), so this wrapper type + !! enables creating arrays of field pointers: + !! + !! ```fortran + !! type(field_ptr_t), allocatable :: field_array(:) + !! ``` + !! + !! **Use cases:** + !! + !! - Managing multiple fields for I/O operations + !! - Storing references to velocity components (u, v, w) + !! - Building lists of fields to write/read simultaneously + !! + !! **Note:** Each `field_ptr_t` holds a pointer to a `field_t` object; + !! the pointer can be null if not yet associated. class(field_t), pointer :: ptr => null() end type field_ptr_t diff --git a/src/io/io_manager.f90 b/src/io/io_manager.f90 index a9b50a72f..18dab197c 100644 --- a/src/io/io_manager.f90 +++ b/src/io/io_manager.f90 @@ -1,12 +1,29 @@ module m_io_manager -!! @brief Provides a high-level manager that orchestrates all checkpoint and -!! snapshot operations. -!! -!! @details This module acts as a facade to the I/O subsystem. -!! Its purpose is to simplify the main simulation loop by providing -!! a single point of contact for all I/O-related actions. 
The mainprogram only -!! needs to interact with the `io_manager_t` type, which then delegates tasks -!! to the specialised checkpoint and snapshot managers. + !! High-level manager orchestrating checkpoint and snapshot operations. + !! + !! This module acts as a facade to the I/O subsystem, simplifying the main + !! simulation loop by providing a single point of contact for all I/O-related + !! actions. The main program only needs to interact with `io_manager_t`, which + !! delegates tasks to specialised checkpoint and snapshot managers. + !! + !! **Responsibilities:** + !! + !! - Initialise checkpoint and snapshot managers + !! - Coordinate restart from checkpoints + !! - Orchestrate periodic checkpoint and snapshot writes + !! - Finalise I/O operations and clean up resources + !! + !! **Usage Pattern:** + !! + !! ```fortran + !! type(io_manager_t) :: io_mgr + !! call io_mgr%init(comm) + !! if (io_mgr%is_restart()) call io_mgr%handle_restart(solver, comm) + !! do timestep = 1, n_steps + !! call io_mgr%handle_io_step(solver, timestep, comm) + !! end do + !! call io_mgr%finalise() + !! ``` use m_checkpoint_manager, only: checkpoint_manager_t use m_snapshot_manager, only: snapshot_manager_t use m_solver, only: solver_t @@ -17,53 +34,90 @@ module m_io_manager public :: io_manager_t type :: io_manager_t - type(checkpoint_manager_t) :: checkpoint_mgr - type(snapshot_manager_t) :: snapshot_mgr + !! Unified manager for checkpoint and snapshot operations. + !! + !! Contains both checkpoint and snapshot managers and provides + !! a simplified interface for the main simulation loop. + type(checkpoint_manager_t) :: checkpoint_mgr !! Manages restart and checkpoint files + type(snapshot_manager_t) :: snapshot_mgr !! 
Manages visualisation output files contains - procedure :: init => io_init - procedure :: handle_restart => io_handle_restart - procedure :: handle_io_step => io_handle_step - procedure :: finalise => io_finalise - procedure :: is_restart => io_is_restart + procedure :: init => io_init !! Initialise I/O managers + procedure :: handle_restart => io_handle_restart !! Load restart data if needed + procedure :: handle_io_step => io_handle_step !! Process checkpoints/snapshots for timestep + procedure :: finalise => io_finalise !! Finalise and clean up + procedure :: is_restart => io_is_restart !! Check if simulation is restarting end type io_manager_t contains subroutine io_init(self, comm) - class(io_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise checkpoint and snapshot managers. + !! + !! Sets up both managers by passing the MPI communicator. Each manager + !! reads its configuration and prepares for I/O operations. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + integer, intent(in) :: comm !! MPI communicator call self%checkpoint_mgr%init(comm) call self%snapshot_mgr%init(comm) end subroutine io_init subroutine io_handle_restart(self, solver, comm) - class(io_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - integer, intent(in), optional :: comm + !! Handle restart by loading checkpoint data. + !! + !! Delegates to the checkpoint manager to load solver state from + !! the most recent checkpoint file. Should only be called if + !! `is_restart()` returns true. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + class(solver_t), intent(inout) :: solver !! Solver to load state into + integer, intent(in), optional :: comm !! 
MPI communicator (optional) call self%checkpoint_mgr%handle_restart(solver, comm) end subroutine io_handle_restart subroutine io_handle_step(self, solver, timestep, comm) - class(io_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Handle I/O operations for current timestep. + !! + !! Checks if checkpoint or snapshot output is required at this timestep + !! and writes data accordingly. Typically called at the end of each + !! timestep in the main simulation loop. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) call self%checkpoint_mgr%handle_checkpoint_step(solver, timestep, comm) call self%snapshot_mgr%handle_snapshot_step(solver, timestep, comm) end subroutine io_handle_step function io_is_restart(self) result(is_restart) - class(io_manager_t), intent(in) :: self - logical :: is_restart + !! Check if simulation is restarting from checkpoint. + !! + !! Queries the checkpoint manager to determine if a restart file + !! exists and should be loaded. + implicit none + + class(io_manager_t), intent(in) :: self !! I/O manager instance + logical :: is_restart !! True if restarting from checkpoint is_restart = self%checkpoint_mgr%is_restart() end function io_is_restart subroutine io_finalise(self) - class(io_manager_t), intent(inout) :: self + !! Finalise I/O operations and clean up resources. + !! + !! Closes any open files and releases resources held by both + !! checkpoint and snapshot managers. Should be called at the end + !! of the simulation. + implicit none + + class(io_manager_t), intent(inout) :: self !! 
I/O manager instance call self%checkpoint_mgr%finalise() call self%snapshot_mgr%finalise() diff --git a/src/io/io_session.f90 b/src/io/io_session.f90 index 9eae14f7b..21c0de7e7 100644 --- a/src/io/io_session.f90 +++ b/src/io/io_session.f90 @@ -1,12 +1,12 @@ module m_io_session -!! @brief Provides high-level, session-based user interface for all I/O -!! operations +!! High-level, session-based user interface for all I/O operations. !! -!! @details This module is the sole entry point for file reading and writing. +!! This module is the sole entry point for file reading and writing. !! It abstracts away all backend details and provides a type-safe interface !! for all I/O tasks. !! -!! Key features: +!! **Key features:** +!! !! - Type-safe sessions: specialised `reader_session_t` and `writer_session_t` !! types for reading and writing operations, respectively. !! - Automatic backend selection: based on compile-time options @@ -16,10 +16,11 @@ module m_io_session !! `open -> read/write -> close` workflow, with no need for manual file handle !! management or explicit cleanup calls. !! -!! @example -!! A typical usage pattern for reading data and writing data: +!! **Usage Example:** +!! +!! A typical usage pattern for reading and writing data: !! -!! @code{.f90} +!! ```fortran !! use m_io_session, only: writer_session_t, reader_session_t !! !! implicit none @@ -39,9 +40,9 @@ module m_io_session !! call reader%read_data("temperature", temp_field) !! call reader%close() !! ! Note: reader is automatically cleaned up when it goes out of scope -!! @endcode +!! ``` !! -!! @note Users should only use the types provided by this module. The lower-level +!! **Note:** Users should only use the types provided by this module. The lower-level !! modules like `m_io_base` and `m_io_backend` are internal components and should !! never be used directly in user code. 
use m_common, only: dp, i8 @@ -68,16 +69,19 @@ module m_io_session procedure :: close => session_base_close end type io_session_base_t - !> **PRIMARY TYPE FOR READING DATA** - Use this for all file reading operations + !> PRIMARY TYPE FOR READING DATA - Use this for all file reading operations !! This is the only interface users should use for reading data. !! Provides type-safe reading operations with automatic backend selection. !! - !! Usage example: + !! **Usage example:** + !! + !! ```fortran !! type(reader_session_t) :: reader_session !! call reader_session%open("checkpoint.bp", MPI_COMM_WORLD) !! call reader_session%read_data("timestep", timestep) !! call reader_session%read_data("velocity_u", u_field, start_dims, count_dims) !! call reader_session%close() + !! ``` type, extends(io_session_base_t) :: reader_session_t private class(io_reader_t), allocatable :: reader @@ -94,18 +98,20 @@ module m_io_session final :: reader_session_finaliser end type reader_session_t - !> **PRIMARY TYPE FOR WRITING DATA** - Use this for all file writing operations + !> PRIMARY TYPE FOR WRITING DATA - Use this for all file writing operations !! This is the only interface users should use for writing data. !! Provides type-safe writing operations with automatic backend selection. !! - !! Usage example: - !! type(writer_session_t) :: writer_session - !! call writer_session%open("output.bp", MPI_COMM_WORLD) - !! call writer_session%write_data("timestep", current_step) - !! call writer_session%write_data("pressure", p_field, start_dims, count_dims) - !! call writer_session%close() - !! call writer_session%write_attribute("ParaView", "vtk_xml_content") - !! call writer_session%close() + !! **Usage example:** + !! + !! ```fortran + !! type(writer_session_t) :: writer_session + !! call writer_session%open("output.bp", MPI_COMM_WORLD) + !! call writer_session%write_data("timestep", current_step) + !! call writer_session%write_data("pressure", p_field, start_dims, count_dims) + !! 
call writer_session%close() + !! call writer_session%write_attribute("ParaView", "vtk_xml_content") + !! ``` type, extends(io_session_base_t) :: writer_session_t private class(io_writer_t), allocatable :: writer diff --git a/src/io/snapshot_manager.f90 b/src/io/snapshot_manager.f90 index 68ddfa841..1d03052a0 100644 --- a/src/io/snapshot_manager.f90 +++ b/src/io/snapshot_manager.f90 @@ -1,11 +1,33 @@ module m_snapshot_manager -!! @brief Manages the creation of simulation snapshots for post-processing -!! and visualisation. -!! -!! @details This module is responsible for periodically writing simulation -!! data to files intended for analysis and visualisation -!! Unlike checkpoints, which are always full-resolution for exact restarts, -!! snapshots can be strided to reduce file size. + !! Manages creation of simulation snapshots for post-processing and visualisation. + !! + !! This module periodically writes simulation data to files intended for + !! analysis and visualisation. Unlike checkpoints (full-resolution for exact + !! restarts), snapshots can be strided to reduce file size while retaining + !! sufficient resolution for visualisation. + !! + !! **Key Differences from Checkpoints:** + !! + !! - **Purpose**: Visualisation/analysis vs exact restart + !! - **Resolution**: Can be strided (e.g., every 2nd point) vs full resolution + !! - **Frequency**: Typically more frequent than checkpoints + !! - **File Management**: Single persistent file with multiple timesteps vs + !! separate files per checkpoint + !! + !! **Features:** + !! + !! - Configurable spatial striding to reduce output size + !! - Persistent file handle (stays open across multiple writes) + !! - Generates VTK-compatible XML for ParaView visualisation + !! - Writes velocity fields at each snapshot interval + !! + !! **Configuration:** + !! + !! Controlled via `checkpoint_config_t` read from input namelist: + !! + !! - `snapshot_freq`: write interval (iterations) + !! 
- `snapshot_prefix`: filename prefix + !! - `output_stride`: spatial stride factors [`sx`, `sy`, `sz`] use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank use m_common, only: dp, i8, DIR_C, VERT, get_argument use m_field, only: field_t @@ -24,34 +46,43 @@ module m_snapshot_manager public :: snapshot_manager_t type :: snapshot_manager_t - type(checkpoint_config_t) :: config - integer, dimension(3) :: output_stride = [1, 1, 1] - type(field_buffer_map_t), allocatable :: field_buffers(:) - integer(i8), dimension(3) :: last_shape_dims = 0 - integer, dimension(3) :: last_stride_factors = 0 - integer(i8), dimension(3) :: last_output_shape = 0 - character(len=4096) :: vtk_xml = "" - logical :: is_snapshot_file_open = .false. - type(writer_session_t) :: snapshot_writer - logical :: convert_to_sp = .false. !! Flag for single precision snapshots + !! Manager for snapshot file operations (periodic visualisation output). + !! + !! Handles periodic writing of visualisation data with optional striding. + !! Maintains a persistent file handle that stays open across multiple + !! snapshot writes for efficient I/O. + type(checkpoint_config_t) :: config !! Configuration settings + integer, dimension(3) :: output_stride = [1, 1, 1] !! Spatial stride factors [sx, sy, sz] + type(field_buffer_map_t), allocatable :: field_buffers(:) !! Buffers for field data I/O + integer(i8), dimension(3) :: last_shape_dims = 0 !! Shape dimensions from last write + integer, dimension(3) :: last_stride_factors = 0 !! Stride factors from last write + integer(i8), dimension(3) :: last_output_shape = 0 !! Output shape from last write + character(len=4096) :: vtk_xml = "" !! VTK XML metadata for ParaView + logical :: is_snapshot_file_open = .false. !! File handle state flag + type(writer_session_t) :: snapshot_writer !! I/O session writer + logical :: convert_to_sp = .false. !! 
Flag for single precision snapshots contains - procedure :: init - procedure :: handle_snapshot_step - procedure :: finalise - procedure, private :: write_snapshot - procedure, private :: write_fields - procedure, private :: cleanup_output_buffers - procedure, private :: generate_vtk_xml - procedure, private :: open_snapshot_file - procedure, private :: close_snapshot_file + procedure :: init !! Initialise snapshot manager + procedure :: handle_snapshot_step !! Write snapshot if needed at timestep + procedure :: finalise !! Clean up and finalise + procedure, private :: write_snapshot !! Write snapshot file (internal) + procedure, private :: write_fields !! Write field data to file (internal) + procedure, private :: cleanup_output_buffers !! Free output buffers (internal) + procedure, private :: generate_vtk_xml !! Generate VTK XML metadata (internal) + procedure, private :: open_snapshot_file !! Open snapshot file (internal) + procedure, private :: close_snapshot_file !! Close snapshot file (internal) end type snapshot_manager_t contains subroutine init(self, comm) - !! Initialise snapshot manager - class(snapshot_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise snapshot manager from configuration. + !! + !! Reads snapshot settings from input namelist and configures + !! output if snapshot frequency is positive. Prints snapshot + !! settings including stride factors on root process. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer, intent(in) :: comm !! MPI communicator self%config = checkpoint_config_t() call self%config%read(nml_file=get_argument(1)) @@ -62,10 +93,13 @@ subroutine init(self, comm) end subroutine init subroutine configure_output(self, comm) - !! Configure snapshot output settings + !! Configure and print snapshot output settings. + !! + !! Displays snapshot configuration on root process including + !! frequency, file prefix, and output stride factors. 
use m_io_backend, only: get_default_backend, IO_BACKEND_DUMMY - class(snapshot_manager_t), intent(inout) :: self - integer, intent(in) :: comm + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer, intent(in) :: comm !! MPI communicator integer :: myrank, ierr @@ -84,11 +118,15 @@ subroutine configure_output(self, comm) end subroutine configure_output subroutine handle_snapshot_step(self, solver, timestep, comm) - !! Handle snapshot writing at a given timestep - class(snapshot_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Write snapshot if frequency condition is met. + !! + !! Checks if current timestep is a snapshot interval (divisible by + !! snapshot_freq) and writes snapshot if so. Called each timestep + !! from main simulation loop. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) integer :: comm_to_use @@ -99,13 +137,15 @@ subroutine handle_snapshot_step(self, solver, timestep, comm) end subroutine handle_snapshot_step subroutine write_snapshot(self, solver, timestep, comm) - !! Write a snapshot file for visualisation - !! Uses a persistent file that stays open across multiple snapshots - !! Each snapshot is written as a separate timestep in the file - class(snapshot_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in) :: comm + !! Write a snapshot file for visualisation. + !! + !! Uses a persistent file that stays open across multiple snapshots. + !! Each snapshot is written as a separate timestep within the file. + !! Data can be strided according to output_stride configuration. 
+ class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + class(solver_t), intent(in) :: solver !! Solver containing field data + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in) :: comm !! MPI communicator character(len=*), parameter :: field_names(*) = ["u", "v", "w"] integer :: myrank, ierr @@ -179,11 +219,27 @@ end subroutine write_snapshot subroutine generate_vtk_xml(self, dims, fields, origin, spacing) - !! Generate VTK XML string for ImageData format for ParaView's ADIOS2VTXReader - class(snapshot_manager_t), intent(inout) :: self - integer(i8), dimension(3), intent(in) :: dims - character(len=*), dimension(:), intent(in) :: fields - real(dp), dimension(3), intent(in) :: origin, spacing + !! Generate VTK XML metadata for ParaView visualisation (internal). + !! + !! Creates VTK ImageData XML string that describes the structured grid + !! for ParaView's ADIOS2VTXReader. This enables direct visualisation of + !! ADIOS2 files in ParaView without conversion. + !! + !! **VTK ImageData Format:** + !! + !! - Defines structured rectilinear grid with uniform spacing + !! - Extent: grid dimensions from 0 to N-1 in (z,y,x) order + !! - Origin: physical coordinates of first grid point + !! - Spacing: grid resolution (dx, dy, dz) + !! - Point data: velocity fields (u, v, w) stored at grid points + !! + !! **Note:** VTK uses (x,y,z) order while X3D2 uses (z,y,x) internally, + !! requiring dimension reordering in the extent string. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer(i8), dimension(3), intent(in) :: dims !! Grid dimensions [nx, ny, nz] + character(len=*), dimension(:), intent(in) :: fields !! Field names ["u", "v", "w"] + real(dp), dimension(3), intent(in) :: origin !! Grid origin [x0, y0, z0] + real(dp), dimension(3), intent(in) :: spacing !! 
Grid spacing [dx, dy, dz] character(len=4096) :: xml character(len=96) :: extent_str, origin_str, spacing_str @@ -223,13 +279,28 @@ end subroutine generate_vtk_xml subroutine write_fields( & self, field_names, host_fields, solver, writer_session, data_loc & ) - !! Write field data with striding for snapshots - class(snapshot_manager_t), intent(inout) :: self - character(len=*), dimension(:), intent(in) :: field_names - class(field_ptr_t), dimension(:), target, intent(in) :: host_fields - class(solver_t), intent(in) :: solver - type(writer_session_t), intent(inout) :: writer_session - integer, intent(in) :: data_loc + !! Write field data with optional striding for snapshots (internal). + !! + !! Writes field data with spatial striding to reduce file size while + !! maintaining sufficient resolution for visualisation. The procedure: + !! 1. Prepare field buffers with configured stride factors + !! 2. Calculate strided output dimensions and hyperslab selection + !! 3. For each field (u, v, w): + !! - Copy strided field data to output buffer + !! - Write buffer to file with proper hyperslab parameters + !! + !! **Striding:** Unlike checkpoints (full resolution), snapshots can + !! subsample data. For example, stride [2,2,2] writes every 2nd point + !! in each direction, reducing file size by a factor of 8. + !! + !! **Parallel I/O:** Each MPI rank writes its strided local subdomain + !! using hyperslab selection to assemble the strided global field. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + character(len=*), dimension(:), intent(in) :: field_names !! Field names ["u", "v", "w"] + class(field_ptr_t), dimension(:), target, intent(in) :: host_fields !! Field pointers + class(solver_t), intent(in) :: solver !! Solver containing mesh info + type(writer_session_t), intent(inout) :: writer_session !! I/O writer session + integer, intent(in) :: data_loc !! 
Data location (VERT or CELL) integer :: i_field integer(i8), dimension(3) :: output_start, output_count @@ -272,26 +343,49 @@ end subroutine write_fields subroutine cleanup_output_buffers(self) - !! Clean up dynamic field buffers - class(snapshot_manager_t), intent(inout) :: self + !! Clean up dynamically allocated field buffers (internal). + !! + !! Frees memory allocated for field I/O buffers. Called during + !! finalisation to prevent memory leaks. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance call cleanup_field_buffers(self%field_buffers) end subroutine cleanup_output_buffers subroutine finalise(self) - !! Clean up snapshot manager - class(snapshot_manager_t), intent(inout) :: self + !! Finalise snapshot manager and free resources. + !! + !! Cleans up all dynamically allocated buffers and closes the + !! persistent snapshot file. Should be called at the end of + !! the simulation or when snapshot manager is no longer needed. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance call self%cleanup_output_buffers() call self%close_snapshot_file() end subroutine finalise subroutine open_snapshot_file(self, filename, comm) - !! Open a persistent snapshot file - !! ADIOS2 handles both creating new files and appending to existing ones - class(snapshot_manager_t), intent(inout) :: self - character(len=*), intent(in) :: filename - integer, intent(in) :: comm + !! Open persistent snapshot file for appending timesteps (internal). + !! + !! Opens or creates a snapshot file that remains open across multiple + !! snapshot writes. Each snapshot is written as a new timestep within + !! the same file, enabling efficient time-series visualisation. + !! + !! **Persistent File Strategy:** + !! + !! - File opened once at first snapshot + !! - Remains open for subsequent snapshots (append mode) + !! - Each write adds a new timestep to the file + !! - Closed only during finalisation + !! + !! 
**Benefits:** Reduces file open/close overhead and keeps all snapshots + !! in a single file for easy ParaView animation. + !! + !! **ADIOS2 Behaviour:** Automatically handles both creating new files + !! and appending to existing ones based on file existence. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + character(len=*), intent(in) :: filename !! Snapshot file path + integer, intent(in) :: comm !! MPI communicator logical :: file_exists integer :: myrank, ierr diff --git a/src/mesh.f90 b/src/mesh.f90 index bb0dac1c7..a9b625654 100644 --- a/src/mesh.f90 +++ b/src/mesh.f90 @@ -1,4 +1,11 @@ module m_mesh + !! Mesh module providing high-level mesh management and query functions. + !! + !! This module defines the `mesh_t` type which aggregates geometry, grid, and + !! parallel decomposition information. It provides methods to query mesh + !! dimensions, coordinates, and other mesh properties for both global and + !! local (per MPI rank) domains. + use iso_fortran_env, only: stderr => error_unit use mpi @@ -11,21 +18,28 @@ module m_mesh implicit none - ! The mesh class stores all the information about the global and local (due to domain decomposition) mesh - ! It also includes getter functions to access some of its parameters type :: mesh_t - type(geo_t), allocatable :: geo ! object containing geometry information - class(grid_t), allocatable :: grid ! object containing grid information - class(par_t), allocatable :: par ! object containing parallel domain decomposition information + !! Mesh type containing all mesh information for the simulation. + !! + !! This type aggregates three main components: + !! - geo: Geometry information (coordinates, stretching) + !! - grid: Grid dimensions and boundary conditions + !! - par: Parallel domain decomposition information + !! + !! The mesh is initialised once and should be treated as read-only + !! during the simulation. + type(geo_t), allocatable :: geo !! 
Geometry information + class(grid_t), allocatable :: grid !! Grid dimensions and boundary conditions + class(par_t), allocatable :: par !! Parallel decomposition information contains - procedure :: get_dims - procedure :: get_global_dims + procedure :: get_dims !! Get local dimensions for a data location + procedure :: get_global_dims !! Get global dimensions for a data location - procedure :: get_n_dir - procedure :: get_n_phi - generic :: get_n => get_n_dir, get_n_phi + procedure :: get_n_dir !! Get number of grid points in a direction + procedure :: get_n_phi !! Get number of grid points for a field + generic :: get_n => get_n_dir, get_n_phi !! Generic interface for get_n - procedure :: get_coordinates + procedure :: get_coordinates !! Get coordinate array for a direction end type mesh_t interface mesh_t @@ -36,18 +50,23 @@ module m_mesh function mesh_init(dims_global, nproc_dir, L_global, BC_x, BC_y, BC_z, & stretching, beta, use_2decomp) result(mesh) + !! Initialise the mesh object with global domain parameters. + !! + !! Creates and fully initialises a mesh object containing geometry, grid, and + !! parallel decomposition information. The mesh should be treated as read-only + !! after initialisation. Supports both uniform and stretched meshes, and can + !! use either 2decomp or generic domain decomposition. use m_decomp, only: is_avail_2decomp, decomposition_2decomp - !! Completely initialise the mesh object. - !! Upon initialisation the mesh object can be read-only and shouldn't be edited - !! Takes as argument global information about the mesh like its length, number of cells and decomposition in each direction - integer, dimension(3), intent(in) :: dims_global - integer, dimension(3), intent(in) :: nproc_dir ! 
Number of proc in each direction - real(dp), dimension(3), intent(in) :: L_global - character(len=*), dimension(2), intent(in) :: BC_x, BC_y, BC_z - character(len=*), dimension(3), optional, intent(in) :: stretching - real(dp), dimension(3), optional, intent(in) :: beta - logical, optional, intent(in) :: use_2decomp - class(mesh_t), allocatable :: mesh + integer, dimension(3), intent(in) :: dims_global !! Global grid dimensions [nx, ny, nz] + integer, dimension(3), intent(in) :: nproc_dir !! Number of processors in each direction + real(dp), dimension(3), intent(in) :: L_global !! Physical domain lengths [Lx, Ly, Lz] + character(len=*), dimension(2), intent(in) :: BC_x !! Boundary conditions in x (lower, upper) + character(len=*), dimension(2), intent(in) :: BC_y !! Boundary conditions in y (lower, upper) + character(len=*), dimension(2), intent(in) :: BC_z !! Boundary conditions in z (lower, upper) + character(len=*), dimension(3), optional, intent(in) :: stretching !! Mesh stretching type per direction + real(dp), dimension(3), optional, intent(in) :: beta !! Stretching parameters per direction + logical, optional, intent(in) :: use_2decomp !! Flag to use 2decomp library + class(mesh_t), allocatable :: mesh !! Initialised mesh object character(len=20), dimension(3, 2) :: BC_all logical :: is_first_domain, is_last_domain @@ -194,19 +213,25 @@ subroutine decomposition_generic(grid, par) end subroutine pure function get_dims(self, data_loc) result(dims) - !! Getter for local domain dimensions - class(mesh_t), intent(in) :: self - integer, intent(in) :: data_loc - integer, dimension(3) :: dims + !! Get local domain dimensions for a specific data location. + !! + !! Returns the dimensions of the local subdomain (on this MPI rank) for + !! the specified data location (VERT, CELL, X_FACE, etc.). + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: data_loc !! Data location flag (VERT, CELL, etc.) + integer, dimension(3) :: dims !! 
Local dimensions [nx, ny, nz] dims = get_dims_dataloc(data_loc, self%grid%vert_dims, self%grid%cell_dims) end function pure function get_global_dims(self, data_loc) result(dims) - !! Getter for local domain dimensions - class(mesh_t), intent(in) :: self - integer, intent(in) :: data_loc - integer, dimension(3) :: dims + !! Get global domain dimensions for a specific data location. + !! + !! Returns the dimensions of the entire global domain for the specified + !! data location (VERT, CELL, X_FACE, etc.). + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: data_loc !! Data location flag (VERT, CELL, etc.) + integer, dimension(3) :: dims !! Global dimensions [nx, ny, nz] dims = get_dims_dataloc(data_loc, self%grid%global_vert_dims, & self%grid%global_cell_dims) @@ -249,21 +274,30 @@ pure function get_dims_dataloc(data_loc, vert_dims, cell_dims) result(dims) end function get_dims_dataloc pure function get_n_phi(self, phi) result(n) - !! Getter for the main dimension of field phi - class(mesh_t), intent(in) :: self - class(field_t), intent(in) :: phi - integer :: n + !! Get the main dimension (pencil length) for a field. + !! + !! Returns the number of grid points along the primary direction for the + !! given field, accounting for both the field's orientation (dir) and + !! data location on the staggered grid. + class(mesh_t), intent(in) :: self !! Mesh object + class(field_t), intent(in) :: phi !! Field to query + integer :: n !! Number of grid points in main direction n = self%get_n(phi%dir, phi%data_loc) end function pure function get_n_dir(self, dir, data_loc) result(n) - !! Getter for the main dimension a field oriented along `dir` with data on `data_loc` - class(mesh_t), intent(in) :: self - integer, intent(in) :: dir - integer, intent(in) :: data_loc - integer :: n, n_cell, n_vert + !! Get the main dimension for a field with given direction and data location. + !! + !! 
Returns the number of grid points along a specified direction for a field + !! located at the given position on the staggered grid. Handles the different + !! grid dimensions for vertex-centered vs cell-centered data. + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: dir !! Primary direction (DIR_X, DIR_Y, DIR_Z) + integer, intent(in) :: data_loc !! Data location (VERT, CELL, X_FACE, etc.) + integer :: n !! Number of grid points in direction + integer :: n_cell, n_vert n_cell = self%grid%cell_dims(dir) n_vert = self%grid%vert_dims(dir) @@ -306,13 +340,17 @@ pure function get_n_dir(self, dir, data_loc) result(n) end function get_n_dir pure function get_coordinates(self, i, j, k, data_loc_op) result(coords) - !! Get the coordinates of a vertex with i, j, k local cartesian indices - !! Avoid calling this in hot loops - class(mesh_t), intent(in) :: self - integer, intent(in) :: i, j, k - integer, optional, intent(in) :: data_loc_op + !! Get physical coordinates for a grid point with given indices. + !! + !! Returns the physical (x, y, z) coordinates for a grid point specified by + !! local Cartesian indices (i, j, k) at the given data location. Default + !! location is vertex-centered (VERT). Note: Avoid calling this function in + !! hot loops due to performance overhead. + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: i, j, k !! Local Cartesian indices + integer, optional, intent(in) :: data_loc_op !! Data location (default: VERT) integer :: data_loc - real(dp), dimension(3) :: coords + real(dp), dimension(3) :: coords !! Physical coordinates [x, y, z] if (present(data_loc_op)) then data_loc = data_loc_op diff --git a/src/mesh_content.f90 b/src/mesh_content.f90 index 322870225..43012f705 100644 --- a/src/mesh_content.f90 +++ b/src/mesh_content.f90 @@ -1,80 +1,96 @@ module m_mesh_content + !! Module containing mesh content types for geometry, grid, and parallel decomposition. + !! + !! 
This module defines three main types: + !! + !! - `geo_t`: Geometry information including coordinates and mesh stretching + !! - `grid_t`: Grid dimensions and boundary conditions + !! - `par_t`: Parallel domain decomposition information use m_common, only: dp, pi implicit none type :: geo_t - !! Stores geometry information - !> Origin: coordinates of vertex (1, 1, 1) - real(dp) :: origin(3) - !> size of a cell in each direction for a uniform mesh - real(dp) :: d(3) - !> Global dimensions of the domain in each direction - real(dp) :: L(3) - !> Global coordinates at vertices - real(dp), allocatable, dimension(:, :) :: vert_coords - !> Global coordinates at midpoints - real(dp), allocatable, dimension(:, :) :: midp_coords - !> Stretching type - character(len=20), dimension(3) :: stretching - !> Stretching - logical :: stretched(3) - !> Stretching parameters - real(dp) :: alpha(3), beta(3) - !> Stretching factors at vertices + !! Geometry information type for domain coordinates and mesh stretching. + !! + !! This type stores physical domain dimensions, coordinates at grid points, + !! and mesh stretching parameters. Coordinates and stretching factors are + !! stored for both vertex-centered and cell-centered locations. + real(dp) :: origin(3) !! Coordinates of vertex (1, 1, 1) + real(dp) :: d(3) !! Cell size in each direction for uniform mesh + real(dp) :: L(3) !! Global domain dimensions in each direction + real(dp), allocatable, dimension(:, :) :: vert_coords !! Global coordinates at vertices + real(dp), allocatable, dimension(:, :) :: midp_coords !! Global coordinates at cell midpoints + character(len=20), dimension(3) :: stretching !! Stretching type in each direction + logical :: stretched(3) !! Whether each direction has stretching applied + real(dp) :: alpha(3) !! Stretching parameter \(\alpha\) in each direction + real(dp) :: beta(3) !! 
Stretching parameter \(\beta\) in each direction + !> Stretching factors at vertices: \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), \(\frac{d^2\xi}{ds^2}\) real(dp), allocatable, dimension(:, :) :: vert_ds, vert_ds2, vert_d2s - !> Stretching factors at midpoints + !> Stretching factors at midpoints: \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), \(\frac{d^2\xi}{ds^2}\) real(dp), allocatable, dimension(:, :) :: midp_ds, midp_ds2, midp_d2s contains - procedure :: obtain_coordinates + procedure :: obtain_coordinates !! Compute coordinates and stretching factors end type type :: grid_t - !! Stores grid information - integer, dimension(3) :: global_vert_dims ! global number of vertices in each direction without padding (cartesian structure) - integer, dimension(3) :: global_cell_dims ! global number of cells in each direction without padding (cartesian structure) - - integer, dimension(3) :: vert_dims ! local number of vertices in each direction without padding (cartesian structure) - integer, dimension(3) :: cell_dims ! local number of cells in each direction without padding (cartesian structure) - logical, dimension(3) :: periodic_BC ! Whether or not a direction has a periodic BC - integer, dimension(3, 2) :: BCs_global - integer, dimension(3, 2) :: BCs + !! Grid information type for mesh dimensions and boundary conditions. + !! + !! This type stores both global and local (per MPI rank) grid dimensions, + !! accounting for both vertex-centered and cell-centered data. It also + !! manages boundary condition information. + integer, dimension(3) :: global_vert_dims !! Global number of vertices in each direction + integer, dimension(3) :: global_cell_dims !! Global number of cells in each direction + integer, dimension(3) :: vert_dims !! Local number of vertices in each direction + integer, dimension(3) :: cell_dims !! Local number of cells in each direction + logical, dimension(3) :: periodic_BC !! 
Whether each direction has periodic BC + integer, dimension(3, 2) :: BCs_global !! Global boundary conditions (lower, upper) in each direction + integer, dimension(3, 2) :: BCs !! Local subdomain boundary conditions (lower, upper) contains - procedure :: copy_cell2vert_dims ! Copies cell_dims to vert_dims taking periodicity into account - procedure :: copy_vert2cell_dims ! Copies vert_dims to cell_dims taking periodicity into account + procedure :: copy_cell2vert_dims !! Copy cell_dims to vert_dims accounting for periodicity + procedure :: copy_vert2cell_dims !! Copy vert_dims to cell_dims accounting for periodicity end type type :: par_t - !! Stores parallel domain related information - integer :: nrank ! local rank ID - integer :: nproc ! total number of ranks/proc participating in the domain decomposition - integer, dimension(3) :: nrank_dir ! local rank ID in each direction - integer, dimension(3) :: nproc_dir ! total number of proc in each direction - integer, dimension(3) :: n_offset ! number of cells offset in each direction due to domain decomposition - integer, dimension(3) :: pnext ! rank ID of the previous rank in each direction - integer, dimension(3) :: pprev ! rank ID of the next rank in each direction + !! Parallel domain decomposition information type. + !! + !! This type stores all information related to MPI domain decomposition, + !! including rank IDs, processor grid layout, and neighbor communication + !! information for halo exchanges. + integer :: nrank !! Local MPI rank ID (0-based) + integer :: nproc !! Total number of MPI ranks + integer, dimension(3) :: nrank_dir !! Local rank ID in each direction (0-based) + integer, dimension(3) :: nproc_dir !! Number of processors in each direction + integer, dimension(3) :: n_offset !! Cell offset in each direction due to decomposition + integer, dimension(3) :: pnext !! Rank ID of next neighbor in each direction + integer, dimension(3) :: pprev !! 
Rank ID of previous neighbor in each direction contains - procedure :: is_root ! returns if the current rank is the root rank - procedure :: compute_rank_pos_from_global ! fills in pnext, pprev and nrank_dir from global ranks map + procedure :: is_root !! Check if current rank is root (rank 0) + procedure :: compute_rank_pos_from_global !! Compute rank position and neighbors from global map end type contains pure function is_root(self) result(is_root_rank) - !! Returns wether or not the current rank is the root rank - class(par_t), intent(in) :: self - logical :: is_root_rank + !! Check whether the current MPI rank is the root rank. + !! + !! The root rank is defined as rank 0 in the MPI communicator. + class(par_t), intent(in) :: self !! Parallel decomposition object + logical :: is_root_rank !! True if this is rank 0 is_root_rank = (self%nrank == 0) end function pure subroutine compute_rank_pos_from_global(self, global_ranks) - !! From the global rank maps, fills in the rank position as well - !! as the previous and next rank in the `par` structure - - class(par_t), intent(inout) :: self - integer, dimension(:, :, :), intent(in) :: global_ranks + !! Compute rank position and neighbor ranks from global rank map. + !! + !! From the 3D global rank map, this subroutine determines the position + !! of the current rank in the processor grid and identifies the previous + !! and next neighboring ranks in each direction for halo communication. + !! Periodic wrapping is applied for neighbor identification. + class(par_t), intent(inout) :: self !! Parallel decomposition object to update + integer, dimension(:, :, :), intent(in) :: global_ranks !! 3D map of MPI ranks integer, dimension(3) :: subd_pos, subd_pos_prev, subd_pos_next integer :: dir, nproc @@ -102,10 +118,13 @@ pure subroutine compute_rank_pos_from_global(self, global_ranks) end subroutine pure subroutine copy_vert2cell_dims(self, par) - !! Copies vert_dims information to cell_dims taking - !! 
periodicity into account - class(grid_t), intent(inout) :: self - type(par_t), intent(in) :: par + !! Copy vertex dimensions to cell dimensions accounting for periodicity. + !! + !! For periodic boundaries, vertex and cell dimensions are equal. For + !! non-periodic boundaries on the last domain, cell dimensions are one + !! less than vertex dimensions. + class(grid_t), intent(inout) :: self !! Grid object to update + type(par_t), intent(in) :: par !! Parallel decomposition info integer :: dir logical :: is_last_domain @@ -121,10 +140,13 @@ pure subroutine copy_vert2cell_dims(self, par) end subroutine pure subroutine copy_cell2vert_dims(self, par) - !! Copies cell_dims information to vert_dims taking - !! periodicity into account - class(grid_t), intent(inout) :: self - type(par_t), intent(in) :: par + !! Copy cell dimensions to vertex dimensions accounting for periodicity. + !! + !! For periodic boundaries, vertex and cell dimensions are equal. For + !! non-periodic boundaries on the last domain, vertex dimensions are one + !! more than cell dimensions. + class(grid_t), intent(inout) :: self !! Grid object to update + type(par_t), intent(in) :: par !! Parallel decomposition info integer :: dir logical :: is_last_domain @@ -140,10 +162,17 @@ pure subroutine copy_cell2vert_dims(self, par) end subroutine subroutine obtain_coordinates(self, vert_dims, cell_dims, n_offset) - !! Obtains global coordinates for all the vertices and midpoints + !! Compute global coordinates and stretching factors for grid points. + !! + !! This subroutine calculates coordinates at both vertex-centered and + !! cell-centered locations, supporting both uniform and stretched meshes. + !! For stretched meshes, it also computes the stretching factors + !! \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), and \(\frac{d^2\xi}{ds^2}\). implicit none - class(geo_t) :: self - integer, intent(in) :: vert_dims(3), cell_dims(3), n_offset(3) + class(geo_t) :: self !! 
Geometry object to populate + integer, intent(in) :: vert_dims(3) !! Local vertex dimensions + integer, intent(in) :: cell_dims(3) !! Local cell dimensions + integer, intent(in) :: n_offset(3) !! Cell offset due to domain decomposition integer :: dir, i, i_glob real(dp) :: L_inf, alpha, beta, r, const, s, yeta_vt, yeta_mp, coord diff --git a/src/module/ibm.f90 b/src/module/ibm.f90 index c24445625..8f6d019b7 100644 --- a/src/module/ibm.f90 +++ b/src/module/ibm.f90 @@ -1,10 +1,10 @@ module m_ibm !! This module implements the IBM capabilities. !! -!! When iibm = 0, the IBM object is never used. +!! When `iibm = 0`, the IBM object is never used. !! -!! When iibm = 1, the basic IBM capability is used. -!! It only requires ep1, a 3D field, as input. +!! When `iibm = 1`, the basic IBM capability is used. +!! It only requires `ep1`, a 3D field, as input. !! This field should be one (zero) in the fluid (solid) !! domain. use iso_fortran_env, only: stderr => error_unit @@ -25,6 +25,45 @@ module m_ibm integer, parameter :: iibm_basic = 1 type :: ibm_t + !! Immersed Boundary Method (IBM) for simulating flow around solid bodies. + !! + !! The IBM approach enables simulation of flows with complex solid geometries + !! without requiring body-fitted meshes. Instead, the solid geometry is + !! represented by a masking field (`ep1`) on a Cartesian grid. + !! + !! **Current Implementation (iibm = 1):** + !! + !! The basic IBM enforces zero velocity inside solid regions by multiplying + !! velocity components with the mask field `ep1`: + !! + !! - `ep1 = 1` in fluid regions → velocity unchanged + !! - `ep1 = 0` in solid regions → velocity set to zero + !! + !! This is applied before the pressure solve to ensure the divergence-free + !! constraint is satisfied only in the fluid domain. + !! + !! **Mask Field (ep1):** + !! + !! The `ep1` field defines the fluid/solid interface: + !! + !! - Values of 1.0 indicate fluid cells (no modification) + !! 
- Values of 0.0 indicate solid cells (velocity zeroed) + !! - Intermediate values (0 < ep1 < 1) represent interface cells + !! + !! **Future Extensions:** + !! + !! The current implementation sets velocity to zero in solid regions. + !! A more accurate IBM would set velocity to \(\Delta t \nabla p^n\) + !! before the pressure solve, then subtract \(\Delta t \nabla p^{n+1}\) + !! after reconstruction to properly enforce boundary conditions. + !! + !! **Components:** + !! + !! - `backend`: Computational backend for field operations + !! - `mesh`: Grid information + !! - `host_allocator`: Memory allocator for field storage + !! - `iibm`: IBM mode (0 = disabled, 1 = basic IBM) + !! - `ep1`: Mask field (1 in fluid, 0 in solid) class(base_backend_t), pointer :: backend => null() class(mesh_t), pointer :: mesh => null() type(allocator_t), pointer :: host_allocator => null() diff --git a/src/ordering.f90 b/src/ordering.f90 index 19be0a583..7c1d45c87 100644 --- a/src/ordering.f90 +++ b/src/ordering.f90 @@ -1,14 +1,16 @@ module m_ordering + !! Module for index conversion between application storage and Cartesian layouts. + !! + !! This module provides functions to convert between directional "application storage" + !! indices (optimised for cache locality) and Cartesian \( (i,j,k) \) indices. The application + !! storage layout arranges data in blocks oriented along a specific direction ( \( X, Y \), or \( Z \) ) + !! to improve memory access patterns during computations. use m_common, only: dp, get_dirs_from_rdr, DIR_X, DIR_Y, DIR_Z, DIR_C implicit none contains - !! - !! "Application storage" stores spatial data with a directionality for better cache locality - !! This set of functions converts indices from this application storage (_dir) to cartesian indices (_ijk) - !! 
pure subroutine get_index_ijk(i, j, k, dir_i, dir_j, dir_k, dir, & SZ, nx_padded, ny_padded, nz_padded) diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index 3efe96b65..937f42ff6 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -1,4 +1,36 @@ module m_poisson_fft + !! FFT-based spectral Poisson solver for incompressible flow. + !! + !! This module implements fast Fourier transform (FFT) based solvers for + !! the Poisson equation: + !! \[ \nabla^2 \phi = f \] + !! + !! **Solution Strategy:** + !! + !! 1. **Forward FFT**: Transform RHS from physical to spectral space + !! 2. **Spectral division**: Solve algebraically using wave numbers: + !! \( \hat{\phi} = \hat{f} / k^2 \) + !! 3. **Backward FFT**: Transform solution back to physical space + !! + !! **Boundary Condition Support:** + !! + !! - **Periodic (000)**: Fully periodic in all directions (standard FFT) + !! - **Mixed (010)**: Periodic in \( X/Z \), non-periodic in \( Y \) (requires special handling) + !! + !! **Grid Stretching:** + !! + !! - Uniform grids in \( X \) and \( Z \) (required for FFT) + !! - \( Y \)-direction stretching supported for `010` BCs via transformation matrices + !! - Stretching handled through spectral equivalence constants + !! + !! **Parallel Implementation:** + !! + !! - Pencil decomposition in \( Y \) and \( Z \) directions (\( X \) must be undivided) + !! - Spectral space operations on permuted/transposed data layouts + !! - Backend-specific FFT implementations (CPU/GPU) + !! + !! The module is abstract; concrete implementations provide FFT routines + !! via deferred procedures (`fft_forward`, `fft_backward`, `fft_postprocess`). use m_common, only: dp, pi, CELL use m_field, only: field_t use m_mesh, only: mesh_t, geo_t @@ -7,111 +39,151 @@ module m_poisson_fft implicit none type, abstract :: poisson_fft_t - !! FFT based Poisson solver - !> Global dimensions + !! Abstract base type for FFT-based Poisson solvers. + !! + !! 
Concrete backend implementations (OMP, CUDA) extend this type + !! and provide FFT library integration (FFTW, cuFFT, etc.). + !> Global dimensions (full domain) integer :: nx_glob, ny_glob, nz_glob - !> Local dimensions + !> Local dimensions (subdomain on this rank) integer :: nx_loc, ny_loc, nz_loc - !> Local dimensions in the permuted slabs + !> Local dimensions in the permuted slabs (after transpose for FFT) integer :: nx_perm, ny_perm, nz_perm - !> Local dimensions in the permuted slabs in spectral space + !> Local dimensions in the permuted slabs in spectral space (complex) integer :: nx_spec, ny_spec, nz_spec - !> Offset in y and z directions in the permuted slabs in spectral space + !> Offset in x, y, z directions in the spectral space pencil integer :: x_sp_st, y_sp_st, z_sp_st - !> Local domain sized array storing the spectral equivalence constants + !> Local spectral equivalence constants (modified wave numbers) complex(dp), allocatable, dimension(:, :, :) :: waves - !> Wave numbers in x, y, and z + !> Tridiagonal coefficients for wave number computation (real part) real(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz - !> Wave numbers in x, y, and z + !> Complex wave numbers and their squares for each direction complex(dp), allocatable, dimension(:) :: kx, ky, kz, exs, eys, ezs, & k2x, k2y, k2z - !> Staggared grid transformation + !> Staggered grid transformation coefficients (real and imaginary parts) real(dp), allocatable, dimension(:) :: trans_x_re, trans_x_im, & trans_y_re, trans_y_im, & trans_z_re, trans_z_im - !> Periodicity in x, y, and z + !> Periodicity flags for each direction logical :: periodic_x, periodic_y, periodic_z, & - stretched_y = .false., stretched_y_sym - !> Stretching operator matrices + stretched_y = .false., stretched_y_sym !! 
Y-direction stretching + !> Stretching transformation matrices (odd/even modes, real/imaginary) real(dp), allocatable, dimension(:, :, :, :) :: a_odd_re, a_odd_im, & a_even_re, a_even_im, & a_re, a_im - !> lowmem option, only used in CUDA backend + !> Low memory mode flag (used for GPU backends to reduce memory usage) logical :: lowmem = .false. - !> Procedure pointer to BC specific poisson solvers + !> Procedure pointer to BC-specific Poisson solver implementation procedure(poisson_xxx), pointer :: poisson => null() contains - procedure(fft_forward), deferred :: fft_forward - procedure(fft_backward), deferred :: fft_backward - procedure(fft_postprocess), deferred :: fft_postprocess_000 - procedure(fft_postprocess), deferred :: fft_postprocess_010 - procedure(field_process), deferred :: enforce_periodicity_y - procedure(field_process), deferred :: undo_periodicity_y - procedure :: base_init - procedure :: solve_poisson - procedure :: stretching_matrix - procedure :: waves_set - procedure :: get_km - procedure :: get_km_re - procedure :: get_km_im + procedure(fft_forward), deferred :: fft_forward !! Forward FFT (deferred) + procedure(fft_backward), deferred :: fft_backward !! Backward FFT (deferred) + procedure(fft_postprocess), deferred :: fft_postprocess_000 !! Postprocess for 000 BCs + procedure(fft_postprocess), deferred :: fft_postprocess_010 !! Postprocess for 010 BCs + procedure(field_process), deferred :: enforce_periodicity_y !! Enforce Y periodicity + procedure(field_process), deferred :: undo_periodicity_y !! Undo Y periodicity + procedure :: base_init !! Initialise Poisson solver + procedure :: solve_poisson !! Main interface to solve Poisson equation + procedure :: stretching_matrix !! Compute stretching transformation matrices + procedure :: waves_set !! Compute spectral equivalence constants + procedure :: get_km !! Get complex wave number + procedure :: get_km_re !! Get real part of wave number + procedure :: get_km_im !! 
Get imaginary part of wave number end type poisson_fft_t abstract interface subroutine fft_forward(self, f_in) + !! Abstract interface for forward FFT transform. + !! + !! Transforms field from physical space to spectral space. + !! Implementation is backend-specific (FFTW, cuFFT, etc.). import :: poisson_fft_t import :: field_t implicit none - class(poisson_fft_t) :: self - class(field_t), intent(in) :: f_in + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(in) :: f_in !! Input field in physical space end subroutine fft_forward subroutine fft_backward(self, f_out) + !! Abstract interface for backward (inverse) FFT transform. + !! + !! Transforms field from spectral space back to physical space. + !! Implementation is backend-specific (FFTW, cuFFT, etc.). import :: poisson_fft_t import :: field_t implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f_out !! Output field in physical space end subroutine fft_backward subroutine fft_postprocess(self) + !! Abstract interface for spectral space postprocessing. + !! + !! Applies spectral division and any BC-specific operations + !! in Fourier space. Different implementations for different + !! boundary condition combinations (000, 010, etc.). import :: poisson_fft_t implicit none - class(poisson_fft_t) :: self + class(poisson_fft_t) :: self !! Poisson solver instance end subroutine fft_postprocess end interface abstract interface subroutine poisson_xxx(self, f, temp) + !! Abstract interface for complete Poisson solve. + !! + !! Orchestrates forward FFT, postprocessing, and backward FFT. + !! Different implementations for different BC combinations. import :: poisson_fft_t import :: field_t - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! 
Field and temporary storage end subroutine poisson_xxx subroutine field_process(self, f_out, f_in) + !! Abstract interface for field processing operations. + !! + !! Used for enforcing or undoing periodicity in non-periodic + !! directions (e.g., Y direction for 010 BCs). import :: poisson_fft_t import :: field_t - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f_out !! Output field + class(field_t), intent(in) :: f_in !! Input field end subroutine field_process end interface contains subroutine base_init(self, mesh, xdirps, ydirps, zdirps, n_spec, n_sp_st) + !! Initialise FFT-based Poisson solver with mesh and decomposition info. + !! + !! Sets up: + !! - Domain dimensions (global and local) + !! - Periodicity flags from boundary conditions + !! - Spectral space dimensions and offsets + !! - Wave number arrays and spectral equivalence constants + !! - Stretching matrices (if Y-direction is stretched) + !! - Function pointer to appropriate BC-specific solver + !! + !! **Restrictions:** + !! - X-direction must not be decomposed (nproc_dir(1) must be 1) + !! - Only Y-direction stretching is supported + !! - Currently supports 000 (fully periodic) and 010 (Y non-periodic) BCs + !! + !! **Note:** 010 BCs with multiple MPI ranks not yet supported. implicit none - class(poisson_fft_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - integer, dimension(3), intent(in) :: n_spec ! Size of the spectral pencil - integer, dimension(3), intent(in) :: n_sp_st ! Offset of the spectral pencil + class(poisson_fft_t) :: self !! Poisson solver instance + type(mesh_t), intent(in) :: mesh !! Mesh object with grid and decomposition + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + integer, dimension(3), intent(in) :: n_spec !! 
Size of the spectral pencil [nx, ny, nz] + integer, dimension(3), intent(in) :: n_sp_st !! Offset of the spectral pencil [x, y, z] integer :: dims(3) @@ -180,20 +252,33 @@ subroutine base_init(self, mesh, xdirps, ydirps, zdirps, n_spec, n_sp_st) end subroutine base_init subroutine solve_poisson(self, f, temp) + !! Main interface to solve Poisson equation. + !! + !! Delegates to the BC-specific solver function pointed to by + !! self%poisson (either poisson_000 or poisson_010). This provides + !! a uniform interface regardless of boundary conditions. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field to solve (RHS in, solution out), temporary call self%poisson(f, temp) end subroutine solve_poisson subroutine poisson_000(self, f, temp) + !! Solve Poisson equation with fully periodic (000) boundary conditions. + !! + !! For periodic BCs in all directions, the solution procedure is: + !! 1. Forward FFT: f to f_hat + !! 2. Spectral division: \( \hat{f} / k^2 \) gives solution_hat + !! 3. Backward FFT: solution_hat to solution + !! + !! This is the simplest case requiring no special handling for BCs. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field (RHS in, solution out), temporary (unused) call self%fft_forward(f) call self%fft_postprocess_000 @@ -202,10 +287,21 @@ subroutine poisson_000(self, f, temp) end subroutine poisson_000 subroutine poisson_010(self, f, temp) + !! Solve Poisson equation with mixed (010) boundary conditions. + !! + !! For periodic in X/Z, non-periodic in Y, the solution procedure is: + !! 1. Enforce artificial periodicity in Y using symmetry extension + !! 2. Forward FFT: f to f_hat + !! 3. 
Spectral division with stretching corrections (if grid is stretched) + !! 4. Backward FFT: solution_hat to solution + !! 5. Undo artificial periodicity to recover physical solution + !! + !! The symmetry extension doubles the domain size in Y to handle + !! non-periodic BCs via FFT. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field (RHS in, solution out), temporary call self%enforce_periodicity_y(temp, f) diff --git a/src/solver.f90 b/src/solver.f90 index 700f4e912..dd5b5b8f2 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -1,4 +1,19 @@ module m_solver + !! Main solver module implementing the Incompact3D numerical algorithm. + !! + !! This module provides the high-level solver infrastructure for solving + !! incompressible Navier-Stokes equations using compact finite differences. + !! The solver orchestrates the transport equation (`transeq`), divergence, + !! Poisson solver, and gradient operations required for the fractional-step + !! projection method. + !! + !! The implementation supports: + !! + !! - Multiple backend executors (CPU/GPU) + !! - Distributed and Thomas algorithm for derivatives + !! - Immersed boundary method (IBM) + !! - Multi-species transport + !! - Various time integration schemes use iso_fortran_env, only: stderr => error_unit use mpi @@ -47,53 +62,65 @@ module m_solver !! method of the allocator can be used to make this field available !! for later use. 
- real(dp) :: dt, nu - real(dp), dimension(:), allocatable :: nu_species - integer :: n_iters, n_output - integer :: current_iter = 0 - integer :: ngrid - integer :: nvars = 3 - integer :: nspecies = 0 - - class(field_t), pointer :: u, v, w - type(flist_t), dimension(:), pointer :: species => null() - - class(base_backend_t), pointer :: backend - type(mesh_t), pointer :: mesh - type(time_intg_t) :: time_integrator - type(allocator_t), pointer :: host_allocator - type(dirps_t), pointer :: xdirps, ydirps, zdirps - type(vector_calculus_t) :: vector_calculus - type(ibm_t) :: ibm - logical :: ibm_on - procedure(poisson_solver), pointer :: poisson => null() - procedure(transport_equation), pointer :: transeq => null() + real(dp) :: dt !! Time step size + real(dp) :: nu !! Kinematic viscosity + real(dp), dimension(:), allocatable :: nu_species !! Viscosities for multiple species + integer :: n_iters !! Total number of time iterations + integer :: n_output !! Output frequency (every nth iteration) + integer :: current_iter = 0 !! Current iteration number + integer :: ngrid !! Total number of grid points + integer :: nvars = 3 !! Number of velocity variables (u,v,w) + integer :: nspecies = 0 !! Number of scalar species to transport + + class(field_t), pointer :: u, v, w !! Velocity field components + type(flist_t), dimension(:), pointer :: species => null() !! Array of scalar species fields + + class(base_backend_t), pointer :: backend !! Backend executor (CPU/GPU) + type(mesh_t), pointer :: mesh !! Computational mesh + type(time_intg_t) :: time_integrator !! Time integration scheme + type(allocator_t), pointer :: host_allocator !! Memory allocator for host arrays + type(dirps_t), pointer :: xdirps, ydirps, zdirps !! Tridiagonal operators in each direction + type(vector_calculus_t) :: vector_calculus !! Vector calculus operations + type(ibm_t) :: ibm !! Immersed boundary method handler + logical :: ibm_on !! 
Flag to enable/disable IBM + procedure(poisson_solver), pointer :: poisson => null() !! Poisson solver procedure pointer + procedure(transport_equation), pointer :: transeq => null() !! Transport equation solver pointer contains - procedure :: transeq_species - procedure :: pressure_correction - procedure :: divergence_v2p - procedure :: gradient_p2v - procedure :: curl + procedure :: transeq_species !! Compute transport equation for scalar species + procedure :: pressure_correction !! Apply pressure correction to enforce incompressibility + procedure :: divergence_v2p !! Compute divergence of velocity field + procedure :: gradient_p2v !! Compute pressure gradient + procedure :: curl !! Compute curl (vorticity) of velocity field end type solver_t abstract interface subroutine poisson_solver(self, pressure, div_u) + !! Interface for Poisson solver implementations. + !! + !! Solves the Poisson equation \( \nabla^2 p = f \) where f is the + !! divergence of the intermediate velocity field. import :: solver_t import :: field_t implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) end subroutine poisson_solver subroutine transport_equation(self, rhs, variables) + !! Interface for transport equation implementations. + !! + !! Computes the right-hand side of the transport equation including + !! convection, diffusion, and any source terms. The momentum equations are: + !! \[ \frac{\partial \mathbf{u}}{\partial t} + (\mathbf{u} \cdot \nabla)\mathbf{u} = -\nabla p + \nu \nabla^2 \mathbf{u} \] import :: solver_t import :: flist_t implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:), variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side terms (output) + type(flist_t), intent(inout) :: variables(:) !! 
Field variables (velocity components) end subroutine transport_equation end interface @@ -104,12 +131,25 @@ end subroutine transport_equation contains function init(backend, mesh, host_allocator) result(solver) + !! Initialise the solver with backend, mesh, and configuration. + !! + !! This function sets up the complete solver infrastructure including: + !! - Velocity field allocation (u, v, w) + !! - Tridiagonal operators for each direction (xdirps, ydirps, zdirps) + !! - Time integrator + !! - Poisson solver (FFT or CG) + !! - Transport equation solver (default or low-memory variant) + !! - Optional scalar species transport + !! - Optional immersed boundary method (IBM) + !! + !! All configuration is read from the namelist file specified as the first + !! command-line argument. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(solver_t) :: solver + class(base_backend_t), target, intent(inout) :: backend !! Backend executor (CPU/GPU) + type(mesh_t), target, intent(inout) :: mesh !! Computational mesh + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(solver_t) :: solver !! Initialised solver object type(solver_config_t) :: solver_cfg integer :: i @@ -208,11 +248,22 @@ end function init subroutine allocate_tdsops(dirps, backend, mesh, der1st_scheme, & der2nd_scheme, interpl_scheme, stagder_scheme) - type(dirps_t), intent(inout) :: dirps - class(base_backend_t), intent(in) :: backend - type(mesh_t), intent(in) :: mesh - character(*), intent(in) :: der1st_scheme, der2nd_scheme, & - interpl_scheme, stagder_scheme + !! Allocate and initialise tridiagonal operators for a given direction. + !! + !! This subroutine creates the compact finite difference operators needed for: + !! - First derivatives (der1st) + !! - Second derivatives (der2nd) + !! - Interpolation (interpl) + !! 
- Staggered derivatives (stagder) + !! + !! Boundary conditions are determined from the mesh periodicity flags. + type(dirps_t), intent(inout) :: dirps !! Direction-specific operator set + class(base_backend_t), intent(in) :: backend !! Backend executor + type(mesh_t), intent(in) :: mesh !! Computational mesh + character(*), intent(in) :: der1st_scheme !! First derivative scheme name + character(*), intent(in) :: der2nd_scheme !! Second derivative scheme name + character(*), intent(in) :: interpl_scheme !! Interpolation scheme name + character(*), intent(in) :: stagder_scheme !! Staggered derivative scheme name integer :: dir, bc_start, bc_end, bc_mp_start, bc_mp_end, n_vert, n_cell, i real(dp) :: d @@ -282,15 +333,23 @@ subroutine allocate_tdsops(dirps, backend, mesh, der1st_scheme, & end subroutine subroutine transeq_default(self, rhs, variables) - !! Skew-symmetric form of convection-diffusion terms in the - !! incompressible Navier-Stokes momemtum equations, excluding - !! pressure terms. - !! Inputs from velocity grid and outputs to velocity grid. + !! Compute transport equation RHS using default (high-memory) algorithm. + !! + !! Evaluates the skew-symmetric form of convection-diffusion terms in the + !! incompressible Navier-Stokes momentum equations, excluding pressure: + !! \[ RHS = -(\mathbf{u} \cdot \nabla)\mathbf{u} + \nu \nabla^2 \mathbf{u} \] + !! + !! Uses skew-symmetric formulation for numerical stability: + !! \[ (\mathbf{u} \cdot \nabla)\mathbf{u} = \frac{1}{2}[(\mathbf{u} \cdot \nabla)\mathbf{u} + \nabla \cdot (\mathbf{u}\mathbf{u})] \] + !! + !! This version stores intermediate results for all velocity components, + !! providing better performance at the cost of higher memory usage. + !! Both inputs and outputs are on the velocity (vertex) grid. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(inout) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! 
Right-hand side output (du/dt, dv/dt, dw/dt) + type(flist_t), intent(inout) :: variables(:) !! Velocity components (u, v, w) class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, & du_y, dv_y, dw_y, du_z, dv_z, dw_z, & @@ -382,12 +441,20 @@ subroutine transeq_default(self, rhs, variables) end subroutine transeq_default subroutine transeq_lowmem(self, rhs, variables) - !! low memory version of the transport equation, roughly %2 slower overall + !! Compute transport equation RHS using low-memory algorithm. + !! + !! Evaluates the same skew-symmetric form as transeq_default but with + !! reduced memory footprint by reusing field storage. This approach is + !! approximately 2% slower but uses significantly less memory, which can + !! be important for large simulations or GPU implementations with limited + !! memory. + !! + !! See transeq_default for the mathematical formulation. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(inout) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side output (du/dt, dv/dt, dw/dt) + type(flist_t), intent(inout) :: variables(:) !! Velocity components (u, v, w) class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, & du_y, dv_y, dw_y, du_z, dv_z, dw_z, du, dv, dw, u, v, w @@ -498,14 +565,20 @@ subroutine transeq_lowmem(self, rhs, variables) end subroutine transeq_lowmem subroutine transeq_species(self, rhs, variables) - !! Skew-symmetric form of convection-diffusion terms in the - !! species equation. - !! Inputs from velocity grid and outputs to velocity grid. + !! Compute transport equation for passive scalar species. + !! + !! Evaluates the convection-diffusion equation for transported scalars: + !! \[ \frac{\partial \phi}{\partial t} + (\mathbf{u} \cdot \nabla)\phi = \nu_\phi \nabla^2 \phi \] + !! + !! where \( \phi \) represents each scalar species, \( \nu_\phi \) is the + !! species diffusivity. Uses skew-symmetric form similar to momentum equations. 
+ !! Velocity field must be available in self%u, self%v, self%w. + !! Both inputs and outputs are on the velocity (vertex) grid. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(in) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side for species equations + type(flist_t), intent(in) :: variables(:) !! Scalar species fields integer :: i class(field_t), pointer :: u, v, w, & @@ -594,12 +667,18 @@ subroutine transeq_species(self, rhs, variables) end subroutine transeq_species subroutine divergence_v2p(self, div_u, u, v, w) - !! Wrapper for divergence_v2p + !! Compute divergence of velocity field from vertex to cell centers. + !! + !! Calculates \( \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + \frac{\partial v}{\partial y} + \frac{\partial w}{\partial z} \) + !! using staggered derivatives and interpolation operators. The input velocity + !! components are on the vertex grid and the output divergence is on the cell-centered grid. + !! + !! For incompressible flow, this should be zero (up to numerical errors). implicit none class(solver_t) :: self - class(field_t), intent(inout) :: div_u - class(field_t), intent(in) :: u, v, w + class(field_t), intent(inout) :: div_u !! Velocity divergence (output, cell-centered) + class(field_t), intent(in) :: u, v, w !! Velocity components (input, vertex-centered) call self%vector_calculus%divergence_v2c( & div_u, u, v, w, & @@ -611,12 +690,19 @@ subroutine divergence_v2p(self, div_u, u, v, w) end subroutine divergence_v2p subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) - !! Wrapper for gradient_p2v + !! Compute pressure gradient from cell centers to vertices. + !! + !! Calculates the pressure gradient components: + !! \[ \nabla p = \left( \frac{\partial p}{\partial x}, \frac{\partial p}{\partial y}, \frac{\partial p}{\partial z} \right) \] + !! + !! using staggered derivatives and interpolation operators. The input pressure + !! 
is on the cell-centered grid and the output gradient components are on the vertex grid. + !! This is used in the pressure correction step of the fractional-step method. implicit none class(solver_t) :: self - class(field_t), intent(inout) :: dpdx, dpdy, dpdz - class(field_t), intent(in) :: pressure + class(field_t), intent(inout) :: dpdx, dpdy, dpdz !! Pressure gradient components (vertex-centered) + class(field_t), intent(in) :: pressure !! Pressure field (cell-centered) call self%vector_calculus%gradient_c2v( & dpdx, dpdy, dpdz, pressure, & @@ -628,7 +714,13 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) end subroutine gradient_p2v subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) - !! Wrapper for curl + !! Compute curl (vorticity) of the velocity field. + !! + !! Calculates the curl of velocity: + !! \[ \boldsymbol{\omega} = \nabla \times \mathbf{u} = \left( \frac{\partial w}{\partial y} - \frac{\partial v}{\partial z}, \frac{\partial u}{\partial z} - \frac{\partial w}{\partial x}, \frac{\partial v}{\partial x} - \frac{\partial u}{\partial y} \right) \] + !! + !! All fields are on the vertex grid. This is primarily used for + !! post-processing and visualisation of vorticity. implicit none class(solver_t) :: self @@ -644,11 +736,23 @@ subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) end subroutine curl subroutine poisson_fft(self, pressure, div_u) + !! Solve Poisson equation using Fast Fourier Transform method. + !! + !! Solves \( \nabla^2 p = f \) where f is the velocity divergence, + !! using FFT-based spectral method. This is very efficient for periodic + !! or Neumann boundary conditions and is the default/recommended solver. + !! + !! The solution process involves: + !! 1. Transform to 3D Cartesian data structure + !! 2. Apply FFT in periodic/Neumann directions + !! 3. Solve in spectral space + !! 4. Inverse FFT back to physical space + !! 5. 
Transform back to pencil decomposition implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) class(field_t), pointer :: p_temp, temp @@ -671,11 +775,17 @@ subroutine poisson_fft(self, pressure, div_u) end subroutine poisson_fft subroutine poisson_cg(self, pressure, div_u) + !! Solve Poisson equation using Conjugate Gradient method. + !! + !! This is a placeholder for iterative Poisson solver using CG method. + !! Currently sets pressure to zero for performance testing. + !! Will be fully implemented for cases where FFT is not suitable + !! (e.g., complex geometries or Dirichlet boundary conditions). implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) ! set the pressure field to 0 so that we can do performance tests easily ! this will be removed once the CG solver is implemented of course @@ -684,10 +794,19 @@ subroutine poisson_cg(self, pressure, div_u) end subroutine poisson_cg subroutine pressure_correction(self, u, v, w) + !! Apply pressure correction to enforce incompressibility constraint. + !! + !! Implements the projection step of the fractional-step method: + !! 1. Compute divergence of intermediate velocity: \( \nabla \cdot \mathbf{u}^* \) + !! 2. Solve Poisson equation: \( \nabla^2 p = \frac{1}{\Delta t} \nabla \cdot \mathbf{u}^* \) + !! 3. Correct velocity: \( \mathbf{u}^{n+1} = \mathbf{u}^* - \Delta t \nabla p \) + !! + !! After correction, the velocity field is divergence-free (incompressible). + !! If IBM is active, IBM forcing is applied after pressure correction. 
implicit none class(solver_t) :: self - class(field_t), intent(inout) :: u, v, w + class(field_t), intent(inout) :: u, v, w !! Velocity components (corrected in-place) class(field_t), pointer :: div_u, pressure, dpdx, dpdy, dpdz diff --git a/src/tdsops.f90 b/src/tdsops.f90 index 40623711a..09e68aa66 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -1,4 +1,28 @@ module m_tdsops + !! Tridiagonal solver operators for compact finite differences. + !! + !! This module provides preprocessed tridiagonal operator arrays for + !! solving compact finite difference schemes. It supports both distributed + !! and Thomas algorithm implementations for computing: + !! + !! - First and second derivatives + !! - Interpolation between vertex and cell-centre grids + !! - Staggered derivatives + !! + !! The operators are preprocessed based on: + !! + !! - Grid spacing and optional stretching + !! - Boundary conditions (periodic, Neumann, Dirichlet) + !! - Numerical scheme (compact schemes of various orders) + !! - Symmetry properties for free-slip boundaries + !! + !! The distributed algorithm is designed for parallel execution and consists of: + !! + !! 1. Forward/backward elimination phase (`dist_fw`, `dist_bw`) + !! 2. Back-substitution phase (`dist_sa`, `dist_sc`) + !! + !! The Thomas algorithm (`thom_*`) is used for serial execution or + !! when the distributed approach is not suitable. use iso_fortran_env, only: stderr => error_unit use m_common, only: dp, pi, VERT, CELL, & @@ -24,21 +48,35 @@ module m_tdsops !! This class does not know about the current rank or its relative !! location among other ranks. All the operator arrays here are used when !! executing a distributed tridiagonal solver phase one or two. - real(dp), allocatable, dimension(:) :: dist_fw, dist_bw, & !! fw/bw phase - dist_sa, dist_sc, & !! back subs. - dist_af !! 
the auxiliary factors - real(dp), allocatable, dimension(:) :: thom_f, thom_s, thom_w, thom_p - real(dp), allocatable :: stretch(:), stretch_correct(:) - real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :) - real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp !! Compact scheme coeffs - logical :: periodic - integer :: n_tds !! Tridiagonal system size - integer :: n_rhs !! Right-hand-side builder size - integer :: move = 0 !! move between vertices and cell centres - integer :: n_halo !! number of halo points + real(dp), allocatable, dimension(:) :: dist_fw !! Forward elimination coefficients (distributed) + real(dp), allocatable, dimension(:) :: dist_bw !! Backward elimination coefficients (distributed) + real(dp), allocatable, dimension(:) :: dist_sa !! Back-substitution coefficients A (distributed) + real(dp), allocatable, dimension(:) :: dist_sc !! Back-substitution coefficients C (distributed) + real(dp), allocatable, dimension(:) :: dist_af !! Auxiliary factors (distributed) + real(dp), allocatable, dimension(:) :: thom_f !! Forward elimination factors (Thomas) + real(dp), allocatable, dimension(:) :: thom_s !! Scaling factors (Thomas) + real(dp), allocatable, dimension(:) :: thom_w !! Work array (Thomas) + real(dp), allocatable, dimension(:) :: thom_p !! Precomputed products (Thomas) + real(dp), allocatable :: stretch(:) !! Grid stretching coefficients + real(dp), allocatable :: stretch_correct(:) !! Stretch correction for 2nd derivatives + real(dp), allocatable :: coeffs(:) !! RHS builder coefficients (interior) + real(dp), allocatable :: coeffs_s(:, :) !! RHS builder coefficients (start boundary) + real(dp), allocatable :: coeffs_e(:, :) !! RHS builder coefficients (end boundary) + real(dp) :: alpha !! Compact scheme coefficient (LHS) + real(dp) :: a, b !! Compact scheme coefficients (RHS) + real(dp) :: c = 0._dp, d = 0._dp !! Extended compact scheme coefficients + logical :: periodic !! Periodic boundary condition flag + integer :: n_tds !! 
Tridiagonal system size + integer :: n_rhs !! Right-hand-side builder size + integer :: move = 0 !! Offset for vertex/cell-centre conversion + integer :: n_halo !! Number of halo points contains - procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st - procedure :: preprocess_dist, preprocess_thom + procedure :: deriv_1st !! Set up first derivative operator + procedure :: deriv_2nd !! Set up second derivative operator + procedure :: interpl_mid !! Set up interpolation operator + procedure :: stagder_1st !! Set up staggered derivative operator + procedure :: preprocess_dist !! Preprocess for distributed algorithm + procedure :: preprocess_thom !! Preprocess for Thomas algorithm end type tdsops_t interface tdsops_t @@ -49,10 +87,21 @@ module m_tdsops !! Directional tridiagonal solver container. !! !! This class contains the preprocessed tridiagonal solvers for operating - !! in each coordinate direction. - class(tdsops_t), allocatable :: der1st, der1st_sym, der2nd, der2nd_sym, & - stagder_v2p, stagder_p2v, interpl_v2p, interpl_p2v - integer :: dir + !! in a specific coordinate direction (x, y, or z). Each direction requires + !! different operators for: + !! - Regular and symmetric first derivatives + !! - Regular and symmetric second derivatives + !! - Staggered derivatives (vertex-to-cell and cell-to-vertex) + !! - Interpolation (vertex-to-cell and cell-to-vertex) + class(tdsops_t), allocatable :: der1st !! First derivative operator + class(tdsops_t), allocatable :: der1st_sym !! Symmetric first derivative operator + class(tdsops_t), allocatable :: der2nd !! Second derivative operator + class(tdsops_t), allocatable :: der2nd_sym !! Symmetric second derivative operator + class(tdsops_t), allocatable :: stagder_v2p !! Staggered derivative (vertex to cell) + class(tdsops_t), allocatable :: stagder_p2v !! Staggered derivative (cell to vertex) + class(tdsops_t), allocatable :: interpl_v2p !! 
Interpolation (vertex to cell) + class(tdsops_t), allocatable :: interpl_p2v !! Interpolation (cell to vertex) + integer :: dir !! Direction index (DIR_X, DIR_Y, DIR_Z) end type dirps_t contains @@ -61,44 +110,57 @@ function tdsops_init( & n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) result(tdsops) - !! Constructor function for the tdsops_t class. + !! Initialise and construct a tridiagonal operator. !! - !! 'n_tds', 'delta', 'operation', 'scheme', 'bc_start', and 'bc_end' are - !! necessary arguments. The remaining arguments are optional. + !! This function creates a preprocessed tridiagonal operator for compact + !! finite difference operations. Required arguments are 'n_tds', 'delta', + !! 'operation', 'scheme', 'bc_start', and 'bc_end'. Optional arguments + !! enable stretched grids, staggered operations, and boundary condition tuning. !! - !! 'stretch' is for obtaining the correct derivations in a stretched mesh - !! 'stretch_correct' is for correcting the second derivative with the first + !! **Operation types:** + !! - 'first-deriv': First derivative \( \frac{\partial f}{\partial x} \) + !! - 'second-deriv': Second derivative \( \frac{\partial^2 f}{\partial x^2} \) + !! - 'interpolate': Interpolation between grids + !! - 'stag-deriv': Staggered derivative (vertex ↔ cell) !! - !! 'from_to' is necessary for interpolation and staggared derivative, and - !! it can be 'v2p' or 'p2v'. - !! If the specific region the instance is operating is not a boundary - !! region, then 'bc_start' and 'bc_end' are BC_HALO. + !! **Boundary conditions:** + !! - BC_PERIODIC: Periodic boundaries + !! - BC_NEUMANN: Neumann (zero gradient) boundaries + !! - BC_DIRICHLET: Dirichlet (fixed value) boundaries !! - !! 'sym' is relevant when the BC is free-slip. If sym is .true. then it - !! means the field we operate on is assumed to be an even function - !! (symmetric, cos type) accross the boundary. 
If it is .false. it means - !! the field is assumed to be an odd function (anti-symmetric, sin type). + !! **Optional stretched grid support:** + !! 'stretch' provides stretching coefficients for non-uniform grids. + !! 'stretch_correct' applies correction for second derivatives on stretched grids. !! - !! 'c_nu', 'nu0_nu' are relevant when operation is second order - !! derivative and scheme is compact6-hyperviscous. + !! **Staggered operations:** + !! 'from_to' specifies direction: 'v2p' (vertex-to-cell) or 'p2v' (cell-to-vertex) + !! + !! **Symmetry for free-slip boundaries:** + !! 'sym' determines field symmetry at Neumann boundaries: + !! - .true. = symmetric (cos-type, even function) + !! - .false. = anti-symmetric (sin-type, odd function) + !! + !! **Hyperviscosity parameters:** + !! 'c_nu' and 'nu0_nu' are used for compact6-hyperviscous second derivatives implicit none - type(tdsops_t) :: tdsops !! return value of the function - - integer, intent(in) :: n_tds !! Tridiagonal system size - real(dp), intent(in) :: delta !! Grid spacing - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end !! Boundary Cond. - real(dp), optional, intent(in) :: stretch(:) !! Stretching coefficients - real(dp), optional, intent(in) :: stretch_correct(:) !! Stretch correction - integer, optional, intent(in) :: n_halo !! Number of halo cells - character(*), optional, intent(in) :: from_to !! 'v2p' or 'p2v' - logical, optional, intent(in) :: sym !! (==npaire), only for Neumann BCs - real(dp), optional, intent(in) :: c_nu, nu0_nu !! params for hypervisc. + type(tdsops_t) :: tdsops !! Constructed tridiagonal operator + + integer, intent(in) :: n_tds !! Tridiagonal system size + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation !! Operation type + character(*), intent(in) :: scheme !! Numerical scheme name + integer, intent(in) :: bc_start, bc_end !! 
Boundary conditions + real(dp), optional, intent(in) :: stretch(:) !! Grid stretching coefficients + real(dp), optional, intent(in) :: stretch_correct(:) !! Stretch correction + integer, optional, intent(in) :: n_halo !! Number of halo cells + character(*), optional, intent(in) :: from_to !! Staggering: 'v2p' or 'p2v' + logical, optional, intent(in) :: sym !! Symmetry for Neumann BCs + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Hyperviscosity parameters #ifdef SINGLE_PREC - real(dp) :: tol = 1e-12 + real(dp) :: tol = 1e-12 !! Tolerance for checking small coefficients in single precision #else - real(dp) :: tol = 1e-16 + real(dp) :: tol = 1e-16 !! Tolerance for checking small coefficients in double precision #endif integer :: n, n_stencil @@ -197,13 +259,28 @@ function tdsops_init( & end function tdsops_init subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) + !! Set up first derivative operator. + !! + !! Configures the compact finite difference operator for computing first + !! derivatives \( \frac{\partial f}{\partial x} \). Supports various compact + !! schemes with different orders of accuracy: + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order accuracy + !! - 'compact6-exp': 6th-order with exponential profile + !! - 'compact6-hyp': 6th-order with hyperbolic profile + !! + !! The operator is built for the tridiagonal system: + !! \[ \alpha f'_{i-1} + f'_i + \alpha f'_{i+1} = a \frac{f_{i+1} - f_{i-1}}{2\Delta x} + b \frac{f_{i+2} - f_{i-2}}{4\Delta x} \] + !! + !! Boundary conditions modify the stencil near domain boundaries. implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! 
Scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, afi, bfi @@ -344,14 +421,30 @@ end subroutine deriv_1st subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & c_nu, nu0_nu) + !! Set up second derivative operator. + !! + !! Configures the compact finite difference operator for computing second + !! derivatives \( \frac{\partial^2 f}{\partial x^2} \). Supports various compact + !! schemes with different orders of accuracy and optional hyperviscosity. + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order accuracy + !! - 'compact6-hyperviscous': 6th-order with selective hyperviscosity + !! + !! The operator is built for the tridiagonal system: + !! \[ \alpha f''_{i-1} + f''_i + \alpha f''_{i+1} = a \frac{f_{i+1} - 2f_i + f_{i-1}}{\Delta x^2} + b \frac{f_{i+2} - 2f_i + f_{i-2}}{4\Delta x^2} \] + !! + !! **Hyperviscosity:** Optional 'c_nu' and 'nu0_nu' parameters enable selective + !! damping of high-frequency modes for numerical stability. implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! Scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs + real(dp), optional, intent(in) :: c_nu !! Hyperviscosity coefficient + real(dp), optional, intent(in) :: nu0_nu !! 
Hyperviscosity parameter real(dp), allocatable :: dist_b(:) real(dp) :: alpha, asi, bsi, csi, dsi @@ -556,12 +649,29 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & end subroutine deriv_2nd subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) + !! Set up interpolation operator between vertex and cell grids. + !! + !! Configures the compact interpolation operator for transferring data + !! between staggered grids (vertex-centred ↔ cell-centred). Uses compact + !! schemes for high-order accuracy. + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order interpolation + !! - 'classic': Classical 2nd-order interpolation + !! + !! **Direction:** + !! - 'v2p': Vertex to cell-centre (pressure point) + !! - 'p2v': Cell-centre to vertex + !! + !! The interpolation is critical for maintaining consistency between + !! velocity and pressure grids in staggered arrangements. implicit none - class(tdsops_t), intent(inout) :: self - character(*), intent(in) :: scheme, from_to - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + character(*), intent(in) :: scheme !! Interpolation scheme name + character(*), intent(in) :: from_to !! Direction: 'v2p' or 'p2v' + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, aici, bici, cici, dici @@ -702,13 +812,32 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) end subroutine interpl_mid subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) + !! Set up staggered first derivative operator. + !! + !! Configures the compact operator for computing first derivatives on + !! staggered grids, where the derivative is computed at a different grid + !! location than the input data. + !! + !! **Supported schemes:** + !! 
- 'compact6': 6th-order staggered derivative + !! - 'classic': Classical 2nd-order staggered derivative + !! + !! **Direction:** + !! - 'v2p': Derivative from vertex grid to cell-centre grid + !! - 'p2v': Derivative from cell-centre grid to vertex grid + !! + !! Staggered derivatives are essential for: + !! - Computing divergence and gradient on staggered grids + !! - Maintaining numerical stability in pressure-velocity coupling + !! - Accurate representation of boundary conditions implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme, from_to - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! Scheme name + character(*), intent(in) :: from_to !! Direction: 'v2p' or 'p2v' + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, aci, bci @@ -810,11 +939,23 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) end subroutine stagder_1st subroutine preprocess_dist(self, dist_b) + !! Preprocess tridiagonal system for distributed algorithm. + !! + !! This subroutine preprocesses the tridiagonal matrix coefficients for + !! use in the distributed (parallel) tridiagonal solver algorithm. The + !! preprocessing follows Algorithm 3 from: + !! Reference: DOI: 10.1109/MCSE.2021.3130544 + !! + !! The distributed algorithm consists of two phases: + !! 1. **Forward/backward elimination**: Reduces the system in parallel subdomains + !! 2. **Back-substitution**: Applies corrections from neighbouring ranks + !! + !! This preprocessing computes the coefficients (dist_fw, dist_bw, dist_sa, + !! 
dist_sc, dist_af) needed for both phases, enabling efficient parallel execution. implicit none - class(tdsops_t), intent(inout) :: self - - real(dp), dimension(:), intent(in) :: dist_b + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), dimension(:), intent(in) :: dist_b !! Diagonal coefficients of tridiagonal system integer :: i @@ -869,10 +1010,24 @@ subroutine preprocess_dist(self, dist_b) end subroutine preprocess_dist subroutine preprocess_thom(self, b) + !! Preprocess tridiagonal system for Thomas algorithm. + !! + !! This subroutine preprocesses the tridiagonal matrix coefficients for + !! use in the Thomas algorithm (serial tridiagonal solver). The Thomas + !! algorithm is a simplified form of Gaussian elimination optimised for + !! tridiagonal systems. + !! + !! The preprocessing performs forward elimination on the coefficients: + !! \( c'_i = c_i / (b_i - a_i \cdot c'_{i-1}) \) + !! \( d'_i = (d_i - a_i \cdot d'_{i-1}) / (b_i - a_i \cdot c'_{i-1}) \) + !! + !! This enables efficient back-substitution during the solve phase. This + !! algorithm is used within individual MPI ranks when the distributed + !! algorithm is employed, or for the entire domain in serial execution. implicit none - class(tdsops_t), intent(inout) :: self - real(dp), dimension(:), intent(in) :: b + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), dimension(:), intent(in) :: b !! Diagonal coefficients of tridiagonal system integer :: i, n diff --git a/src/time_integrator.f90 b/src/time_integrator.f90 index 0ac159246..1bcf2bdf5 100644 --- a/src/time_integrator.f90 +++ b/src/time_integrator.f90 @@ -1,4 +1,26 @@ module m_time_integrator + !! Time integration schemes for temporal advancement. + !! + !! This module provides explicit time integration methods for advancing + !! solutions in time. It supports two families of schemes: + !! + !! **1. Runge-Kutta (RK) Methods** + !! 
Multi-stage schemes that achieve high-order accuracy within a single + !! timestep. Supported orders: RK1 (Euler), RK2, RK3, RK4. Each stage + !! requires an evaluation of the right-hand side (derivative). + !! + !! **2. Adams-Bashforth (AB) Methods** + !! Multi-step schemes that use derivative information from previous + !! timesteps to achieve high-order accuracy. Supported orders: AB1, AB2, + !! AB3, AB4. These methods are more memory-efficient than RK schemes + !! for the same order of accuracy. + !! + !! The `time_intg_t` type encapsulates all integration state and provides + !! a unified interface through the step procedure pointer, which routes + !! to either runge_kutta() or adams_bashforth() based on the selected method. + !! + !! Old timestep/stage data is stored in the `olds` array and managed + !! automatically through rotation mechanisms for AB methods. use m_allocator, only: allocator_t use m_base_backend, only: base_backend_t use m_common, only: dp, DIR_X @@ -9,19 +31,65 @@ module m_time_integrator private adams_bashforth, runge_kutta type :: time_intg_t - integer :: method, istep, istage, order, nstep, nstage, nvars, nolds - real(dp) :: coeffs(4, 4) - real(dp) :: rk_b(4, 4) - real(dp) :: rk_a(3, 3, 4) - character(len=3) :: sname - type(flist_t), allocatable :: olds(:, :) - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - procedure(stepper_func), pointer :: step => null() + !! Time integrator for explicit multi-step and multi-stage methods. + !! + !! This type encapsulates all data and methods needed for time integration + !! of ordinary differential equations (ODEs) arising from spatial discretization + !! of the Navier-Stokes equations: + !! + !! \[ + !! \frac{d\mathbf{u}}{dt} = \mathbf{F}(\mathbf{u}, t) + !! \] + !! + !! where \(\mathbf{F}\) represents the spatial operators (advection, diffusion, + !! pressure gradient, etc.). + !! + !! **Supported Methods:** + !! + !! 
- **Adams-Bashforth (AB1-AB4)**: Explicit multi-step methods using + !! previous timestep derivatives. Efficient (single evaluation per step) + !! but requires startup procedure for higher orders. + !! - **Runge-Kutta (RK1-RK4)**: Explicit multi-stage methods using + !! intermediate stages within a timestep. Self-starting but requires + !! multiple evaluations per step. + !! + !! **Method Selection:** + !! + !! The `step` procedure pointer is bound at initialization to either + !! `runge_kutta()` or `adams_bashforth()` based on the method name + !! (e.g., "AB3" or "RK4"), enabling polymorphic time stepping. + !! + !! **Data Management:** + !! + !! - **AB methods**: Store previous timestep derivatives in `olds` array, + !! rotated each timestep to maintain history + !! - **RK methods**: Store intermediate stage solutions in `olds` array, + !! overwritten within each timestep + !! + !! **Startup Procedure (AB only):** + !! + !! Higher-order AB methods (AB2-AB4) ramp up from first-order during initial + !! timesteps until sufficient derivative history is available. + integer :: method !! Integration method identifier (unused, kept for compatibility) + integer :: istep !! Current timestep number (for AB startup ramping) + integer :: istage !! Current stage number within timestep (RK only) + integer :: order !! Order of accuracy of the scheme (1-4) + integer :: nstep !! Number of timesteps needed (AB: order, RK: 1) + integer :: nstage !! Number of stages per timestep (AB: 1, RK: order) + integer :: nvars !! Number of variables being integrated + integer :: nolds !! Number of old derivatives/solutions to store + real(dp) :: coeffs(4, 4) !! Adams-Bashforth coefficients [stage, order] + real(dp) :: rk_b(4, 4) !! Runge-Kutta final weights [stage, order] + real(dp) :: rk_a(3, 3, 4) !! Runge-Kutta stage weights [from_stage, to_stage, order] + character(len=3) :: sname !! Scheme name (e.g., 'AB3', 'RK4') + type(flist_t), allocatable :: olds(:, :) !! 
Old derivatives/solutions [nvars, nolds] + class(base_backend_t), pointer :: backend !! Computational backend for operations + class(allocator_t), pointer :: allocator !! Memory allocator for field storage + procedure(stepper_func), pointer :: step => null() !! Function pointer to integration method contains - procedure :: finalize - procedure :: runge_kutta - procedure :: adams_bashforth + procedure :: finalize !! Clean up and release allocated memory + procedure :: runge_kutta !! Runge-Kutta time integration implementation + procedure :: adams_bashforth !! Adams-Bashforth time integration implementation end type time_intg_t interface time_intg_t @@ -30,25 +98,34 @@ module m_time_integrator abstract interface subroutine stepper_func(self, curr, deriv, dt) + !! Abstract interface for time stepping functions. + !! + !! Defines the signature for integration methods (RK or AB). + !! Each method takes the current solution, its derivative, and + !! the timestep size, and updates the solution accordingly. import :: time_intg_t import :: dp import :: flist_t implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution variables [nvars] + type(flist_t), intent(in) :: deriv(:) !! Time derivatives of variables [nvars] + real(dp), intent(in) :: dt !! Timestep size end subroutine stepper_func end interface contains subroutine finalize(self) + !! Finalise time integrator and release allocated resources. + !! + !! Releases all field storage blocks used for storing old derivatives + !! or stage solutions, and deallocates the olds array. implicit none !type(time_intg_t), intent(inout) :: self - class(time_intg_t), intent(inout) :: self + class(time_intg_t), intent(inout) :: self !! 
Time integrator to finalise integer :: i, j @@ -67,13 +144,32 @@ subroutine finalize(self) end subroutine finalize function init(backend, allocator, method, nvars) + !! Initialise time integrator with specified method and coefficients. + !! + !! This constructor configures the time integration scheme based on the + !! method string (e.g., 'AB3' or 'RK4'). It initialises all Runge-Kutta + !! and Adams-Bashforth coefficients for orders 1-4, then selects the + !! appropriate method and allocates storage for old derivatives or stages. + !! + !! **Supported Methods:** + !! - AB1, AB2, AB3, AB4: Adams-Bashforth (explicit multi-step) + !! - RK1, RK2, RK3, RK4: Runge-Kutta (explicit multi-stage) + !! + !! **RK Coefficients (Butcher tableau):** + !! - RK1: Forward Euler + !! - RK2: Midpoint method + !! - RK3: Strong Stability Preserving RK3 (SSP-RK3) + !! - RK4: Classical fourth-order Runge-Kutta + !! + !! **AB Coefficients:** + !! Derived from polynomial extrapolation of previous derivatives. implicit none - type(time_intg_t) :: init - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - character(3), intent(in) :: method - integer, intent(in) :: nvars + type(time_intg_t) :: init !! Initialised time integrator + class(base_backend_t), pointer :: backend !! Computational backend + class(allocator_t), pointer :: allocator !! Memory allocator + character(3), intent(in) :: method !! Integration method ('AB3', 'RK4', etc.) + integer, intent(in) :: nvars !! Number of variables to integrate integer :: i, j, stat @@ -160,12 +256,27 @@ function init(backend, allocator, method, nvars) end function init subroutine runge_kutta(self, curr, deriv, dt) + !! Advance solution using Runge-Kutta method. + !! + !! Implements explicit Runge-Kutta schemes of orders 1-4. The general + !! form for an s-stage RK method is: + !! + !! \[ k_i = f(t_n + c_i \Delta t, u_n + \Delta t \sum_{j=1}^{i-1} a_{ij} k_j) \] + !! 
\[ u_{n+1} = u_n + \Delta t \sum_{i=1}^{s} b_i k_i \] + !! + !! Where \( k_i \) are stage derivatives, \( a_{ij} \) are stage weights, + !! and \( b_i \) are final combination weights. This implementation stores + !! stage derivatives in `olds(:, 2:nstage+1)` and the initial solution in + !! `olds(:, 1)`. + !! + !! The subroutine is called once per stage. When `istage == nstage`, it + !! computes the final solution and resets the stage counter. implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution (updated) + type(flist_t), intent(in) :: deriv(:) !! Stage derivative + real(dp), intent(in) :: dt !! Timestep size integer :: i, j @@ -219,12 +330,27 @@ subroutine runge_kutta(self, curr, deriv, dt) end subroutine runge_kutta subroutine adams_bashforth(self, curr, deriv, dt) + !! Advance solution using Adams-Bashforth method. + !! + !! Implements explicit Adams-Bashforth schemes of orders 1-4. These + !! multi-step methods use derivatives from previous timesteps: + !! + !! \[ u_{n+1} = u_n + \Delta t \sum_{i=0}^{s-1} b_i f_{n-i} \] + !! + !! Where \( f_{n-i} \) are stored derivatives from previous steps and + !! \( b_i \) are the Adams-Bashforth coefficients. The method has an + !! automatic startup phase: for the first `order` steps, it uses a + !! lower-order scheme (e.g., AB2 uses AB1 on step 1, then AB2 on step 2+). + !! + !! Old derivatives are stored in `olds(:, 1:nstep-1)` and rotated after + !! each step. The current derivative is used directly and then stored + !! in `olds(:, 1)` for the next timestep. implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! 
Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution (updated) + type(flist_t), intent(in) :: deriv(:) !! Current time derivative + real(dp), intent(in) :: dt !! Timestep size integer :: i, j integer :: nstep @@ -266,10 +392,19 @@ subroutine adams_bashforth(self, curr, deriv, dt) end subroutine adams_bashforth subroutine rotate(sol, n) + !! Rotate pointer array for Adams-Bashforth old derivatives. + !! + !! Shifts pointers in the array to make room for a new derivative: + !! sol(i) <- sol(i-1) for i from n down to 2, and sol(1) gets the + !! old sol(n). This implements a circular buffer for old derivatives + !! without copying data - only pointers are reassigned. + !! + !! Example for n=3: [new, old1, old2] becomes [?, new, old1] + !! (where ? will be filled with the newest derivative) implicit none - type(flist_t), intent(inout) :: sol(:) - integer, intent(in) :: n + type(flist_t), intent(inout) :: sol(:) !! Array of field list pointers to rotate + integer, intent(in) :: n !! Number of elements to rotate integer :: i class(field_t), pointer :: ptr diff --git a/src/vector_calculus.f90 b/src/vector_calculus.f90 index cf1a1da4d..411332b7f 100644 --- a/src/vector_calculus.f90 +++ b/src/vector_calculus.f90 @@ -1,4 +1,31 @@ module m_vector_calculus + !! Vector calculus operators for finite-difference. + !! + !! This module provides implementations of fundamental differential operators + !! (divergence, gradient, curl, Laplacian) on staggered and collocated grids. + !! All operators are built using high-order compact finite-difference schemes + !! from the tdsops module. + !! + !! **Key Features:** + !! + !! - **Staggered grid support**: Operators handle transitions between cell centres + !! (`CELL`) and vertices (`VERT`) through staggered derivatives and interpolation + !! - **Data reordering**: Automatically manages pencil decomposition, reordering + !! fields between \( X, Y, Z \) orientations as needed for derivatives + !! 
- **Memory efficiency**: Uses allocator blocks for temporary fields with + !! careful release management to minimise memory footprint + !! + !! **Grid Conventions:** + !! + !! - `CELL` (`data_loc=CELL`): Variables stored at cell centres (e.g., pressure) + !! - `VERT` (`data_loc=VERT`): Variables stored at cell vertices (e.g., velocity) + !! - Staggered operators (`v2c`, `c2v`) transition between these locations + !! + !! **Data Layouts:** + !! + !! - `DIR_X`: Pencil decomposed in \( X \) direction (default for most operations) + !! - `DIR_Y`: Pencil decomposed in \( Y \) direction (for Y derivatives) + !! - `DIR_Z`: Pencil decomposed in \( Z \) direction (for Z derivatives) use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -11,13 +38,16 @@ module m_vector_calculus implicit none type :: vector_calculus_t - !! Defines vector calculus operators - class(base_backend_t), pointer :: backend + !! Container for vector calculus operators. + !! + !! Provides methods for computing curl, divergence, gradient, and Laplacian. + !! All operations are delegated to the backend for computational flexibility. + class(base_backend_t), pointer :: backend !! Computational backend (CPU/GPU) contains - procedure :: curl - procedure :: divergence_v2c - procedure :: gradient_c2v - procedure :: laplacian + procedure :: curl !! Compute curl (vorticity) of vector field + procedure :: divergence_v2c !! Compute divergence from vertices to cell centres + procedure :: gradient_c2v !! Compute gradient from cell centres to vertices + procedure :: laplacian !! Compute Laplacian of scalar field end type vector_calculus_t interface vector_calculus_t @@ -27,10 +57,15 @@ module m_vector_calculus contains function init(backend) result(vector_calculus) + !! Initialise vector calculus module with computational backend. + !! + !! Simply stores a pointer to the backend, which provides access to + !! the allocator, reordering routines, and tridiagonal solvers needed + !! 
for computing derivatives. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(vector_calculus_t) :: vector_calculus + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(vector_calculus_t) :: vector_calculus !! Initialised vector calculus object vector_calculus%backend => backend @@ -142,21 +177,33 @@ subroutine divergence_v2c(self, div_u, u, v, w, & x_stagder_v2c, x_interpl_v2c, & y_stagder_v2c, y_interpl_v2c, & z_stagder_v2c, z_interpl_v2c) - !! Divergence of a vector field (u, v, w). + !! Compute divergence of a vector field from vertices to cell centres. + !! + !! Computes: + !! \[ \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + + !! \frac{\partial v}{\partial y} + \frac{\partial w}{\partial z} \] + !! + !! Input velocity components (u, v, w) are at vertices (VERT), and + !! divergence is evaluated at cell centres (CELL). This requires: + !! - **Staggered derivatives** in the aligned direction (e.g., du/dx uses x_stagder_v2c) + !! - **Interpolation** for cross terms (e.g., v and w interpolated in x direction) !! - !! Evaluated at the cell centers (data_loc=CELL) - !! Input fields are at vertices (data_loc=VERT) + !! The algorithm proceeds dimension by dimension: + !! 1. Compute du/dx (staggered), interpolate dv/dx, dw/dx in DIR_X + !! 2. Reorder to DIR_Y, compute dv/dy (staggered), interpolate du/dy, dw/dy + !! 3. Reorder to DIR_Z, compute dw/dz (staggered), interpolate du/dz + !! 4. Sum all components: div = du/dx + dv/dy + dw/dz !! - !! Input fields are in DIR_X data layout. - !! Output field is in DIR_Z data layout. + !! **Input:** All fields in DIR_X layout + !! 
**Output:** div_u in DIR_Z layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: div_u - class(field_t), intent(in) :: u, v, w - class(tdsops_t), intent(in) :: x_stagder_v2c, x_interpl_v2c, & - y_stagder_v2c, y_interpl_v2c, & - z_stagder_v2c, z_interpl_v2c + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: div_u !! Divergence output (CELL, DIR_Z) + class(field_t), intent(in) :: u, v, w !! Velocity components (VERT, DIR_X) + class(tdsops_t), intent(in) :: x_stagder_v2c, x_interpl_v2c, & !! X operators + y_stagder_v2c, y_interpl_v2c, & !! Y operators + z_stagder_v2c, z_interpl_v2c !! Z operators class(field_t), pointer :: du_x, dv_x, dw_x, & u_y, v_y, w_y, du_y, dv_y, dw_y, & @@ -248,21 +295,34 @@ subroutine gradient_c2v(self, dpdx, dpdy, dpdz, p, & x_stagder_c2v, x_interpl_c2v, & y_stagder_c2v, y_interpl_c2v, & z_stagder_c2v, z_interpl_c2v) - !! Gradient of a scalar field 'p'. + !! Compute gradient of a scalar field from cell centres to vertices. + !! + !! Computes: + !! \[ \nabla p = \left( \frac{\partial p}{\partial x}, + !! \frac{\partial p}{\partial y}, \frac{\partial p}{\partial z} \right) \] + !! + !! Input pressure p is at cell centres (CELL), and gradient components + !! are evaluated at vertices (VERT). This is the inverse operation of + !! divergence_v2c and is used in projection methods for incompressible flow. !! - !! Evaluated at the vertices (data_loc=VERT) - !! Input field is at cell centers (data_loc=CELL) + !! The algorithm proceeds in reverse order (Z to Y to X): + !! 1. Compute dp/dz (staggered), interpolate p in Z direction (DIR_Z) + !! 2. Reorder to DIR_Y, compute dp/dy (staggered), interpolate p and dpdz + !! 3. Reorder to DIR_X, compute dp/dx (staggered), interpolate dpdy and dpdz !! - !! Input field is in DIR_Z data layout. - !! Output fields (dpdx, dpdy, dpdz) are in DIR_X data layout. + !! 
This reverse ordering optimises memory usage by minimising temporary + !! field allocations. + !! + !! **Input:** p in DIR_Z layout + !! **Output:** dpdx, dpdy, dpdz in DIR_X layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: dpdx, dpdy, dpdz - class(field_t), intent(in) :: p - class(tdsops_t), intent(in) :: x_stagder_c2v, x_interpl_c2v, & - y_stagder_c2v, y_interpl_c2v, & - z_stagder_c2v, z_interpl_c2v + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: dpdx, dpdy, dpdz !! Gradient components (VERT, DIR_X) + class(field_t), intent(in) :: p !! Scalar field (CELL, DIR_Z) + class(tdsops_t), intent(in) :: x_stagder_c2v, x_interpl_c2v, & !! X operators + y_stagder_c2v, y_interpl_c2v, & !! Y operators + z_stagder_c2v, z_interpl_c2v !! Z operators class(field_t), pointer :: p_sxy_z, dpdz_sxy_z, & p_sxy_y, dpdz_sxy_y, & @@ -331,18 +391,31 @@ subroutine gradient_c2v(self, dpdx, dpdy, dpdz, p, & end subroutine gradient_c2v subroutine laplacian(self, lapl_u, u, x_der2nd, y_der2nd, z_der2nd) - !! Laplacian of a scalar field 'u'. + !! Compute Laplacian of a scalar field. + !! + !! Computes: + !! \[ \nabla^2 u = \frac{\partial^2 u}{\partial x^2} + + !! \frac{\partial^2 u}{\partial y^2} + \frac{\partial^2 u}{\partial z^2} \] + !! + !! The Laplacian is evaluated at the same grid location (CELL or VERT) + !! as the input field. This operator is used in diffusion terms and + !! Poisson equations. !! - !! Evaluated at the data_loc defined by the input u field + !! The algorithm computes second derivatives in each direction: + !! 1. Compute \( d^2u/dx^2 \) directly in DIR_X + !! 2. Reorder to DIR_Y, compute \( d^2u/dy^2 \), sum into result via sum_yintox + !! 3. Reorder to DIR_Z, compute \( d^2u/dz^2 \), sum into result via sum_zintox !! - !! Input and output fields are in DIR_X layout. + !! The sum_yintox and sum_zintox operations add directional derivatives + !! 
directly into the DIR_X result field without additional reordering. + !! + !! **Input/Output:** All fields in DIR_X layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: lapl_u - class(field_t), intent(in) :: u - - class(tdsops_t), intent(in) :: x_der2nd, y_der2nd, z_der2nd + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: lapl_u !! Laplacian output (same data_loc as u, DIR_X) + class(field_t), intent(in) :: u !! Scalar field (DIR_X) + class(tdsops_t), intent(in) :: x_der2nd, y_der2nd, z_der2nd !! Second derivative operators class(field_t), pointer :: u_y, d2u_y, u_z, d2u_z diff --git a/src/xcompact.f90 b/src/xcompact.f90 index dcfed3fd8..663d75f14 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -1,4 +1,44 @@ program xcompact + !! Main program for X3D2 CFD solver. + !! + !! X3D2 is a high-order finite-difference incompressible Navier-Stokes + !! solver based on Xcompact3D/Incompact3D. It solves the incompressible + !! Navier-Stokes equations using: + !! + !! - **Compact finite differences** for spatial derivatives (4th-6th order) + !! - **Fractional-step method** for pressure-velocity coupling + !! - **FFT-based or iterative Poisson solvers** for pressure + !! - **Explicit time integration** (Runge-Kutta or Adams-Bashforth) + !! + !! **Program Flow:** + !! + !! 1. Initialise MPI and determine rank/size + !! 2. Select computational backend (CUDA GPU or OpenMP CPU) + !! 3. Read configuration from input file (domain and solver parameters) + !! 4. Create mesh with domain decomposition (pencil decomposition) + !! 5. Instantiate allocator and backend for the selected platform + !! 6. Select and instantiate flow case (channel, TGV, generic, etc.) + !! 7. Run simulation via flow_case%run() + !! 8. Report timing and finalise MPI + !! + !! **Backend Options:** + !! + !! - **CUDA**: GPU acceleration via NVIDIA CUDA (compile with -DCUDA) + !! 
- **OMP**: CPU parallelism via OpenMP threading + !! + !! **Input:** Namelist file specified as command-line argument (e.g., input.x3d) + !! + !! **Domain Decomposition:** + !! + !! X3D2 supports two decomposition strategies: + !! + !! - **2DECOMP&FFT**: External library used when FFT Poisson solver + OMP backend. + !! Provides optimised pencil decomposition and FFT transforms. Cannot decompose + !! in X-direction (`nproc_dir(1)` must be 1). + !! - **Generic**: Built-in X3D2 decomposition used for CUDA backend or when + !! 2DECOMP&FFT is unavailable. Can decompose in any direction (X, Y, Z). + !! + !! The decomposition is selected automatically based on backend and solver type. use mpi use m_allocator @@ -22,30 +62,31 @@ program xcompact implicit none - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - type(allocator_t), pointer :: host_allocator - type(mesh_t), target :: mesh - class(base_case_t), allocatable :: flow_case + class(base_backend_t), pointer :: backend !! Active computational backend (CUDA or OMP) + class(allocator_t), pointer :: allocator !! Memory allocator for device/host + type(allocator_t), pointer :: host_allocator !! Host memory allocator (for I/O, etc.) + type(mesh_t), target :: mesh !! Computational mesh with decomposition + class(base_case_t), allocatable :: flow_case !! Flow case instance (polymorphic) #ifdef CUDA - type(cuda_backend_t), target :: cuda_backend - type(cuda_allocator_t), target :: cuda_allocator - integer :: ndevs, devnum + type(cuda_backend_t), target :: cuda_backend !! CUDA backend implementation + type(cuda_allocator_t), target :: cuda_allocator !! CUDA device memory allocator + integer :: ndevs, devnum !! Number of GPUs, assigned device number #else - type(omp_backend_t), target :: omp_backend + type(omp_backend_t), target :: omp_backend !! OpenMP backend implementation #endif - type(allocator_t), target :: omp_allocator + type(allocator_t), target :: omp_allocator !! 
Host/CPU memory allocator - real(dp) :: t_start, t_end + real(dp) :: t_start, t_end !! CPU timing for performance measurement - type(domain_config_t) :: domain_cfg - type(solver_config_t) :: solver_cfg - character(32) :: backend_name - integer :: dims(3), nrank, nproc, ierr - logical :: use_2decomp + type(domain_config_t) :: domain_cfg !! Domain configuration from input file + type(solver_config_t) :: solver_cfg !! Solver configuration from input file + character(32) :: backend_name !! Backend name string ("CUDA" or "OMP") + integer :: dims(3), nrank, nproc, ierr !! Dimensions, MPI rank/size, error code + logical :: use_2decomp !! Whether to use 2DECOMP&FFT library + ! Initialise MPI call MPI_Init(ierr) call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) @@ -74,7 +115,9 @@ program xcompact domain_cfg%nproc_dir = [1, 1, nproc] end if - ! Decide whether 2decomp is used or not + ! Select decomposition strategy: + ! - 2DECOMP&FFT: Used for FFT Poisson solver with OMP backend (optimised) + ! - Generic: Used for CUDA backend or non-FFT solvers (more flexible) use_2decomp = solver_cfg%poisson_solver_type == 'FFT' & .and. trim(backend_name) == 'OMP'