From 2225990413c39045c69b7d9a80303f87c615cd78 Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Wed, 10 Dec 2025 17:55:42 +0000 Subject: [PATCH 01/12] add comments to src files --- src/allocator.f90 | 55 +++++++++++----- src/common.f90 | 100 ++++++++++++++++++++--------- src/config.f90 | 122 +++++++++++++++++++++++++---------- src/field.f90 | 88 ++++++++++++++++--------- src/mesh.f90 | 128 ++++++++++++++++++++++++------------- src/mesh_content.f90 | 148 +++++++++++++++++++++++++------------------ src/ordering.f90 | 10 +-- 7 files changed, 432 insertions(+), 219 deletions(-) diff --git a/src/allocator.f90 b/src/allocator.f90 index 698beda79..af6c57361 100644 --- a/src/allocator.f90 +++ b/src/allocator.f90 @@ -1,4 +1,11 @@ module m_allocator + !! Memory allocator module for managing field data blocks. + !! + !! This module provides an allocator type that manages a pool of memory blocks + !! (field_t objects) organised in a linked list. The allocator supports efficient + !! memory reuse by allowing blocks to be requested and released, minimizing + !! allocation/deallocation overhead during simulations. + use iso_fortran_env, only: stderr => error_unit use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C, NULL_LOC @@ -34,16 +41,18 @@ module m_allocator !! [[m_allocator(module):release_block(subroutine)]]. The !! released block is then pushed in front of the block list. - integer :: ngrid, sz - !> The id for the next allocated block. This counter is - !> incremented each time a new block is allocated. + integer :: ngrid !! Total number of grid points per block + integer :: sz !! Block size for data reordering + !> The ID for the next allocated block. This counter is + !! incremented each time a new block is allocated. integer :: next_id = 0 - !> padded dimensions and n_groups in all 'dir's + !> Padded dimensions in all directions [3 dims x 4 directions]. + !! Dimensions are padded based on block size for efficient reordering. 
integer, private :: dims_padded_dir(3, 4) + !> Number of groups for reordering in each direction [x, y, z]. integer, private :: n_groups_dir(3) - !> The pointer to the first block on the list. Non associated if - !> the list is empty - ! TODO: Rename first to head + !> Pointer to the first block on the linked list. Non-associated if + !! the list is empty. (TODO: Rename first to head) class(field_t), pointer :: first => null() contains procedure :: get_block @@ -62,8 +71,14 @@ module m_allocator contains function allocator_init(dims, sz) result(allocator) - integer, intent(in) :: dims(3), sz - type(allocator_t) :: allocator + !! Initialise an allocator for the given grid dimensions and block size. + !! + !! Creates a new allocator configured for the specified grid dimensions + !! with the given block size. Computes padded dimensions and number of + !! groups for efficient data reordering operations. + integer, intent(in) :: dims(3) !! Grid dimensions [nx, ny, nz] + integer, intent(in) :: sz !! Block size for reordering + type(allocator_t) :: allocator !! Initialised allocator integer :: nx, ny, nz, nx_padded, ny_padded, nz_padded @@ -205,21 +220,31 @@ function get_block_ids(self) end function get_block_ids function get_padded_dims(self, dir) result(dims) + !! Get padded dimensions for a specific direction. + !! + !! Returns the padded dimensions used for memory allocation in the + !! specified direction. Padding is applied to ensure efficient memory + !! access patterns and alignment. implicit none - class(allocator_t), intent(inout) :: self - integer, intent(in) :: dir - integer :: dims(3) + class(allocator_t), intent(inout) :: self !! Allocator object + integer, intent(in) :: dir !! Direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer :: dims(3) !! Padded dimensions [nx_pad, ny_pad, nz_pad] dims = self%dims_padded_dir(1:3, dir) end function get_padded_dims function get_n_groups(self, dir) result(n_groups) + !! 
Get number of groups for data reordering in a direction. + !! + !! Returns the number of groups used for data reordering operations + !! in the specified direction. Groups are determined by the block size + !! and grid dimensions. implicit none - class(allocator_t), intent(inout) :: self - integer, intent(in) :: dir - integer :: n_groups + class(allocator_t), intent(inout) :: self !! Allocator object + integer, intent(in) :: dir !! Direction (DIR_X, DIR_Y, or DIR_Z) + integer :: n_groups !! Number of groups n_groups = self%n_groups_dir(dir) end function get_n_groups diff --git a/src/common.f90 b/src/common.f90 index 8eba41c6f..0fec74609 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -1,42 +1,60 @@ module m_common + !! Common module containing global constants, parameters, and utility functions. + !! + !! This module provides: + !! - Precision definitions (single or double precision based on compilation flags) + !! - Mathematical constants (e.g., \(\pi\)) + !! - Direction and reordering constants for domain decomposition + !! - Data location flags (vertex, cell, face, edge centered) + !! - Boundary condition type constants + !! - Utility functions for argument parsing and data manipulation use mpi implicit none #ifdef SINGLE_PREC - integer, parameter :: dp = kind(0.0e0) - integer, parameter :: nbytes = 4 - integer, parameter :: MPI_X3D2_DP = MPI_REAL - logical, parameter :: is_sp = .true. + integer, parameter :: dp = kind(0.0e0) !! Double precision kind parameter (single precision) + integer, parameter :: nbytes = 4 !! Number of bytes for real numbers + integer, parameter :: MPI_X3D2_DP = MPI_REAL !! MPI datatype for real numbers + logical, parameter :: is_sp = .true. !! Flag indicating single precision #else - integer, parameter :: dp = kind(0.0d0) - integer, parameter :: nbytes = 8 - integer, parameter :: MPI_X3D2_DP = MPI_DOUBLE_PRECISION - logical, parameter :: is_sp = .false. + integer, parameter :: dp = kind(0.0d0) !! 
Double precision kind parameter (double precision) + integer, parameter :: nbytes = 8 !! Number of bytes for real numbers + integer, parameter :: MPI_X3D2_DP = MPI_DOUBLE_PRECISION !! MPI datatype for real numbers + logical, parameter :: is_sp = .false. !! Flag indicating double precision #endif - integer, parameter :: sp = kind(0.0e0) - integer, parameter :: i8 = selected_int_kind(18) + integer, parameter :: i8 = selected_int_kind(18) !! Integer kind for 64-bit integers - real(dp), parameter :: pi = 4*atan(1.0_dp) + real(dp), parameter :: pi = 4*atan(1.0_dp) !! Mathematical constant \(\pi\) + !> Reordering constants for data layout transformations between directions. + !! Format: RDR_2 where directions are X, Y, Z, or C (complete/cell-centered) integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, & RDR_C2X = 41, RDR_C2Y = 42, RDR_C2Z = 43, & RDR_X2C = 14, RDR_Y2C = 24, RDR_Z2C = 34 - integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4 - integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 - integer, parameter :: VERT = 0000, & ! Vertex centered data - CELL = 1110, & ! Cell centered data - X_FACE = 1100, & ! Data on faces normal to X - Y_FACE = 1010, & ! Data on faces normal to Y - Z_FACE = 0110, & ! Data on faces normal to Z - X_EDGE = 0010, & ! Data on edges along X - Y_EDGE = 0100, & ! Data on edges along Y - Z_EDGE = 1000, & ! Data on edges along Z - NULL_LOC = -0001 ! The location of data isn't specified - integer, parameter :: BC_PERIODIC = 0, BC_NEUMANN = 1, BC_DIRICHLET = 2, & - BC_HALO = -1 + integer, parameter :: DIR_X = 1 !! X direction index + integer, parameter :: DIR_Y = 2 !! Y direction index + integer, parameter :: DIR_Z = 3 !! Z direction index + integer, parameter :: DIR_C = 4 !! Complete/cell-centered direction index + integer, parameter :: POISSON_SOLVER_FFT = 0 !! FFT-based Poisson solver + integer, parameter :: POISSON_SOLVER_CG = 1 !! 
Conjugate gradient Poisson solver + integer, parameter :: VERT = 0000, & !! Vertex centered data + CELL = 1110, & !! Cell centered data + X_FACE = 1100, & !! Data on faces normal to X + Y_FACE = 1010, & !! Data on faces normal to Y + Z_FACE = 0110, & !! Data on faces normal to Z + X_EDGE = 0010, & !! Data on edges along X + Y_EDGE = 0100, & !! Data on edges along Y + Z_EDGE = 1000, & !! Data on edges along Z + NULL_LOC = -0001 !! The location of data isn't specified + integer, parameter :: BC_PERIODIC = 0 !! Periodic boundary condition + integer, parameter :: BC_NEUMANN = 1 !! Neumann boundary condition + integer, parameter :: BC_DIRICHLET = 2 !! Dirichlet boundary condition + integer, parameter :: BC_HALO = -1 !! Halo/ghost cell boundary condition + !> Reordering map matrix for direction transformations. + !! Maps from direction (row) to direction (column), yielding the reordering constant. integer, protected :: & rdr_map(4, 4) = reshape([0, RDR_Y2X, RDR_Z2X, RDR_C2X, & RDR_X2Y, 0, RDR_Z2Y, RDR_C2Y, & @@ -46,8 +64,13 @@ module m_common contains pure subroutine get_dirs_from_rdr(dir_from, dir_to, rdr_dir) - integer, intent(out) :: dir_from, dir_to - integer, intent(in) :: rdr_dir + !! Extract source and destination directions from a reordering constant. + !! + !! Given a reordering constant (e.g., RDR_X2Y), this subroutine determines + !! the source direction and destination direction. + integer, intent(out) :: dir_from !! Source direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(out) :: dir_to !! Destination direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(in) :: rdr_dir !! Reordering constant (e.g., RDR_X2Y) integer, dimension(2) :: dirs dirs = findloc(rdr_map, rdr_dir) @@ -57,15 +80,23 @@ pure subroutine get_dirs_from_rdr(dir_from, dir_to, rdr_dir) end subroutine pure integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir) - !! Returns RDR_?2? value based on two direction inputs - integer, intent(in) :: dir_from, dir_to + !! 
Returns reordering constant based on two direction inputs. + !! + !! Given a source and destination direction, this function returns the + !! corresponding reordering constant (e.g., RDR_X2Y for X to Y). + integer, intent(in) :: dir_from !! Source direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(in) :: dir_to !! Destination direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) rdr_dir = rdr_map(dir_from, dir_to) end function get_rdr_from_dirs function get_argument(pos) result(arg) - integer, intent(in) :: pos - character(:), allocatable :: arg + !! Retrieve a command-line argument at the specified position. + !! + !! This function wraps the intrinsic get_command_argument with error checking + !! and automatic string trimming. + integer, intent(in) :: pos !! Position of the command-line argument (1-indexed) + character(:), allocatable :: arg !! The retrieved command-line argument character(len=200) :: temp integer :: stat @@ -82,7 +113,14 @@ function get_argument(pos) result(arg) end function get_argument integer function move_data_loc(in_data_loc, dir, move) result(out_data_loc) - integer, intent(in) :: in_data_loc, dir, move + !! Update data location by shifting along a specified direction. + !! + !! This function modifies a data location flag by moving it along one direction + !! (X, Y, or Z) by a specified amount. The data location encoding uses powers of 10 + !! to represent positions in each direction. + integer, intent(in) :: in_data_loc !! Input data location flag + integer, intent(in) :: dir !! Direction to move (DIR_X, DIR_Y, or DIR_Z) + integer, intent(in) :: move !! Amount to move (typically -1, 0, or 1) out_data_loc = in_data_loc + move*(10**dir) end function move_data_loc diff --git a/src/config.f90 b/src/config.f90 index 076de2232..b164b3b90 100644 --- a/src/config.f90 +++ b/src/config.f90 @@ -6,54 +6,86 @@ module m_config implicit none - integer, parameter :: n_species_max = 99 + integer, parameter :: n_species_max = 99 !! 
Maximum number of transported species type, abstract :: base_config_t - !! All config types have a method read to initialise their data + !! Base abstract type for all configuration types. + !! + !! All config types have a deferred read method to initialise their data + !! from either a namelist file or a namelist string. contains procedure(read), deferred :: read end type base_config_t type, extends(base_config_t) :: domain_config_t - character(len=30) :: flow_case_name - real(dp) :: L_global(3) - integer :: dims_global(3), nproc_dir(3) - character(len=20) :: BC_x(2), BC_y(2), BC_z(2) - character(len=20) :: stretching(3) - real(dp) :: beta(3) + !! Domain configuration type containing mesh and decomposition settings. + !! + !! This type stores all parameters related to the computational domain, + !! including global dimensions, boundary conditions, mesh stretching, + !! and MPI decomposition. + character(len=30) :: flow_case_name !! Name of the flow case (e.g., 'channel', 'tgv', 'generic') + real(dp) :: L_global(3) !! Global domain lengths in each direction + integer :: dims_global(3) !! Global number of grid points in each direction + integer :: nproc_dir(3) !! Number of processors in each direction + character(len=20) :: BC_x(2) !! Boundary conditions in x-direction (lower, upper) + character(len=20) :: BC_y(2) !! Boundary conditions in y-direction (lower, upper) + character(len=20) :: BC_z(2) !! Boundary conditions in z-direction (lower, upper) + character(len=20) :: stretching(3) !! Mesh stretching type in each direction + real(dp) :: beta(3) !! 
Stretching parameters in each direction contains procedure :: read => read_domain_nml end type domain_config_t type, extends(base_config_t) :: solver_config_t - real(dp) :: Re, dt - logical :: ibm_on - real(dp), dimension(:), allocatable :: pr_species - integer :: n_iters, n_output, n_species - logical :: lowmem_transeq, lowmem_fft - character(3) :: poisson_solver_type, time_intg - character(30) :: der1st_scheme, der2nd_scheme, & - interpl_scheme, stagder_scheme + !! Solver configuration type containing numerical and physical parameters. + !! + !! This type stores parameters related to the numerical solver including + !! Reynolds number, time step, iteration counts, discretisation schemes, + !! and solver options. + real(dp) :: Re !! Reynolds number + real(dp) :: dt !! Time step size + logical :: ibm_on !! Flag to enable immersed boundary method + real(dp), dimension(:), allocatable :: pr_species !! Prandtl numbers for each species + integer :: n_iters !! Total number of iterations + integer :: n_output !! Output frequency (every n_output iterations) + integer :: n_species !! Number of transported scalar species + logical :: lowmem_transeq !! Use low-memory implementation for transport equation + logical :: lowmem_fft !! Use low-memory implementation for FFT + character(3) :: poisson_solver_type !! Poisson solver type ('FFT' or 'CG') + character(3) :: time_intg !! Time integration scheme (e.g., 'RK3', 'AB2') + character(30) :: der1st_scheme !! First derivative scheme (e.g., 'compact6') + character(30) :: der2nd_scheme !! Second derivative scheme (e.g., 'compact6') + character(30) :: interpl_scheme !! Interpolation scheme (e.g., 'classic') + character(30) :: stagder_scheme !! Staggered derivative scheme (e.g., 'compact6') contains procedure :: read => read_solver_nml end type solver_config_t type, extends(base_config_t) :: channel_config_t - real(dp) :: noise, omega_rot - logical :: rotation - integer :: n_rotate + !! Channel flow configuration type. + !! + !! 
This type contains parameters specific to channel flow simulations, + !! including initial perturbations and rotation effects. + real(dp) :: noise !! Initial noise amplitude for perturbations + real(dp) :: omega_rot !! Rotation rate for rotating channel flow + logical :: rotation !! Flag to enable rotation + integer :: n_rotate !! Number of directions to rotate contains procedure :: read => read_channel_nml end type channel_config_t type, extends(base_config_t) :: checkpoint_config_t + !! Checkpoint and snapshot configuration type. + !! + !! This type manages simulation restart and output settings including + !! checkpoint frequency, snapshot frequency, and file naming conventions. integer :: checkpoint_freq = 0 !! Frequency of checkpointing (0 = off) integer :: snapshot_freq = 0 !! Frequency of snapshots (0 = off) logical :: keep_checkpoint = .true. !! If false, only keep latest checkpoint - character(len=256) :: checkpoint_prefix = "checkpoint" - character(len=256) :: snapshot_prefix = "snapshot" - logical :: restart_from_checkpoint = .false. - character(len=256) :: restart_file = "" + character(len=256) :: checkpoint_prefix = "checkpoint" !! Filename prefix for checkpoint files + character(len=256) :: snapshot_prefix = "snapshot" !! Filename prefix for snapshot files + logical :: restart_from_checkpoint = .false. !! Flag to restart from a checkpoint + character(len=256) :: restart_file = "" !! Path to checkpoint file for restart integer, dimension(3) :: output_stride = [2, 2, 2] !! Spatial stride for snapshot output logical :: snapshot_sp = .false. !! if true, snapshot in single precision contains @@ -78,11 +110,16 @@ end subroutine read contains subroutine read_domain_nml(self, nml_file, nml_string) + !! Read domain configuration from a namelist file or string. + !! + !! This subroutine reads the domain_settings namelist containing mesh + !! and domain decomposition parameters. Exactly one of nml_file or + !! nml_string must be provided. 
implicit none - class(domain_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(domain_config_t) :: self !! Domain configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit @@ -124,11 +161,16 @@ subroutine read_domain_nml(self, nml_file, nml_string) end subroutine read_domain_nml subroutine read_solver_nml(self, nml_file, nml_string) + !! Read solver configuration from a namelist file or string. + !! + !! This subroutine reads the solver_params namelist containing numerical + !! and physical parameters for the solver. Exactly one of nml_file or + !! nml_string must be provided. implicit none - class(solver_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(solver_config_t) :: self !! Solver configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit @@ -181,11 +223,16 @@ subroutine read_solver_nml(self, nml_file, nml_string) end subroutine read_solver_nml subroutine read_channel_nml(self, nml_file, nml_string) + !! Read channel flow configuration from a namelist file or string. + !! + !! This subroutine reads the channel_nml namelist containing parameters + !! specific to channel flow simulations. Exactly one of nml_file or + !! nml_string must be provided. implicit none - class(channel_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(channel_config_t) :: self !! Channel configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! 
Namelist as a string integer :: unit @@ -217,11 +264,16 @@ subroutine read_channel_nml(self, nml_file, nml_string) end subroutine read_channel_nml subroutine read_checkpoint_nml(self, nml_file, nml_string) + !! Read checkpoint/snapshot configuration from a namelist file or string. + !! + !! This subroutine reads the checkpoint_params namelist containing settings + !! for checkpointing and snapshot output. Exactly one of nml_file or + !! nml_string must be provided. Uses default values if namelist is missing. implicit none - class(checkpoint_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(checkpoint_config_t) :: self !! Checkpoint configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit, ierr diff --git a/src/field.f90 b/src/field.f90 index 878b5d247..af8b6fc89 100644 --- a/src/field.f90 +++ b/src/field.f90 @@ -1,25 +1,32 @@ module m_field + !! Field data structure module for managing computational grid data. + !! + !! This module provides the field_t type for storing 3D scalar fields + !! on the computational grid. Fields can be organised in linked lists + !! for memory management and support different data orientations + !! (x-pencil, y-pencil, z-pencil). use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C type :: field_t - !! Memory block type holding both a data field and a pointer - !! to the next block. The `field_t` type also holds a integer - !! `refcount` that counts the number of references to this - !! field. User code is currently responsible for incrementing - !! the reference count. - class(field_t), pointer :: next - real(dp), pointer, private :: p_data(:) - real(dp), pointer, contiguous :: data(:, :, :) - integer :: dir - integer :: data_loc - integer :: refcount = 0 - integer :: id !! An integer identifying the memory block. + !! 
Memory block type holding a 3D scalar field with metadata. + !! + !! The field_t type stores both a data field and a pointer to the next + !! block, enabling linked list structures for memory management. The type + !! tracks a reference count (currently managed by user code), data + !! orientation (x-, y-, or z-pencil), and data location on the staggered grid. + class(field_t), pointer :: next !! Pointer to next field in linked list + real(dp), pointer, private :: p_data(:) !! 1D array storage for data + real(dp), pointer, contiguous :: data(:, :, :) !! 3D view of data array + integer :: dir !! Data direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer :: data_loc !! Data location flag (VERT, CELL, etc.) + integer :: refcount = 0 !! Reference count for memory management + integer :: id !! Unique identifier for this memory block contains - procedure :: fill - procedure :: get_shape - procedure :: set_shape - procedure :: set_data_loc + procedure :: fill !! Fill field with a constant value + procedure :: get_shape !! Get 3D dimensions of data array + procedure :: set_shape !! Set 3D dimensions by reshaping p_data + procedure :: set_data_loc !! Set data location flag end type field_t interface field_t @@ -27,16 +34,25 @@ module m_field end interface field_t type :: flist_t - !! Use for creating a list of field pointers - class(field_t), pointer :: ptr + !! Wrapper type for creating arrays of field pointers. + !! + !! This type is used to create lists or arrays of field pointers, + !! useful for managing multiple fields such as velocity components + !! or transported scalar species. + class(field_t), pointer :: ptr !! Pointer to a field end type flist_t contains function field_init(ngrid, next, id) result(f) - integer, intent(in) :: ngrid, id - type(field_t), pointer, intent(in) :: next - type(field_t) :: f + !! Initialise a new field with allocated memory. + !! + !! Creates a new field_t instance with allocated storage for ngrid points. + !! 
The field is linked to the next field in the list and assigned a unique ID. + integer, intent(in) :: ngrid !! Total number of grid points to allocate + type(field_t), pointer, intent(in) :: next !! Pointer to next field in linked list + integer, intent(in) :: id !! Unique identifier for this field + type(field_t) :: f !! Initialised field allocate (f%p_data(ngrid)) f%refcount = 0 @@ -45,38 +61,52 @@ function field_init(ngrid, next, id) result(f) end function field_init subroutine fill(self, c) + !! Fill the entire field with a constant value. + !! + !! Sets all grid points in the field to the specified constant value. implicit none - class(field_t) :: self - real(dp), intent(in) :: c + class(field_t) :: self !! Field to fill + real(dp), intent(in) :: c !! Constant value to fill with self%p_data(:) = c end subroutine fill subroutine set_data_loc(self, data_loc) - class(field_t) :: self - integer, intent(in) :: data_loc + !! Set the data location flag for this field. + !! + !! The data location specifies where on the staggered grid the data + !! is located (e.g., VERT, CELL, X_FACE, etc.). + class(field_t) :: self !! Field to modify + integer, intent(in) :: data_loc !! Data location flag self%data_loc = data_loc end subroutine function get_shape(self) result(dims) + !! Get the 3D dimensions of the field data. + !! + !! Returns the current shape of the 3D data array. implicit none - class(field_t) :: self - integer :: dims(3) + class(field_t) :: self !! Field to query + integer :: dims(3) !! Array dimensions [nx, ny, nz] dims = shape(self%data) end function get_shape subroutine set_shape(self, dims) + !! Reshape the field data to specified 3D dimensions. + !! + !! Maps the 1D storage array (p_data) to a 3D view with the specified + !! dimensions. The total size must match the allocated storage. implicit none - class(field_t) :: self - integer, intent(in) :: dims(3) + class(field_t) :: self !! Field to reshape + integer, intent(in) :: dims(3) !! 
Target dimensions [nx, ny, nz] self%data(1:dims(1), 1:dims(2), 1:dims(3)) => self%p_data diff --git a/src/mesh.f90 b/src/mesh.f90 index bb0dac1c7..f83694b34 100644 --- a/src/mesh.f90 +++ b/src/mesh.f90 @@ -1,4 +1,11 @@ module m_mesh + !! Mesh module providing high-level mesh management and query functions. + !! + !! This module defines the mesh_t type which aggregates geometry, grid, and + !! parallel decomposition information. It provides methods to query mesh + !! dimensions, coordinates, and other mesh properties for both global and + !! local (per MPI rank) domains. + use iso_fortran_env, only: stderr => error_unit use mpi @@ -11,21 +18,28 @@ module m_mesh implicit none - ! The mesh class stores all the information about the global and local (due to domain decomposition) mesh - ! It also includes getter functions to access some of its parameters type :: mesh_t - type(geo_t), allocatable :: geo ! object containing geometry information - class(grid_t), allocatable :: grid ! object containing grid information - class(par_t), allocatable :: par ! object containing parallel domain decomposition information + !! Mesh type containing all mesh information for the simulation. + !! + !! This type aggregates three main components: + !! - geo: Geometry information (coordinates, stretching) + !! - grid: Grid dimensions and boundary conditions + !! - par: Parallel domain decomposition information + !! + !! The mesh is initialised once and should be treated as read-only + !! during the simulation. + type(geo_t), allocatable :: geo !! Geometry information + class(grid_t), allocatable :: grid !! Grid dimensions and boundary conditions + class(par_t), allocatable :: par !! Parallel decomposition information contains - procedure :: get_dims - procedure :: get_global_dims + procedure :: get_dims !! Get local dimensions for a data location + procedure :: get_global_dims !! 
Get global dimensions for a data location - procedure :: get_n_dir - procedure :: get_n_phi - generic :: get_n => get_n_dir, get_n_phi + procedure :: get_n_dir !! Get number of grid points in a direction + procedure :: get_n_phi !! Get number of grid points for a field + generic :: get_n => get_n_dir, get_n_phi !! Generic interface for get_n - procedure :: get_coordinates + procedure :: get_coordinates !! Get coordinate array for a direction end type mesh_t interface mesh_t @@ -36,18 +50,23 @@ module m_mesh function mesh_init(dims_global, nproc_dir, L_global, BC_x, BC_y, BC_z, & stretching, beta, use_2decomp) result(mesh) + !! Initialise the mesh object with global domain parameters. + !! + !! Creates and fully initialises a mesh object containing geometry, grid, and + !! parallel decomposition information. The mesh should be treated as read-only + !! after initialisation. Supports both uniform and stretched meshes, and can + !! use either 2decomp or generic domain decomposition. use m_decomp, only: is_avail_2decomp, decomposition_2decomp - !! Completely initialise the mesh object. - !! Upon initialisation the mesh object can be read-only and shouldn't be edited - !! Takes as argument global information about the mesh like its length, number of cells and decomposition in each direction - integer, dimension(3), intent(in) :: dims_global - integer, dimension(3), intent(in) :: nproc_dir ! Number of proc in each direction - real(dp), dimension(3), intent(in) :: L_global - character(len=*), dimension(2), intent(in) :: BC_x, BC_y, BC_z - character(len=*), dimension(3), optional, intent(in) :: stretching - real(dp), dimension(3), optional, intent(in) :: beta - logical, optional, intent(in) :: use_2decomp - class(mesh_t), allocatable :: mesh + integer, dimension(3), intent(in) :: dims_global !! Global grid dimensions [nx, ny, nz] + integer, dimension(3), intent(in) :: nproc_dir !! Number of processors in each direction + real(dp), dimension(3), intent(in) :: L_global !! 
Physical domain lengths [Lx, Ly, Lz] + character(len=*), dimension(2), intent(in) :: BC_x !! Boundary conditions in x (lower, upper) + character(len=*), dimension(2), intent(in) :: BC_y !! Boundary conditions in y (lower, upper) + character(len=*), dimension(2), intent(in) :: BC_z !! Boundary conditions in z (lower, upper) + character(len=*), dimension(3), optional, intent(in) :: stretching !! Mesh stretching type per direction + real(dp), dimension(3), optional, intent(in) :: beta !! Stretching parameters per direction + logical, optional, intent(in) :: use_2decomp !! Flag to use 2decomp library + class(mesh_t), allocatable :: mesh !! Initialised mesh object character(len=20), dimension(3, 2) :: BC_all logical :: is_first_domain, is_last_domain @@ -194,19 +213,25 @@ subroutine decomposition_generic(grid, par) end subroutine pure function get_dims(self, data_loc) result(dims) - !! Getter for local domain dimensions - class(mesh_t), intent(in) :: self - integer, intent(in) :: data_loc - integer, dimension(3) :: dims + !! Get local domain dimensions for a specific data location. + !! + !! Returns the dimensions of the local subdomain (on this MPI rank) for + !! the specified data location (VERT, CELL, X_FACE, etc.). + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: data_loc !! Data location flag (VERT, CELL, etc.) + integer, dimension(3) :: dims !! Local dimensions [nx, ny, nz] dims = get_dims_dataloc(data_loc, self%grid%vert_dims, self%grid%cell_dims) end function pure function get_global_dims(self, data_loc) result(dims) - !! Getter for local domain dimensions - class(mesh_t), intent(in) :: self - integer, intent(in) :: data_loc - integer, dimension(3) :: dims + !! Get global domain dimensions for a specific data location. + !! + !! Returns the dimensions of the entire global domain for the specified + !! data location (VERT, CELL, X_FACE, etc.). + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: data_loc !! 
Data location flag (VERT, CELL, etc.) + integer, dimension(3) :: dims !! Global dimensions [nx, ny, nz] dims = get_dims_dataloc(data_loc, self%grid%global_vert_dims, & self%grid%global_cell_dims) @@ -249,21 +274,30 @@ pure function get_dims_dataloc(data_loc, vert_dims, cell_dims) result(dims) end function get_dims_dataloc pure function get_n_phi(self, phi) result(n) - !! Getter for the main dimension of field phi - class(mesh_t), intent(in) :: self - class(field_t), intent(in) :: phi - integer :: n + !! Get the main dimension (pencil length) for a field. + !! + !! Returns the number of grid points along the primary direction for the + !! given field, accounting for both the field's orientation (dir) and + !! data location on the staggered grid. + class(mesh_t), intent(in) :: self !! Mesh object + class(field_t), intent(in) :: phi !! Field to query + integer :: n !! Number of grid points in main direction n = self%get_n(phi%dir, phi%data_loc) end function pure function get_n_dir(self, dir, data_loc) result(n) - !! Getter for the main dimension a field oriented along `dir` with data on `data_loc` - class(mesh_t), intent(in) :: self - integer, intent(in) :: dir - integer, intent(in) :: data_loc - integer :: n, n_cell, n_vert + !! Get the main dimension for a field with given direction and data location. + !! + !! Returns the number of grid points along a specified direction for a field + !! located at the given position on the staggered grid. Handles the different + !! grid dimensions for vertex-centered vs cell-centered data. + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: dir !! Primary direction (DIR_X, DIR_Y, DIR_Z) + integer, intent(in) :: data_loc !! Data location (VERT, CELL, X_FACE, etc.) + integer :: n !! 
Number of grid points in direction + integer :: n_cell, n_vert n_cell = self%grid%cell_dims(dir) n_vert = self%grid%vert_dims(dir) @@ -306,13 +340,17 @@ pure function get_n_dir(self, dir, data_loc) result(n) end function get_n_dir pure function get_coordinates(self, i, j, k, data_loc_op) result(coords) - !! Get the coordinates of a vertex with i, j, k local cartesian indices - !! Avoid calling this in hot loops - class(mesh_t), intent(in) :: self - integer, intent(in) :: i, j, k - integer, optional, intent(in) :: data_loc_op + !! Get physical coordinates for a grid point with given indices. + !! + !! Returns the physical (x, y, z) coordinates for a grid point specified by + !! local Cartesian indices (i, j, k) at the given data location. Default + !! location is vertex-centered (VERT). Note: Avoid calling this function in + !! hot loops due to performance overhead. + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: i, j, k !! Local Cartesian indices + integer, optional, intent(in) :: data_loc_op !! Data location (default: VERT) integer :: data_loc - real(dp), dimension(3) :: coords + real(dp), dimension(3) :: coords !! Physical coordinates [x, y, z] if (present(data_loc_op)) then data_loc = data_loc_op diff --git a/src/mesh_content.f90 b/src/mesh_content.f90 index 322870225..8207edeaa 100644 --- a/src/mesh_content.f90 +++ b/src/mesh_content.f90 @@ -1,80 +1,95 @@ module m_mesh_content + !! Module containing mesh content types for geometry, grid, and parallel decomposition. + !! + !! This module defines three main types: + !! - geo_t: Geometry information including coordinates and mesh stretching + !! - grid_t: Grid dimensions and boundary conditions + !! - par_t: Parallel domain decomposition information use m_common, only: dp, pi implicit none type :: geo_t - !! 
Stores geometry information - !> Origin: coordinates of vertex (1, 1, 1) - real(dp) :: origin(3) - !> size of a cell in each direction for a uniform mesh - real(dp) :: d(3) - !> Global dimensions of the domain in each direction - real(dp) :: L(3) - !> Global coordinates at vertices - real(dp), allocatable, dimension(:, :) :: vert_coords - !> Global coordinates at midpoints - real(dp), allocatable, dimension(:, :) :: midp_coords - !> Stretching type - character(len=20), dimension(3) :: stretching - !> Stretching - logical :: stretched(3) - !> Stretching parameters - real(dp) :: alpha(3), beta(3) - !> Stretching factors at vertices + !! Geometry information type for domain coordinates and mesh stretching. + !! + !! This type stores physical domain dimensions, coordinates at grid points, + !! and mesh stretching parameters. Coordinates and stretching factors are + !! stored for both vertex-centered and cell-centered locations. + real(dp) :: origin(3) !! Coordinates of vertex (1, 1, 1) + real(dp) :: d(3) !! Cell size in each direction for uniform mesh + real(dp) :: L(3) !! Global domain dimensions in each direction + real(dp), allocatable, dimension(:, :) :: vert_coords !! Global coordinates at vertices + real(dp), allocatable, dimension(:, :) :: midp_coords !! Global coordinates at cell midpoints + character(len=20), dimension(3) :: stretching !! Stretching type in each direction + logical :: stretched(3) !! Whether each direction has stretching applied + real(dp) :: alpha(3) !! Stretching parameter \(\alpha\) in each direction + real(dp) :: beta(3) !! 
Stretching parameter \(\beta\) in each direction + !> Stretching factors at vertices: \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), \(\frac{d^2\xi}{ds^2}\) real(dp), allocatable, dimension(:, :) :: vert_ds, vert_ds2, vert_d2s - !> Stretching factors at midpoints + !> Stretching factors at midpoints: \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), \(\frac{d^2\xi}{ds^2}\) real(dp), allocatable, dimension(:, :) :: midp_ds, midp_ds2, midp_d2s contains - procedure :: obtain_coordinates + procedure :: obtain_coordinates !! Compute coordinates and stretching factors end type type :: grid_t - !! Stores grid information - integer, dimension(3) :: global_vert_dims ! global number of vertices in each direction without padding (cartesian structure) - integer, dimension(3) :: global_cell_dims ! global number of cells in each direction without padding (cartesian structure) - - integer, dimension(3) :: vert_dims ! local number of vertices in each direction without padding (cartesian structure) - integer, dimension(3) :: cell_dims ! local number of cells in each direction without padding (cartesian structure) - logical, dimension(3) :: periodic_BC ! Whether or not a direction has a periodic BC - integer, dimension(3, 2) :: BCs_global - integer, dimension(3, 2) :: BCs + !! Grid information type for mesh dimensions and boundary conditions. + !! + !! This type stores both global and local (per MPI rank) grid dimensions, + !! accounting for both vertex-centered and cell-centered data. It also + !! manages boundary condition information. + integer, dimension(3) :: global_vert_dims !! Global number of vertices in each direction + integer, dimension(3) :: global_cell_dims !! Global number of cells in each direction + integer, dimension(3) :: vert_dims !! Local number of vertices in each direction + integer, dimension(3) :: cell_dims !! Local number of cells in each direction + logical, dimension(3) :: periodic_BC !! 
Whether each direction has periodic BC + integer, dimension(3, 2) :: BCs_global !! Global boundary conditions (lower, upper) in each direction + integer, dimension(3, 2) :: BCs !! Local subdomain boundary conditions (lower, upper) contains - procedure :: copy_cell2vert_dims ! Copies cell_dims to vert_dims taking periodicity into account - procedure :: copy_vert2cell_dims ! Copies vert_dims to cell_dims taking periodicity into account + procedure :: copy_cell2vert_dims !! Copy cell_dims to vert_dims accounting for periodicity + procedure :: copy_vert2cell_dims !! Copy vert_dims to cell_dims accounting for periodicity end type type :: par_t - !! Stores parallel domain related information - integer :: nrank ! local rank ID - integer :: nproc ! total number of ranks/proc participating in the domain decomposition - integer, dimension(3) :: nrank_dir ! local rank ID in each direction - integer, dimension(3) :: nproc_dir ! total number of proc in each direction - integer, dimension(3) :: n_offset ! number of cells offset in each direction due to domain decomposition - integer, dimension(3) :: pnext ! rank ID of the previous rank in each direction - integer, dimension(3) :: pprev ! rank ID of the next rank in each direction + !! Parallel domain decomposition information type. + !! + !! This type stores all information related to MPI domain decomposition, + !! including rank IDs, processor grid layout, and neighbor communication + !! information for halo exchanges. + integer :: nrank !! Local MPI rank ID (0-based) + integer :: nproc !! Total number of MPI ranks + integer, dimension(3) :: nrank_dir !! Local rank ID in each direction (0-based) + integer, dimension(3) :: nproc_dir !! Number of processors in each direction + integer, dimension(3) :: n_offset !! Cell offset in each direction due to decomposition + integer, dimension(3) :: pnext !! Rank ID of next neighbor in each direction + integer, dimension(3) :: pprev !! 
Rank ID of previous neighbor in each direction contains - procedure :: is_root ! returns if the current rank is the root rank - procedure :: compute_rank_pos_from_global ! fills in pnext, pprev and nrank_dir from global ranks map + procedure :: is_root !! Check if current rank is root (rank 0) + procedure :: compute_rank_pos_from_global !! Compute rank position and neighbors from global map end type contains pure function is_root(self) result(is_root_rank) - !! Returns wether or not the current rank is the root rank - class(par_t), intent(in) :: self - logical :: is_root_rank + !! Check whether the current MPI rank is the root rank. + !! + !! The root rank is defined as rank 0 in the MPI communicator. + class(par_t), intent(in) :: self !! Parallel decomposition object + logical :: is_root_rank !! True if this is rank 0 is_root_rank = (self%nrank == 0) end function pure subroutine compute_rank_pos_from_global(self, global_ranks) - !! From the global rank maps, fills in the rank position as well - !! as the previous and next rank in the `par` structure - - class(par_t), intent(inout) :: self - integer, dimension(:, :, :), intent(in) :: global_ranks + !! Compute rank position and neighbor ranks from global rank map. + !! + !! From the 3D global rank map, this subroutine determines the position + !! of the current rank in the processor grid and identifies the previous + !! and next neighboring ranks in each direction for halo communication. + !! Periodic wrapping is applied for neighbor identification. + class(par_t), intent(inout) :: self !! Parallel decomposition object to update + integer, dimension(:, :, :), intent(in) :: global_ranks !! 3D map of MPI ranks integer, dimension(3) :: subd_pos, subd_pos_prev, subd_pos_next integer :: dir, nproc @@ -102,10 +117,13 @@ pure subroutine compute_rank_pos_from_global(self, global_ranks) end subroutine pure subroutine copy_vert2cell_dims(self, par) - !! Copies vert_dims information to cell_dims taking - !! 
periodicity into account - class(grid_t), intent(inout) :: self - type(par_t), intent(in) :: par + !! Copy vertex dimensions to cell dimensions accounting for periodicity. + !! + !! For periodic boundaries, vertex and cell dimensions are equal. For + !! non-periodic boundaries on the last domain, cell dimensions are one + !! less than vertex dimensions. + class(grid_t), intent(inout) :: self !! Grid object to update + type(par_t), intent(in) :: par !! Parallel decomposition info integer :: dir logical :: is_last_domain @@ -121,10 +139,13 @@ pure subroutine copy_vert2cell_dims(self, par) end subroutine pure subroutine copy_cell2vert_dims(self, par) - !! Copies cell_dims information to vert_dims taking - !! periodicity into account - class(grid_t), intent(inout) :: self - type(par_t), intent(in) :: par + !! Copy cell dimensions to vertex dimensions accounting for periodicity. + !! + !! For periodic boundaries, vertex and cell dimensions are equal. For + !! non-periodic boundaries on the last domain, vertex dimensions are one + !! more than cell dimensions. + class(grid_t), intent(inout) :: self !! Grid object to update + type(par_t), intent(in) :: par !! Parallel decomposition info integer :: dir logical :: is_last_domain @@ -140,10 +161,17 @@ pure subroutine copy_cell2vert_dims(self, par) end subroutine subroutine obtain_coordinates(self, vert_dims, cell_dims, n_offset) - !! Obtains global coordinates for all the vertices and midpoints + !! Compute global coordinates and stretching factors for grid points. + !! + !! This subroutine calculates coordinates at both vertex-centered and + !! cell-centered locations, supporting both uniform and stretched meshes. + !! For stretched meshes, it also computes the stretching factors + !! \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), and \(\frac{d^2\xi}{ds^2}\). implicit none - class(geo_t) :: self - integer, intent(in) :: vert_dims(3), cell_dims(3), n_offset(3) + class(geo_t) :: self !! 
Geometry object to populate + integer, intent(in) :: vert_dims(3) !! Local vertex dimensions + integer, intent(in) :: cell_dims(3) !! Local cell dimensions + integer, intent(in) :: n_offset(3) !! Cell offset due to domain decomposition integer :: dir, i, i_glob real(dp) :: L_inf, alpha, beta, r, const, s, yeta_vt, yeta_mp, coord diff --git a/src/ordering.f90 b/src/ordering.f90 index 19be0a583..0d9a7d466 100644 --- a/src/ordering.f90 +++ b/src/ordering.f90 @@ -1,14 +1,16 @@ module m_ordering + !! Module for index conversion between application storage and Cartesian layouts. + !! + !! This module provides functions to convert between directional "application storage" + !! indices (optimised for cache locality) and Cartesian (i,j,k) indices. The application + !! storage layout arranges data in blocks oriented along a specific direction (X, Y, or Z) + !! to improve memory access patterns during computations. use m_common, only: dp, get_dirs_from_rdr, DIR_X, DIR_Y, DIR_Z, DIR_C implicit none contains - !! - !! "Application storage" stores spatial data with a directionality for better cache locality - !! This set of functions converts indices from this application storage (_dir) to cartesian indices (_ijk) - !! pure subroutine get_index_ijk(i, j, k, dir_i, dir_j, dir_k, dir, & SZ, nx_padded, ny_padded, nz_padded) From cbd2a8624a87a589929570649de53e204be9014b Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 09:20:43 +0000 Subject: [PATCH 02/12] docs: add FORD documentation for core solver components --- src/solver.f90 | 246 ++++++++++++++++++++++++--------- src/tdsops.f90 | 296 ++++++++++++++++++++++++++++++---------- src/time_integrator.f90 | 160 +++++++++++++++++----- src/vector_calculus.f90 | 146 ++++++++++++++------ 4 files changed, 642 insertions(+), 206 deletions(-) diff --git a/src/solver.f90 b/src/solver.f90 index 700f4e912..61342bab8 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -1,4 +1,18 @@ module m_solver + !! 
Main solver module implementing the Incompact3D numerical algorithm. + !! + !! This module provides the high-level solver infrastructure for solving + !! incompressible Navier-Stokes equations using compact finite differences. + !! The solver orchestrates the transport equation (transeq), divergence, + !! Poisson solver, and gradient operations required for the fractional-step + !! projection method. + !! + !! The implementation supports: + !! - Multiple backend executors (CPU/GPU) + !! - Distributed and Thomas algorithm for derivatives + !! - Immersed boundary method (IBM) + !! - Multi-species transport + !! - Various time integration schemes use iso_fortran_env, only: stderr => error_unit use mpi @@ -47,53 +61,65 @@ module m_solver !! method of the allocator can be used to make this field available !! for later use. - real(dp) :: dt, nu - real(dp), dimension(:), allocatable :: nu_species - integer :: n_iters, n_output - integer :: current_iter = 0 - integer :: ngrid - integer :: nvars = 3 - integer :: nspecies = 0 - - class(field_t), pointer :: u, v, w - type(flist_t), dimension(:), pointer :: species => null() - - class(base_backend_t), pointer :: backend - type(mesh_t), pointer :: mesh - type(time_intg_t) :: time_integrator - type(allocator_t), pointer :: host_allocator - type(dirps_t), pointer :: xdirps, ydirps, zdirps - type(vector_calculus_t) :: vector_calculus - type(ibm_t) :: ibm - logical :: ibm_on - procedure(poisson_solver), pointer :: poisson => null() - procedure(transport_equation), pointer :: transeq => null() + real(dp) :: dt !! Time step size + real(dp) :: nu !! Kinematic viscosity + real(dp), dimension(:), allocatable :: nu_species !! Viscosities for multiple species + integer :: n_iters !! Total number of time iterations + integer :: n_output !! Output frequency (every nth iteration) + integer :: current_iter = 0 !! Current iteration number + integer :: ngrid !! Total number of grid points + integer :: nvars = 3 !! 
Number of velocity variables (u,v,w) + integer :: nspecies = 0 !! Number of scalar species to transport + + class(field_t), pointer :: u, v, w !! Velocity field components + type(flist_t), dimension(:), pointer :: species => null() !! Array of scalar species fields + + class(base_backend_t), pointer :: backend !! Backend executor (CPU/GPU) + type(mesh_t), pointer :: mesh !! Computational mesh + type(time_intg_t) :: time_integrator !! Time integration scheme + type(allocator_t), pointer :: host_allocator !! Memory allocator for host arrays + type(dirps_t), pointer :: xdirps, ydirps, zdirps !! Tridiagonal operators in each direction + type(vector_calculus_t) :: vector_calculus !! Vector calculus operations + type(ibm_t) :: ibm !! Immersed boundary method handler + logical :: ibm_on !! Flag to enable/disable IBM + procedure(poisson_solver), pointer :: poisson => null() !! Poisson solver procedure pointer + procedure(transport_equation), pointer :: transeq => null() !! Transport equation solver pointer contains - procedure :: transeq_species - procedure :: pressure_correction - procedure :: divergence_v2p - procedure :: gradient_p2v - procedure :: curl + procedure :: transeq_species !! Compute transport equation for scalar species + procedure :: pressure_correction !! Apply pressure correction to enforce incompressibility + procedure :: divergence_v2p !! Compute divergence of velocity field + procedure :: gradient_p2v !! Compute pressure gradient + procedure :: curl !! Compute curl (vorticity) of velocity field end type solver_t abstract interface subroutine poisson_solver(self, pressure, div_u) + !! Interface for Poisson solver implementations. + !! + !! Solves the Poisson equation \( \nabla^2 p = f \) where f is the + !! divergence of the intermediate velocity field. import :: solver_t import :: field_t implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! 
Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) end subroutine poisson_solver subroutine transport_equation(self, rhs, variables) + !! Interface for transport equation implementations. + !! + !! Computes the right-hand side of the transport equation including + !! convection, diffusion, and any source terms. The momentum equations are: + !! \[ \frac{\partial \mathbf{u}}{\partial t} + (\mathbf{u} \cdot \nabla)\mathbf{u} = -\nabla p + \nu \nabla^2 \mathbf{u} \] import :: solver_t import :: flist_t implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:), variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side terms (output) + type(flist_t), intent(inout) :: variables(:) !! Field variables (velocity components) end subroutine transport_equation end interface @@ -104,12 +130,25 @@ end subroutine transport_equation contains function init(backend, mesh, host_allocator) result(solver) + !! Initialise the solver with backend, mesh, and configuration. + !! + !! This function sets up the complete solver infrastructure including: + !! - Velocity field allocation (u, v, w) + !! - Tridiagonal operators for each direction (xdirps, ydirps, zdirps) + !! - Time integrator + !! - Poisson solver (FFT or CG) + !! - Transport equation solver (default or low-memory variant) + !! - Optional scalar species transport + !! - Optional immersed boundary method (IBM) + !! + !! All configuration is read from the namelist file specified as the first + !! command-line argument. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(solver_t) :: solver + class(base_backend_t), target, intent(inout) :: backend !! Backend executor (CPU/GPU) + type(mesh_t), target, intent(inout) :: mesh !! Computational mesh + type(allocator_t), target, intent(inout) :: host_allocator !! 
Host memory allocator + type(solver_t) :: solver !! Initialised solver object type(solver_config_t) :: solver_cfg integer :: i @@ -208,11 +247,22 @@ end function init subroutine allocate_tdsops(dirps, backend, mesh, der1st_scheme, & der2nd_scheme, interpl_scheme, stagder_scheme) - type(dirps_t), intent(inout) :: dirps - class(base_backend_t), intent(in) :: backend - type(mesh_t), intent(in) :: mesh - character(*), intent(in) :: der1st_scheme, der2nd_scheme, & - interpl_scheme, stagder_scheme + !! Allocate and initialise tridiagonal operators for a given direction. + !! + !! This subroutine creates the compact finite difference operators needed for: + !! - First derivatives (der1st) + !! - Second derivatives (der2nd) + !! - Interpolation (interpl) + !! - Staggered derivatives (stagder) + !! + !! Boundary conditions are determined from the mesh periodicity flags. + type(dirps_t), intent(inout) :: dirps !! Direction-specific operator set + class(base_backend_t), intent(in) :: backend !! Backend executor + type(mesh_t), intent(in) :: mesh !! Computational mesh + character(*), intent(in) :: der1st_scheme !! First derivative scheme name + character(*), intent(in) :: der2nd_scheme !! Second derivative scheme name + character(*), intent(in) :: interpl_scheme !! Interpolation scheme name + character(*), intent(in) :: stagder_scheme !! Staggered derivative scheme name integer :: dir, bc_start, bc_end, bc_mp_start, bc_mp_end, n_vert, n_cell, i real(dp) :: d @@ -282,15 +332,23 @@ subroutine allocate_tdsops(dirps, backend, mesh, der1st_scheme, & end subroutine subroutine transeq_default(self, rhs, variables) - !! Skew-symmetric form of convection-diffusion terms in the - !! incompressible Navier-Stokes momemtum equations, excluding - !! pressure terms. - !! Inputs from velocity grid and outputs to velocity grid. + !! Compute transport equation RHS using default (high-memory) algorithm. + !! + !! Evaluates the skew-symmetric form of convection-diffusion terms in the + !! 
incompressible Navier-Stokes momentum equations, excluding pressure: + !! \[ RHS = -(\mathbf{u} \cdot \nabla)\mathbf{u} + \nu \nabla^2 \mathbf{u} \] + !! + !! Uses skew-symmetric formulation for numerical stability: + !! \[ (\mathbf{u} \cdot \nabla)\mathbf{u} = \frac{1}{2}[(\mathbf{u} \cdot \nabla)\mathbf{u} + \nabla \cdot (\mathbf{u}\mathbf{u})] \] + !! + !! This version stores intermediate results for all velocity components, + !! providing better performance at the cost of higher memory usage. + !! Both inputs and outputs are on the velocity (vertex) grid. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(inout) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side output (du/dt, dv/dt, dw/dt) + type(flist_t), intent(inout) :: variables(:) !! Velocity components (u, v, w) class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, & du_y, dv_y, dw_y, du_z, dv_z, dw_z, & @@ -382,12 +440,20 @@ subroutine transeq_default(self, rhs, variables) end subroutine transeq_default subroutine transeq_lowmem(self, rhs, variables) - !! low memory version of the transport equation, roughly %2 slower overall + !! Compute transport equation RHS using low-memory algorithm. + !! + !! Evaluates the same skew-symmetric form as transeq_default but with + !! reduced memory footprint by reusing field storage. This approach is + !! approximately 2% slower but uses significantly less memory, which can + !! be important for large simulations or GPU implementations with limited + !! memory. + !! + !! See transeq_default for the mathematical formulation. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(inout) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side output (du/dt, dv/dt, dw/dt) + type(flist_t), intent(inout) :: variables(:) !! 
Velocity components (u, v, w) class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, & du_y, dv_y, dw_y, du_z, dv_z, dw_z, du, dv, dw, u, v, w @@ -498,14 +564,20 @@ subroutine transeq_lowmem(self, rhs, variables) end subroutine transeq_lowmem subroutine transeq_species(self, rhs, variables) - !! Skew-symmetric form of convection-diffusion terms in the - !! species equation. - !! Inputs from velocity grid and outputs to velocity grid. + !! Compute transport equation for passive scalar species. + !! + !! Evaluates the convection-diffusion equation for transported scalars: + !! \[ \frac{\partial \phi}{\partial t} + (\mathbf{u} \cdot \nabla)\phi = \nu_\phi \nabla^2 \phi \] + !! + !! where \( \phi \) represents each scalar species, \( \nu_\phi \) is the + !! species diffusivity. Uses skew-symmetric form similar to momentum equations. + !! Velocity field must be available in self%u, self%v, self%w. + !! Both inputs and outputs are on the velocity (vertex) grid. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(in) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side for species equations + type(flist_t), intent(in) :: variables(:) !! Scalar species fields integer :: i class(field_t), pointer :: u, v, w, & @@ -594,12 +666,18 @@ subroutine transeq_species(self, rhs, variables) end subroutine transeq_species subroutine divergence_v2p(self, div_u, u, v, w) - !! Wrapper for divergence_v2p + !! Compute divergence of velocity field from vertex to cell centers. + !! + !! Calculates \( \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + \frac{\partial v}{\partial y} + \frac{\partial w}{\partial z} \) + !! using staggered derivatives and interpolation operators. The input velocity + !! components are on the vertex grid and the output divergence is on the cell-centered grid. + !! + !! For incompressible flow, this should be zero (up to numerical errors). 
implicit none class(solver_t) :: self - class(field_t), intent(inout) :: div_u - class(field_t), intent(in) :: u, v, w + class(field_t), intent(inout) :: div_u !! Velocity divergence (output, cell-centered) + class(field_t), intent(in) :: u, v, w !! Velocity components (input, vertex-centered) call self%vector_calculus%divergence_v2c( & div_u, u, v, w, & @@ -611,12 +689,19 @@ subroutine divergence_v2p(self, div_u, u, v, w) end subroutine divergence_v2p subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) - !! Wrapper for gradient_p2v + !! Compute pressure gradient from cell centers to vertices. + !! + !! Calculates the pressure gradient components: + !! \[ \nabla p = \left( \frac{\partial p}{\partial x}, \frac{\partial p}{\partial y}, \frac{\partial p}{\partial z} \right) \] + !! + !! using staggered derivatives and interpolation operators. The input pressure + !! is on the cell-centered grid and the output gradient components are on the vertex grid. + !! This is used in the pressure correction step of the fractional-step method. implicit none class(solver_t) :: self - class(field_t), intent(inout) :: dpdx, dpdy, dpdz - class(field_t), intent(in) :: pressure + class(field_t), intent(inout) :: dpdx, dpdy, dpdz !! Pressure gradient components (vertex-centered) + class(field_t), intent(in) :: pressure !! Pressure field (cell-centered) call self%vector_calculus%gradient_c2v( & dpdx, dpdy, dpdz, pressure, & @@ -628,7 +713,13 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) end subroutine gradient_p2v subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) - !! Wrapper for curl + !! Compute curl (vorticity) of the velocity field. + !! + !! Calculates the curl of velocity: + !! \[ \boldsymbol{\omega} = \nabla \times \mathbf{u} = \left( \frac{\partial w}{\partial y} - \frac{\partial v}{\partial z}, \frac{\partial u}{\partial z} - \frac{\partial w}{\partial x}, \frac{\partial v}{\partial x} - \frac{\partial u}{\partial y} \right) \] + !! + !! 
All fields are on the vertex grid. This is primarily used for + !! post-processing and visualisation of vorticity. implicit none class(solver_t) :: self @@ -644,11 +735,23 @@ subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) end subroutine curl subroutine poisson_fft(self, pressure, div_u) + !! Solve Poisson equation using Fast Fourier Transform method. + !! + !! Solves \( \nabla^2 p = f \) where f is the velocity divergence, + !! using FFT-based spectral method. This is very efficient for periodic + !! or Neumann boundary conditions and is the default/recommended solver. + !! + !! The solution process involves: + !! 1. Transform to 3D Cartesian data structure + !! 2. Apply FFT in periodic/Neumann directions + !! 3. Solve in spectral space + !! 4. Inverse FFT back to physical space + !! 5. Transform back to pencil decomposition implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) class(field_t), pointer :: p_temp, temp @@ -671,11 +774,17 @@ subroutine poisson_fft(self, pressure, div_u) end subroutine poisson_fft subroutine poisson_cg(self, pressure, div_u) + !! Solve Poisson equation using Conjugate Gradient method. + !! + !! This is a placeholder for iterative Poisson solver using CG method. + !! Currently sets pressure to zero for performance testing. + !! Will be fully implemented for cases where FFT is not suitable + !! (e.g., complex geometries or Dirichlet boundary conditions). implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) ! set the pressure field to 0 so that we can do performance tests easily ! 
this will be removed once the CG solver is implemented of course @@ -684,10 +793,19 @@ subroutine poisson_cg(self, pressure, div_u) end subroutine poisson_cg subroutine pressure_correction(self, u, v, w) + !! Apply pressure correction to enforce incompressibility constraint. + !! + !! Implements the projection step of the fractional-step method: + !! 1. Compute divergence of intermediate velocity: \( \nabla \cdot \mathbf{u}^* \) + !! 2. Solve Poisson equation: \( \nabla^2 p = \frac{1}{\Delta t} \nabla \cdot \mathbf{u}^* \) + !! 3. Correct velocity: \( \mathbf{u}^{n+1} = \mathbf{u}^* - \Delta t \nabla p \) + !! + !! After correction, the velocity field is divergence-free (incompressible). + !! If IBM is active, IBM forcing is applied after pressure correction. implicit none class(solver_t) :: self - class(field_t), intent(inout) :: u, v, w + class(field_t), intent(inout) :: u, v, w !! Velocity components (corrected in-place) class(field_t), pointer :: div_u, pressure, dpdx, dpdy, dpdz diff --git a/src/tdsops.f90 b/src/tdsops.f90 index 40623711a..cd492979f 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -1,4 +1,25 @@ module m_tdsops + !! Tridiagonal solver operators for compact finite differences. + !! + !! This module provides preprocessed tridiagonal operator arrays for + !! solving compact finite difference schemes. It supports both distributed + !! and Thomas algorithm implementations for computing: + !! - First and second derivatives + !! - Interpolation between vertex and cell-centre grids + !! - Staggered derivatives + !! + !! The operators are preprocessed based on: + !! - Grid spacing and optional stretching + !! - Boundary conditions (periodic, Neumann, Dirichlet) + !! - Numerical scheme (compact schemes of various orders) + !! - Symmetry properties for free-slip boundaries + !! + !! The distributed algorithm is designed for parallel execution and consists of: + !! 1. Forward/backward elimination phase (dist_fw, dist_bw) + !! 2. 
Back-substitution phase (dist_sa, dist_sc) + !! + !! The Thomas algorithm (thom_*) is used for serial execution or + !! when the distributed approach is not suitable. use iso_fortran_env, only: stderr => error_unit use m_common, only: dp, pi, VERT, CELL, & @@ -24,21 +45,35 @@ module m_tdsops !! This class does not know about the current rank or its relative !! location among other ranks. All the operator arrays here are used when !! executing a distributed tridiagonal solver phase one or two. - real(dp), allocatable, dimension(:) :: dist_fw, dist_bw, & !! fw/bw phase - dist_sa, dist_sc, & !! back subs. - dist_af !! the auxiliary factors - real(dp), allocatable, dimension(:) :: thom_f, thom_s, thom_w, thom_p - real(dp), allocatable :: stretch(:), stretch_correct(:) - real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :) - real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp !! Compact scheme coeffs - logical :: periodic - integer :: n_tds !! Tridiagonal system size - integer :: n_rhs !! Right-hand-side builder size - integer :: move = 0 !! move between vertices and cell centres - integer :: n_halo !! number of halo points + real(dp), allocatable, dimension(:) :: dist_fw !! Forward elimination coefficients (distributed) + real(dp), allocatable, dimension(:) :: dist_bw !! Backward elimination coefficients (distributed) + real(dp), allocatable, dimension(:) :: dist_sa !! Back-substitution coefficients A (distributed) + real(dp), allocatable, dimension(:) :: dist_sc !! Back-substitution coefficients C (distributed) + real(dp), allocatable, dimension(:) :: dist_af !! Auxiliary factors (distributed) + real(dp), allocatable, dimension(:) :: thom_f !! Forward elimination factors (Thomas) + real(dp), allocatable, dimension(:) :: thom_s !! Scaling factors (Thomas) + real(dp), allocatable, dimension(:) :: thom_w !! Work array (Thomas) + real(dp), allocatable, dimension(:) :: thom_p !! Precomputed products (Thomas) + real(dp), allocatable :: stretch(:) !! 
Grid stretching coefficients + real(dp), allocatable :: stretch_correct(:) !! Stretch correction for 2nd derivatives + real(dp), allocatable :: coeffs(:) !! RHS builder coefficients (interior) + real(dp), allocatable :: coeffs_s(:, :) !! RHS builder coefficients (start boundary) + real(dp), allocatable :: coeffs_e(:, :) !! RHS builder coefficients (end boundary) + real(dp) :: alpha !! Compact scheme coefficient (LHS) + real(dp) :: a, b !! Compact scheme coefficients (RHS) + real(dp) :: c = 0._dp, d = 0._dp !! Extended compact scheme coefficients + logical :: periodic !! Periodic boundary condition flag + integer :: n_tds !! Tridiagonal system size + integer :: n_rhs !! Right-hand-side builder size + integer :: move = 0 !! Offset for vertex/cell-centre conversion + integer :: n_halo !! Number of halo points contains - procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st - procedure :: preprocess_dist, preprocess_thom + procedure :: deriv_1st !! Set up first derivative operator + procedure :: deriv_2nd !! Set up second derivative operator + procedure :: interpl_mid !! Set up interpolation operator + procedure :: stagder_1st !! Set up staggered derivative operator + procedure :: preprocess_dist !! Preprocess for distributed algorithm + procedure :: preprocess_thom !! Preprocess for Thomas algorithm end type tdsops_t interface tdsops_t @@ -49,10 +84,21 @@ module m_tdsops !! Directional tridiagonal solver container. !! !! This class contains the preprocessed tridiagonal solvers for operating - !! in each coordinate direction. - class(tdsops_t), allocatable :: der1st, der1st_sym, der2nd, der2nd_sym, & - stagder_v2p, stagder_p2v, interpl_v2p, interpl_p2v - integer :: dir + !! in a specific coordinate direction (x, y, or z). Each direction requires + !! different operators for: + !! - Regular and symmetric first derivatives + !! - Regular and symmetric second derivatives + !! - Staggered derivatives (vertex-to-cell and cell-to-vertex) + !! 
- Interpolation (vertex-to-cell and cell-to-vertex) + class(tdsops_t), allocatable :: der1st !! First derivative operator + class(tdsops_t), allocatable :: der1st_sym !! Symmetric first derivative operator + class(tdsops_t), allocatable :: der2nd !! Second derivative operator + class(tdsops_t), allocatable :: der2nd_sym !! Symmetric second derivative operator + class(tdsops_t), allocatable :: stagder_v2p !! Staggered derivative (vertex to cell) + class(tdsops_t), allocatable :: stagder_p2v !! Staggered derivative (cell to vertex) + class(tdsops_t), allocatable :: interpl_v2p !! Interpolation (vertex to cell) + class(tdsops_t), allocatable :: interpl_p2v !! Interpolation (cell to vertex) + integer :: dir !! Direction index (DIR_X, DIR_Y, DIR_Z) end type dirps_t contains @@ -61,44 +107,57 @@ function tdsops_init( & n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) result(tdsops) - !! Constructor function for the tdsops_t class. + !! Initialise and construct a tridiagonal operator. !! - !! 'n_tds', 'delta', 'operation', 'scheme', 'bc_start', and 'bc_end' are - !! necessary arguments. The remaining arguments are optional. + !! This function creates a preprocessed tridiagonal operator for compact + !! finite difference operations. Required arguments are 'n_tds', 'delta', + !! 'operation', 'scheme', 'bc_start', and 'bc_end'. Optional arguments + !! enable stretched grids, staggered operations, and boundary condition tuning. !! - !! 'stretch' is for obtaining the correct derivations in a stretched mesh - !! 'stretch_correct' is for correcting the second derivative with the first + !! **Operation types:** + !! - 'first-deriv': First derivative \( \frac{\partial f}{\partial x} \) + !! - 'second-deriv': Second derivative \( \frac{\partial^2 f}{\partial x^2} \) + !! - 'interpolate': Interpolation between grids + !! - 'stag-deriv': Staggered derivative (vertex ↔ cell) !! - !! 
'from_to' is necessary for interpolation and staggared derivative, and - !! it can be 'v2p' or 'p2v'. - !! If the specific region the instance is operating is not a boundary - !! region, then 'bc_start' and 'bc_end' are BC_HALO. + !! **Boundary conditions:** + !! - BC_PERIODIC: Periodic boundaries + !! - BC_NEUMANN: Neumann (zero gradient) boundaries + !! - BC_DIRICHLET: Dirichlet (fixed value) boundaries !! - !! 'sym' is relevant when the BC is free-slip. If sym is .true. then it - !! means the field we operate on is assumed to be an even function - !! (symmetric, cos type) accross the boundary. If it is .false. it means - !! the field is assumed to be an odd function (anti-symmetric, sin type). + !! **Optional stretched grid support:** + !! 'stretch' provides stretching coefficients for non-uniform grids. + !! 'stretch_correct' applies correction for second derivatives on stretched grids. !! - !! 'c_nu', 'nu0_nu' are relevant when operation is second order - !! derivative and scheme is compact6-hyperviscous. + !! **Staggered operations:** + !! 'from_to' specifies direction: 'v2p' (vertex-to-cell) or 'p2v' (cell-to-vertex) + !! + !! **Symmetry for free-slip boundaries:** + !! 'sym' determines field symmetry at Neumann boundaries: + !! - .true. = symmetric (cos-type, even function) + !! - .false. = anti-symmetric (sin-type, odd function) + !! + !! **Hyperviscosity parameters:** + !! 'c_nu' and 'nu0_nu' are used for compact6-hyperviscous second derivatives implicit none - type(tdsops_t) :: tdsops !! return value of the function - - integer, intent(in) :: n_tds !! Tridiagonal system size - real(dp), intent(in) :: delta !! Grid spacing - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end !! Boundary Cond. - real(dp), optional, intent(in) :: stretch(:) !! Stretching coefficients - real(dp), optional, intent(in) :: stretch_correct(:) !! Stretch correction - integer, optional, intent(in) :: n_halo !! 
Number of halo cells - character(*), optional, intent(in) :: from_to !! 'v2p' or 'p2v' - logical, optional, intent(in) :: sym !! (==npaire), only for Neumann BCs - real(dp), optional, intent(in) :: c_nu, nu0_nu !! params for hypervisc. + type(tdsops_t) :: tdsops !! Constructed tridiagonal operator + + integer, intent(in) :: n_tds !! Tridiagonal system size + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation !! Operation type + character(*), intent(in) :: scheme !! Numerical scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + real(dp), optional, intent(in) :: stretch(:) !! Grid stretching coefficients + real(dp), optional, intent(in) :: stretch_correct(:) !! Stretch correction + integer, optional, intent(in) :: n_halo !! Number of halo cells + character(*), optional, intent(in) :: from_to !! Staggering: 'v2p' or 'p2v' + logical, optional, intent(in) :: sym !! Symmetry for Neumann BCs + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Hyperviscosity parameters #ifdef SINGLE_PREC - real(dp) :: tol = 1e-12 + real(dp) :: tol = 1e-12 !! Tolerance for checking small coefficients in single precision #else - real(dp) :: tol = 1e-16 + real(dp) :: tol = 1e-16 !! Tolerance for checking small coefficients in double precision #endif integer :: n, n_stencil @@ -197,13 +256,28 @@ function tdsops_init( & end function tdsops_init subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) + !! Set up first derivative operator. + !! + !! Configures the compact finite difference operator for computing first + !! derivatives \( \frac{\partial f}{\partial x} \). Supports various compact + !! schemes with different orders of accuracy: + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order accuracy + !! - 'compact6-exp': 6th-order with exponential profile + !! - 'compact6-hyp': 6th-order with hyperbolic profile + !! + !! The operator is built for the tridiagonal system: + !! 
\[ \alpha f'_{i-1} + f'_i + \alpha f'_{i+1} = a \frac{f_{i+1} - f_{i-1}}{2\Delta x} + b \frac{f_{i+2} - f_{i-2}}{4\Delta x} \] + !! + !! Boundary conditions modify the stencil near domain boundaries. implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! Scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, afi, bfi @@ -344,14 +418,30 @@ end subroutine deriv_1st subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & c_nu, nu0_nu) + !! Set up second derivative operator. + !! + !! Configures the compact finite difference operator for computing second + !! derivatives \( \frac{\partial^2 f}{\partial x^2} \). Supports various compact + !! schemes with different orders of accuracy and optional hyperviscosity. + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order accuracy + !! - 'compact6-hyperviscous': 6th-order with selective hyperviscosity + !! + !! The operator is built for the tridiagonal system: + !! \[ \alpha f''_{i-1} + f''_i + \alpha f''_{i+1} = a \frac{f_{i+1} - 2f_i + f_{i-1}}{\Delta x^2} + b \frac{f_{i+2} - 2f_i + f_{i-2}}{4\Delta x^2} \] + !! + !! **Hyperviscosity:** Optional 'c_nu' and 'nu0_nu' parameters enable selective + !! damping of high-frequency modes for numerical stability. implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(tdsops_t), intent(inout) :: self !! 
Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! Scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs + real(dp), optional, intent(in) :: c_nu !! Hyperviscosity coefficient + real(dp), optional, intent(in) :: nu0_nu !! Hyperviscosity parameter real(dp), allocatable :: dist_b(:) real(dp) :: alpha, asi, bsi, csi, dsi @@ -556,12 +646,29 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & end subroutine deriv_2nd subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) + !! Set up interpolation operator between vertex and cell grids. + !! + !! Configures the compact interpolation operator for transferring data + !! between staggered grids (vertex-centred ↔ cell-centred). Uses compact + !! schemes for high-order accuracy. + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order interpolation + !! - 'classic': Classical 2nd-order interpolation + !! + !! **Direction:** + !! - 'v2p': Vertex to cell-centre (pressure point) + !! - 'p2v': Cell-centre to vertex + !! + !! The interpolation is critical for maintaining consistency between + !! velocity and pressure grids in staggered arrangements. implicit none - class(tdsops_t), intent(inout) :: self - character(*), intent(in) :: scheme, from_to - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + character(*), intent(in) :: scheme !! Interpolation scheme name + character(*), intent(in) :: from_to !! Direction: 'v2p' or 'p2v' + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! 
Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, aici, bici, cici, dici @@ -702,13 +809,32 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) end subroutine interpl_mid subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) + !! Set up staggered first derivative operator. + !! + !! Configures the compact operator for computing first derivatives on + !! staggered grids, where the derivative is computed at a different grid + !! location than the input data. + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order staggered derivative + !! - 'classic': Classical 2nd-order staggered derivative + !! + !! **Direction:** + !! - 'v2p': Derivative from vertex grid to cell-centre grid + !! - 'p2v': Derivative from cell-centre grid to vertex grid + !! + !! Staggered derivatives are essential for: + !! - Computing divergence and gradient on staggered grids + !! - Maintaining numerical stability in pressure-velocity coupling + !! - Accurate representation of boundary conditions implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme, from_to - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! Scheme name + character(*), intent(in) :: from_to !! Direction: 'v2p' or 'p2v' + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, aci, bci @@ -810,11 +936,23 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) end subroutine stagder_1st subroutine preprocess_dist(self, dist_b) + !! Preprocess tridiagonal system for distributed algorithm. + !! + !! 
This subroutine preprocesses the tridiagonal matrix coefficients for
+    !! use in the distributed (parallel) tridiagonal solver algorithm. The
+    !! preprocessing follows Algorithm 3 from the reference:
+    !! DOI: 10.1109/MCSE.2021.3130544
+    !!
+    !! The distributed algorithm consists of two phases:
+    !! 1. **Forward/backward elimination**: Reduces the system in parallel subdomains
+    !! 2. **Back-substitution**: Applies corrections from neighbouring ranks
+    !!
+    !! This preprocessing computes the coefficients (dist_fw, dist_bw, dist_sa,
+    !! dist_sc, dist_af) needed for both phases, enabling efficient parallel execution.
     implicit none
-    class(tdsops_t), intent(inout) :: self
-
-    real(dp), dimension(:), intent(in) :: dist_b
+    class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place)
+    real(dp), dimension(:), intent(in) :: dist_b !! Diagonal coefficients of tridiagonal system
 
     integer :: i
 
@@ -869,10 +1007,24 @@ subroutine preprocess_dist(self, dist_b)
   end subroutine preprocess_dist
 
   subroutine preprocess_thom(self, b)
+    !! Preprocess tridiagonal system for Thomas algorithm.
+    !!
+    !! This subroutine preprocesses the tridiagonal matrix coefficients for
+    !! use in the Thomas algorithm (serial tridiagonal solver). The Thomas
+    !! algorithm is a simplified form of Gaussian elimination optimised for
+    !! tridiagonal systems.
+    !!
+    !! The preprocessing performs forward elimination on the coefficients:
+    !! \( c'_i = c_i / (b_i - a_i \cdot c'_{i-1}) \)
+    !! \( d'_i = (d_i - a_i \cdot d'_{i-1}) / (b_i - a_i \cdot c'_{i-1}) \)
+    !!
+    !! This enables efficient back-substitution during the solve phase. This
+    !! algorithm is used within individual MPI ranks when the distributed
+    !! algorithm is employed, or for the entire domain in serial execution.
     implicit none
-    class(tdsops_t), intent(inout) :: self
-    real(dp), dimension(:), intent(in) :: b
+    class(tdsops_t), intent(inout) :: self !! 
Tridiagonal operator (modified in-place) + real(dp), dimension(:), intent(in) :: b !! Diagonal coefficients of tridiagonal system integer :: i, n diff --git a/src/time_integrator.f90 b/src/time_integrator.f90 index 0ac159246..97cc61ae9 100644 --- a/src/time_integrator.f90 +++ b/src/time_integrator.f90 @@ -1,4 +1,26 @@ module m_time_integrator + !! Time integration schemes for temporal advancement. + !! + !! This module provides explicit time integration methods for advancing + !! solutions in time. It supports two families of schemes: + !! + !! **1. Runge-Kutta (RK) Methods** + !! Multi-stage schemes that achieve high-order accuracy within a single + !! timestep. Supported orders: RK1 (Euler), RK2, RK3, RK4. Each stage + !! requires an evaluation of the right-hand side (derivative). + !! + !! **2. Adams-Bashforth (AB) Methods** + !! Multi-step schemes that use derivative information from previous + !! timesteps to achieve high-order accuracy. Supported orders: AB1, AB2, + !! AB3, AB4. These methods are more memory-efficient than RK schemes + !! for the same order of accuracy. + !! + !! The time_intg_t type encapsulates all integration state and provides + !! a unified interface through the step procedure pointer, which routes + !! to either runge_kutta() or adams_bashforth() based on the selected method. + !! + !! Old timestep/stage data is stored in the `olds` array and managed + !! automatically through rotation mechanisms for AB methods. 
use m_allocator, only: allocator_t use m_base_backend, only: base_backend_t use m_common, only: dp, DIR_X @@ -9,19 +31,26 @@ module m_time_integrator private adams_bashforth, runge_kutta type :: time_intg_t - integer :: method, istep, istage, order, nstep, nstage, nvars, nolds - real(dp) :: coeffs(4, 4) - real(dp) :: rk_b(4, 4) - real(dp) :: rk_a(3, 3, 4) - character(len=3) :: sname - type(flist_t), allocatable :: olds(:, :) - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - procedure(stepper_func), pointer :: step => null() + integer :: method !! Integration method identifier (unused, kept for compatibility) + integer :: istep !! Current timestep number (for AB startup ramping) + integer :: istage !! Current stage number within timestep (RK only) + integer :: order !! Order of accuracy of the scheme (1-4) + integer :: nstep !! Number of timesteps needed (AB: order, RK: 1) + integer :: nstage !! Number of stages per timestep (AB: 1, RK: order) + integer :: nvars !! Number of variables being integrated + integer :: nolds !! Number of old derivatives/solutions to store + real(dp) :: coeffs(4, 4) !! Adams-Bashforth coefficients [stage, order] + real(dp) :: rk_b(4, 4) !! Runge-Kutta final weights [stage, order] + real(dp) :: rk_a(3, 3, 4) !! Runge-Kutta stage weights [from_stage, to_stage, order] + character(len=3) :: sname !! Scheme name (e.g., 'AB3', 'RK4') + type(flist_t), allocatable :: olds(:, :) !! Old derivatives/solutions [nvars, nolds] + class(base_backend_t), pointer :: backend !! Computational backend for operations + class(allocator_t), pointer :: allocator !! Memory allocator for field storage + procedure(stepper_func), pointer :: step => null() !! Function pointer to integration method contains - procedure :: finalize - procedure :: runge_kutta - procedure :: adams_bashforth + procedure :: finalize !! Clean up and release allocated memory + procedure :: runge_kutta !! 
Runge-Kutta time integration implementation + procedure :: adams_bashforth !! Adams-Bashforth time integration implementation end type time_intg_t interface time_intg_t @@ -30,25 +59,34 @@ module m_time_integrator abstract interface subroutine stepper_func(self, curr, deriv, dt) + !! Abstract interface for time stepping functions. + !! + !! Defines the signature for integration methods (RK or AB). + !! Each method takes the current solution, its derivative, and + !! the timestep size, and updates the solution accordingly. import :: time_intg_t import :: dp import :: flist_t implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution variables [nvars] + type(flist_t), intent(in) :: deriv(:) !! Time derivatives of variables [nvars] + real(dp), intent(in) :: dt !! Timestep size end subroutine stepper_func end interface contains subroutine finalize(self) + !! Finalise time integrator and release allocated resources. + !! + !! Releases all field storage blocks used for storing old derivatives + !! or stage solutions, and deallocates the olds array. implicit none !type(time_intg_t), intent(inout) :: self - class(time_intg_t), intent(inout) :: self + class(time_intg_t), intent(inout) :: self !! Time integrator to finalise integer :: i, j @@ -67,13 +105,32 @@ subroutine finalize(self) end subroutine finalize function init(backend, allocator, method, nvars) + !! Initialise time integrator with specified method and coefficients. + !! + !! This constructor configures the time integration scheme based on the + !! method string (e.g., 'AB3' or 'RK4'). It initialises all Runge-Kutta + !! and Adams-Bashforth coefficients for orders 1-4, then selects the + !! appropriate method and allocates storage for old derivatives or stages. + !! + !! 
**Supported Methods:** + !! - AB1, AB2, AB3, AB4: Adams-Bashforth (explicit multi-step) + !! - RK1, RK2, RK3, RK4: Runge-Kutta (explicit multi-stage) + !! + !! **RK Coefficients (Butcher tableau):** + !! - RK1: Forward Euler + !! - RK2: Midpoint method + !! - RK3: Strong Stability Preserving RK3 (SSP-RK3) + !! - RK4: Classical fourth-order Runge-Kutta + !! + !! **AB Coefficients:** + !! Derived from polynomial extrapolation of previous derivatives. implicit none - type(time_intg_t) :: init - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - character(3), intent(in) :: method - integer, intent(in) :: nvars + type(time_intg_t) :: init !! Initialised time integrator + class(base_backend_t), pointer :: backend !! Computational backend + class(allocator_t), pointer :: allocator !! Memory allocator + character(3), intent(in) :: method !! Integration method ('AB3', 'RK4', etc.) + integer, intent(in) :: nvars !! Number of variables to integrate integer :: i, j, stat @@ -160,12 +217,27 @@ function init(backend, allocator, method, nvars) end function init subroutine runge_kutta(self, curr, deriv, dt) + !! Advance solution using Runge-Kutta method. + !! + !! Implements explicit Runge-Kutta schemes of orders 1-4. The general + !! form for an s-stage RK method is: + !! + !! \[ k_i = f(t_n + c_i \Delta t, u_n + \Delta t \sum_{j=1}^{i-1} a_{ij} k_j) \] + !! \[ u_{n+1} = u_n + \Delta t \sum_{i=1}^{s} b_i k_i \] + !! + !! Where \( k_i \) are stage derivatives, \( a_{ij} \) are stage weights, + !! and \( b_i \) are final combination weights. This implementation stores + !! stage derivatives in `olds(:, 2:nstage+1)` and the initial solution in + !! `olds(:, 1)`. + !! + !! The subroutine is called once per stage. When `istage == nstage`, it + !! computes the final solution and resets the stage counter. 
implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution (updated) + type(flist_t), intent(in) :: deriv(:) !! Stage derivative + real(dp), intent(in) :: dt !! Timestep size integer :: i, j @@ -219,12 +291,27 @@ subroutine runge_kutta(self, curr, deriv, dt) end subroutine runge_kutta subroutine adams_bashforth(self, curr, deriv, dt) + !! Advance solution using Adams-Bashforth method. + !! + !! Implements explicit Adams-Bashforth schemes of orders 1-4. These + !! multi-step methods use derivatives from previous timesteps: + !! + !! \[ u_{n+1} = u_n + \Delta t \sum_{i=0}^{s-1} b_i f_{n-i} \] + !! + !! Where \( f_{n-i} \) are stored derivatives from previous steps and + !! \( b_i \) are the Adams-Bashforth coefficients. The method has an + !! automatic startup phase: for the first `order` steps, it uses a + !! lower-order scheme (e.g., AB2 uses AB1 on step 1, then AB2 on step 2+). + !! + !! Old derivatives are stored in `olds(:, 1:nstep-1)` and rotated after + !! each step. The current derivative is used directly and then stored + !! in `olds(:, 1)` for the next timestep. implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution (updated) + type(flist_t), intent(in) :: deriv(:) !! Current time derivative + real(dp), intent(in) :: dt !! Timestep size integer :: i, j integer :: nstep @@ -266,10 +353,19 @@ subroutine adams_bashforth(self, curr, deriv, dt) end subroutine adams_bashforth subroutine rotate(sol, n) + !! Rotate pointer array for Adams-Bashforth old derivatives. + !! + !! 
Shifts pointers in the array to make room for a new derivative:
+    !! sol(i) <- sol(i-1) for i from n down to 2, and sol(1) gets the
+    !! old sol(n). This implements a circular buffer for old derivatives
+    !! without copying data - only pointers are reassigned.
+    !!
+    !! Example for n=3: [new, old1, old2] becomes [old2, new, old1];
+    !! the slot now holding old2 is then overwritten with the newest derivative.
     implicit none
 
-    type(flist_t), intent(inout) :: sol(:)
-    integer, intent(in) :: n
+    type(flist_t), intent(inout) :: sol(:) !! Array of field list pointers to rotate
+    integer, intent(in) :: n !! Number of elements to rotate
 
     integer :: i
     class(field_t), pointer :: ptr
 
diff --git a/src/vector_calculus.f90 b/src/vector_calculus.f90
index cf1a1da4d..324f482e4 100644
--- a/src/vector_calculus.f90
+++ b/src/vector_calculus.f90
@@ -1,4 +1,28 @@
 module m_vector_calculus
+  !! Vector calculus operators for finite-difference CFD.
+  !!
+  !! This module provides implementations of fundamental differential operators
+  !! (divergence, gradient, curl, Laplacian) on staggered and collocated grids.
+  !! All operators are built using high-order compact finite-difference schemes
+  !! from the tdsops module.
+  !!
+  !! **Key Features:**
+  !! - **Staggered grid support**: Operators handle transitions between cell centres
+  !! (CELL) and vertices (VERT) through staggered derivatives and interpolation
+  !! - **Data reordering**: Automatically manages pencil decomposition, reordering
+  !! fields between X, Y, Z orientations as needed for derivatives
+  !! - **Memory efficiency**: Uses allocator blocks for temporary fields with
+  !! careful release management to minimise memory footprint
+  !!
+  !! **Grid Conventions:**
+  !! - CELL (data_loc=CELL): Variables stored at cell centres (e.g., pressure)
+  !! - VERT (data_loc=VERT): Variables stored at cell vertices (e.g., velocity)
+  !! - Staggered operators (v2c, c2v) transition between these locations
+  !!
+  !! **Data Layouts:**
+  !! 
- DIR_X: Pencil decomposed in X direction (default for most operations) + !! - DIR_Y: Pencil decomposed in Y direction (for Y derivatives) + !! - DIR_Z: Pencil decomposed in Z direction (for Z derivatives) use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -11,13 +35,16 @@ module m_vector_calculus implicit none type :: vector_calculus_t - !! Defines vector calculus operators - class(base_backend_t), pointer :: backend + !! Container for vector calculus operators. + !! + !! Provides methods for computing curl, divergence, gradient, and Laplacian. + !! All operations are delegated to the backend for computational flexibility. + class(base_backend_t), pointer :: backend !! Computational backend (CPU/GPU) contains - procedure :: curl - procedure :: divergence_v2c - procedure :: gradient_c2v - procedure :: laplacian + procedure :: curl !! Compute curl (vorticity) of vector field + procedure :: divergence_v2c !! Compute divergence from vertices to cell centres + procedure :: gradient_c2v !! Compute gradient from cell centres to vertices + procedure :: laplacian !! Compute Laplacian of scalar field end type vector_calculus_t interface vector_calculus_t @@ -27,10 +54,15 @@ module m_vector_calculus contains function init(backend) result(vector_calculus) + !! Initialise vector calculus module with computational backend. + !! + !! Simply stores a pointer to the backend, which provides access to + !! the allocator, reordering routines, and tridiagonal solvers needed + !! for computing derivatives. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(vector_calculus_t) :: vector_calculus + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(vector_calculus_t) :: vector_calculus !! 
Initialised vector calculus object vector_calculus%backend => backend @@ -142,21 +174,33 @@ subroutine divergence_v2c(self, div_u, u, v, w, & x_stagder_v2c, x_interpl_v2c, & y_stagder_v2c, y_interpl_v2c, & z_stagder_v2c, z_interpl_v2c) - !! Divergence of a vector field (u, v, w). + !! Compute divergence of a vector field from vertices to cell centres. + !! + !! Computes: + !! \[ \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + + !! \frac{\partial v}{\partial y} + \frac{\partial w}{\partial z} \] + !! + !! Input velocity components (u, v, w) are at vertices (VERT), and + !! divergence is evaluated at cell centres (CELL). This requires: + !! - **Staggered derivatives** in the aligned direction (e.g., du/dx uses x_stagder_v2c) + !! - **Interpolation** for cross terms (e.g., v and w interpolated in x direction) !! - !! Evaluated at the cell centers (data_loc=CELL) - !! Input fields are at vertices (data_loc=VERT) + !! The algorithm proceeds dimension by dimension: + !! 1. Compute du/dx (staggered), interpolate dv/dx, dw/dx in DIR_X + !! 2. Reorder to DIR_Y, compute dv/dy (staggered), interpolate du/dy, dw/dy + !! 3. Reorder to DIR_Z, compute dw/dz (staggered), interpolate du/dz + !! 4. Sum all components: div = du/dx + dv/dy + dw/dz !! - !! Input fields are in DIR_X data layout. - !! Output field is in DIR_Z data layout. + !! **Input:** All fields in DIR_X layout + !! **Output:** div_u in DIR_Z layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: div_u - class(field_t), intent(in) :: u, v, w - class(tdsops_t), intent(in) :: x_stagder_v2c, x_interpl_v2c, & - y_stagder_v2c, y_interpl_v2c, & - z_stagder_v2c, z_interpl_v2c + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: div_u !! Divergence output (CELL, DIR_Z) + class(field_t), intent(in) :: u, v, w !! Velocity components (VERT, DIR_X) + class(tdsops_t), intent(in) :: x_stagder_v2c, x_interpl_v2c, & !! 
X operators + y_stagder_v2c, y_interpl_v2c, & !! Y operators + z_stagder_v2c, z_interpl_v2c !! Z operators class(field_t), pointer :: du_x, dv_x, dw_x, & u_y, v_y, w_y, du_y, dv_y, dw_y, & @@ -248,21 +292,34 @@ subroutine gradient_c2v(self, dpdx, dpdy, dpdz, p, & x_stagder_c2v, x_interpl_c2v, & y_stagder_c2v, y_interpl_c2v, & z_stagder_c2v, z_interpl_c2v) - !! Gradient of a scalar field 'p'. + !! Compute gradient of a scalar field from cell centres to vertices. + !! + !! Computes: + !! \[ \nabla p = \left( \frac{\partial p}{\partial x}, + !! \frac{\partial p}{\partial y}, \frac{\partial p}{\partial z} \right) \] + !! + !! Input pressure p is at cell centres (CELL), and gradient components + !! are evaluated at vertices (VERT). This is the inverse operation of + !! divergence_v2c and is used in projection methods for incompressible flow. !! - !! Evaluated at the vertices (data_loc=VERT) - !! Input field is at cell centers (data_loc=CELL) + !! The algorithm proceeds in reverse order (Z→Y→X): + !! 1. Compute dp/dz (staggered), interpolate p in Z direction (DIR_Z) + !! 2. Reorder to DIR_Y, compute dp/dy (staggered), interpolate p and dpdz + !! 3. Reorder to DIR_X, compute dp/dx (staggered), interpolate dpdy and dpdz !! - !! Input field is in DIR_Z data layout. - !! Output fields (dpdx, dpdy, dpdz) are in DIR_X data layout. + !! This reverse ordering optimises memory usage by minimising temporary + !! field allocations. + !! + !! **Input:** p in DIR_Z layout + !! **Output:** dpdx, dpdy, dpdz in DIR_X layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: dpdx, dpdy, dpdz - class(field_t), intent(in) :: p - class(tdsops_t), intent(in) :: x_stagder_c2v, x_interpl_c2v, & - y_stagder_c2v, y_interpl_c2v, & - z_stagder_c2v, z_interpl_c2v + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: dpdx, dpdy, dpdz !! Gradient components (VERT, DIR_X) + class(field_t), intent(in) :: p !! 
Scalar field (CELL, DIR_Z) + class(tdsops_t), intent(in) :: x_stagder_c2v, x_interpl_c2v, & !! X operators + y_stagder_c2v, y_interpl_c2v, & !! Y operators + z_stagder_c2v, z_interpl_c2v !! Z operators class(field_t), pointer :: p_sxy_z, dpdz_sxy_z, & p_sxy_y, dpdz_sxy_y, & @@ -331,18 +388,31 @@ subroutine gradient_c2v(self, dpdx, dpdy, dpdz, p, & end subroutine gradient_c2v subroutine laplacian(self, lapl_u, u, x_der2nd, y_der2nd, z_der2nd) - !! Laplacian of a scalar field 'u'. + !! Compute Laplacian of a scalar field. + !! + !! Computes: + !! \[ \nabla^2 u = \frac{\partial^2 u}{\partial x^2} + + !! \frac{\partial^2 u}{\partial y^2} + \frac{\partial^2 u}{\partial z^2} \] + !! + !! The Laplacian is evaluated at the same grid location (CELL or VERT) + !! as the input field. This operator is used in diffusion terms and + !! Poisson equations. !! - !! Evaluated at the data_loc defined by the input u field + !! The algorithm computes second derivatives in each direction: + !! 1. Compute d²u/dx² directly in DIR_X + !! 2. Reorder to DIR_Y, compute d²u/dy², sum into result via sum_yintox + !! 3. Reorder to DIR_Z, compute d²u/dz², sum into result via sum_zintox !! - !! Input and output fields are in DIR_X layout. + !! The sum_yintox and sum_zintox operations add directional derivatives + !! directly into the DIR_X result field without additional reordering. + !! + !! **Input/Output:** All fields in DIR_X layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: lapl_u - class(field_t), intent(in) :: u - - class(tdsops_t), intent(in) :: x_der2nd, y_der2nd, z_der2nd + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: lapl_u !! Laplacian output (same data_loc as u, DIR_X) + class(field_t), intent(in) :: u !! Scalar field (DIR_X) + class(tdsops_t), intent(in) :: x_der2nd, y_der2nd, z_der2nd !! 
Second derivative operators class(field_t), pointer :: u_y, d2u_y, u_z, d2u_z From c651bb1ee1ed5c55a62cdb436370e2a9a4222227 Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 10:49:12 +0000 Subject: [PATCH 03/12] docs: add documentation for main program call and poisson_fft --- src/poisson_fft.f90 | 190 ++++++++++++++++++++++++++++++++------------ src/xcompact.f90 | 72 +++++++++++++---- 2 files changed, 196 insertions(+), 66 deletions(-) diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index 3efe96b65..89ef652f4 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -1,4 +1,32 @@ module m_poisson_fft + !! FFT-based spectral Poisson solver for incompressible flow. + !! + !! This module implements fast Fourier transform (FFT) based solvers for + !! the Poisson equation: + !! \[ \nabla^2 \phi = f \] + !! + !! **Solution Strategy:** + !! 1. **Forward FFT**: Transform RHS from physical to spectral space + !! 2. **Spectral division**: Solve algebraically using wave numbers: + !! \( \hat{\phi} = \hat{f} / k^2 \) + !! 3. **Backward FFT**: Transform solution back to physical space + !! + !! **Boundary Condition Support:** + !! - **Periodic (000)**: Fully periodic in all directions (standard FFT) + !! - **Mixed (010)**: Periodic in X/Z, non-periodic in Y (requires special handling) + !! + !! **Grid Stretching:** + !! - Uniform grids in X and Z (required for FFT) + !! - Y-direction stretching supported for 010 BCs via transformation matrices + !! - Stretching handled through spectral equivalence constants + !! + !! **Parallel Implementation:** + !! - Pencil decomposition in Y and Z directions (X must be undivided) + !! - Spectral space operations on permuted/transposed data layouts + !! - Backend-specific FFT implementations (CPU/GPU) + !! + !! The module is abstract; concrete implementations provide FFT routines + !! via deferred procedures (fft_forward, fft_backward, fft_postprocess). 
use m_common, only: dp, pi, CELL use m_field, only: field_t use m_mesh, only: mesh_t, geo_t @@ -7,111 +35,151 @@ module m_poisson_fft implicit none type, abstract :: poisson_fft_t - !! FFT based Poisson solver - !> Global dimensions + !! Abstract base type for FFT-based Poisson solvers. + !! + !! Concrete backend implementations (OMP, CUDA) extend this type + !! and provide FFT library integration (FFTW, cuFFT, etc.). + !> Global dimensions (full domain) integer :: nx_glob, ny_glob, nz_glob - !> Local dimensions + !> Local dimensions (subdomain on this rank) integer :: nx_loc, ny_loc, nz_loc - !> Local dimensions in the permuted slabs + !> Local dimensions in the permuted slabs (after transpose for FFT) integer :: nx_perm, ny_perm, nz_perm - !> Local dimensions in the permuted slabs in spectral space + !> Local dimensions in the permuted slabs in spectral space (complex) integer :: nx_spec, ny_spec, nz_spec - !> Offset in y and z directions in the permuted slabs in spectral space + !> Offset in x, y, z directions in the spectral space pencil integer :: x_sp_st, y_sp_st, z_sp_st - !> Local domain sized array storing the spectral equivalence constants + !> Local spectral equivalence constants (modified wave numbers) complex(dp), allocatable, dimension(:, :, :) :: waves - !> Wave numbers in x, y, and z + !> Tridiagonal coefficients for wave number computation (real part) real(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz - !> Wave numbers in x, y, and z + !> Complex wave numbers and their squares for each direction complex(dp), allocatable, dimension(:) :: kx, ky, kz, exs, eys, ezs, & k2x, k2y, k2z - !> Staggared grid transformation + !> Staggered grid transformation coefficients (real and imaginary parts) real(dp), allocatable, dimension(:) :: trans_x_re, trans_x_im, & trans_y_re, trans_y_im, & trans_z_re, trans_z_im - !> Periodicity in x, y, and z + !> Periodicity flags for each direction logical :: periodic_x, periodic_y, periodic_z, & - stretched_y = 
.false., stretched_y_sym - !> Stretching operator matrices + stretched_y = .false., stretched_y_sym !! Y-direction stretching + !> Stretching transformation matrices (odd/even modes, real/imaginary) real(dp), allocatable, dimension(:, :, :, :) :: a_odd_re, a_odd_im, & a_even_re, a_even_im, & a_re, a_im - !> lowmem option, only used in CUDA backend + !> Low memory mode flag (used for GPU backends to reduce memory usage) logical :: lowmem = .false. - !> Procedure pointer to BC specific poisson solvers + !> Procedure pointer to BC-specific Poisson solver implementation procedure(poisson_xxx), pointer :: poisson => null() contains - procedure(fft_forward), deferred :: fft_forward - procedure(fft_backward), deferred :: fft_backward - procedure(fft_postprocess), deferred :: fft_postprocess_000 - procedure(fft_postprocess), deferred :: fft_postprocess_010 - procedure(field_process), deferred :: enforce_periodicity_y - procedure(field_process), deferred :: undo_periodicity_y - procedure :: base_init - procedure :: solve_poisson - procedure :: stretching_matrix - procedure :: waves_set - procedure :: get_km - procedure :: get_km_re - procedure :: get_km_im + procedure(fft_forward), deferred :: fft_forward !! Forward FFT (deferred) + procedure(fft_backward), deferred :: fft_backward !! Backward FFT (deferred) + procedure(fft_postprocess), deferred :: fft_postprocess_000 !! Postprocess for 000 BCs + procedure(fft_postprocess), deferred :: fft_postprocess_010 !! Postprocess for 010 BCs + procedure(field_process), deferred :: enforce_periodicity_y !! Enforce Y periodicity + procedure(field_process), deferred :: undo_periodicity_y !! Undo Y periodicity + procedure :: base_init !! Initialise Poisson solver + procedure :: solve_poisson !! Main interface to solve Poisson equation + procedure :: stretching_matrix !! Compute stretching transformation matrices + procedure :: waves_set !! Compute spectral equivalence constants + procedure :: get_km !! 
Get complex wave number + procedure :: get_km_re !! Get real part of wave number + procedure :: get_km_im !! Get imaginary part of wave number end type poisson_fft_t abstract interface subroutine fft_forward(self, f_in) + !! Abstract interface for forward FFT transform. + !! + !! Transforms field from physical space to spectral space. + !! Implementation is backend-specific (FFTW, cuFFT, etc.). import :: poisson_fft_t import :: field_t implicit none - class(poisson_fft_t) :: self - class(field_t), intent(in) :: f_in + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(in) :: f_in !! Input field in physical space end subroutine fft_forward subroutine fft_backward(self, f_out) + !! Abstract interface for backward (inverse) FFT transform. + !! + !! Transforms field from spectral space back to physical space. + !! Implementation is backend-specific (FFTW, cuFFT, etc.). import :: poisson_fft_t import :: field_t implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f_out !! Output field in physical space end subroutine fft_backward subroutine fft_postprocess(self) + !! Abstract interface for spectral space postprocessing. + !! + !! Applies spectral division and any BC-specific operations + !! in Fourier space. Different implementations for different + !! boundary condition combinations (000, 010, etc.). import :: poisson_fft_t implicit none - class(poisson_fft_t) :: self + class(poisson_fft_t) :: self !! Poisson solver instance end subroutine fft_postprocess end interface abstract interface subroutine poisson_xxx(self, f, temp) + !! Abstract interface for complete Poisson solve. + !! + !! Orchestrates forward FFT, postprocessing, and backward FFT. + !! Different implementations for different BC combinations. 
import :: poisson_fft_t import :: field_t - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field and temporary storage end subroutine poisson_xxx subroutine field_process(self, f_out, f_in) + !! Abstract interface for field processing operations. + !! + !! Used for enforcing or undoing periodicity in non-periodic + !! directions (e.g., Y direction for 010 BCs). import :: poisson_fft_t import :: field_t - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f_out !! Output field + class(field_t), intent(in) :: f_in !! Input field end subroutine field_process end interface contains subroutine base_init(self, mesh, xdirps, ydirps, zdirps, n_spec, n_sp_st) + !! Initialise FFT-based Poisson solver with mesh and decomposition info. + !! + !! Sets up: + !! - Domain dimensions (global and local) + !! - Periodicity flags from boundary conditions + !! - Spectral space dimensions and offsets + !! - Wave number arrays and spectral equivalence constants + !! - Stretching matrices (if Y-direction is stretched) + !! - Function pointer to appropriate BC-specific solver + !! + !! **Restrictions:** + !! - X-direction must not be decomposed (nproc_dir(1) must be 1) + !! - Only Y-direction stretching is supported + !! - Currently supports 000 (fully periodic) and 010 (Y non-periodic) BCs + !! + !! **Note:** 010 BCs with multiple MPI ranks not yet supported. implicit none - class(poisson_fft_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - integer, dimension(3), intent(in) :: n_spec ! Size of the spectral pencil - integer, dimension(3), intent(in) :: n_sp_st ! Offset of the spectral pencil + class(poisson_fft_t) :: self !! 
Poisson solver instance + type(mesh_t), intent(in) :: mesh !! Mesh object with grid and decomposition + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + integer, dimension(3), intent(in) :: n_spec !! Size of the spectral pencil [nx, ny, nz] + integer, dimension(3), intent(in) :: n_sp_st !! Offset of the spectral pencil [x, y, z] integer :: dims(3) @@ -180,20 +248,33 @@ subroutine base_init(self, mesh, xdirps, ydirps, zdirps, n_spec, n_sp_st) end subroutine base_init subroutine solve_poisson(self, f, temp) + !! Main interface to solve Poisson equation. + !! + !! Delegates to the BC-specific solver function pointed to by + !! self%poisson (either poisson_000 or poisson_010). This provides + !! a uniform interface regardless of boundary conditions. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field to solve (RHS in, solution out), temporary call self%poisson(f, temp) end subroutine solve_poisson subroutine poisson_000(self, f, temp) + !! Solve Poisson equation with fully periodic (000) boundary conditions. + !! + !! For periodic BCs in all directions, the solution procedure is: + !! 1. Forward FFT: f → f_hat + !! 2. Spectral division: f_hat / k² → solution_hat + !! 3. Backward FFT: solution_hat → solution + !! + !! This is the simplest case requiring no special handling for BCs. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field (RHS in, solution out), temporary (unused) call self%fft_forward(f) call self%fft_postprocess_000 @@ -202,10 +283,21 @@ subroutine poisson_000(self, f, temp) end subroutine poisson_000 subroutine poisson_010(self, f, temp) + !! Solve Poisson equation with mixed (010) boundary conditions. + !! + !! 
For periodic in X/Z, non-periodic in Y, the solution procedure is: + !! 1. Enforce artificial periodicity in Y using symmetry extension + !! 2. Forward FFT: f → f_hat + !! 3. Spectral division with stretching corrections (if grid is stretched) + !! 4. Backward FFT: solution_hat → solution + !! 5. Undo artificial periodicity to recover physical solution + !! + !! The symmetry extension doubles the domain size in Y to handle + !! non-periodic BCs via FFT. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field (RHS in, solution out), temporary call self%enforce_periodicity_y(temp, f) diff --git a/src/xcompact.f90 b/src/xcompact.f90 index dcfed3fd8..e1d091c9b 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -1,4 +1,39 @@ program xcompact + !! Main program for X3D2 CFD solver. + !! + !! X3D2 is a high-order finite-difference incompressible Navier-Stokes + !! solver based on Xcompact3D/Incompact3D. It solves the incompressible + !! Navier-Stokes equations using: + !! - **Compact finite differences** for spatial derivatives (4th-6th order) + !! - **Fractional-step method** for pressure-velocity coupling + !! - **FFT-based or iterative Poisson solvers** for pressure + !! - **Explicit time integration** (Runge-Kutta or Adams-Bashforth) + !! + !! **Program Flow:** + !! 1. Initialise MPI and determine rank/size + !! 2. Select computational backend (CUDA GPU or OpenMP CPU) + !! 3. Read configuration from input file (domain and solver parameters) + !! 4. Create mesh with domain decomposition (pencil decomposition) + !! 5. Instantiate allocator and backend for the selected platform + !! 6. Select and instantiate flow case (channel, TGV, generic, etc.) + !! 7. Run simulation via flow_case%run() + !! 8. Report timing and finalise MPI + !! + !! **Backend Options:** + !! 
- **CUDA**: GPU acceleration via NVIDIA CUDA (compile with -DCUDA) + !! - **OMP**: CPU parallelism via OpenMP threading + !! + !! **Input:** Namelist file specified as command-line argument (e.g., input.x3d) + !! + !! **Domain Decomposition:** + !! X3D2 supports two decomposition strategies: + !! - **2DECOMP&FFT**: External library used when FFT Poisson solver + OMP backend. + !! Provides optimised pencil decomposition and FFT transforms. Cannot decompose + !! in X-direction (nproc_dir(1) must be 1). + !! - **Generic**: Built-in X3D2 decomposition used for CUDA backend or when + !! 2DECOMP&FFT is unavailable. Can decompose in any direction (X, Y, Z). + !! + !! The decomposition is selected automatically based on backend and solver type. use mpi use m_allocator @@ -22,30 +57,31 @@ program xcompact implicit none - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - type(allocator_t), pointer :: host_allocator - type(mesh_t), target :: mesh - class(base_case_t), allocatable :: flow_case + class(base_backend_t), pointer :: backend !! Active computational backend (CUDA or OMP) + class(allocator_t), pointer :: allocator !! Memory allocator for device/host + type(allocator_t), pointer :: host_allocator !! Host memory allocator (for I/O, etc.) + type(mesh_t), target :: mesh !! Computational mesh with decomposition + class(base_case_t), allocatable :: flow_case !! Flow case instance (polymorphic) #ifdef CUDA - type(cuda_backend_t), target :: cuda_backend - type(cuda_allocator_t), target :: cuda_allocator - integer :: ndevs, devnum + type(cuda_backend_t), target :: cuda_backend !! CUDA backend implementation + type(cuda_allocator_t), target :: cuda_allocator !! CUDA device memory allocator + integer :: ndevs, devnum !! Number of GPUs, assigned device number #else - type(omp_backend_t), target :: omp_backend + type(omp_backend_t), target :: omp_backend !! 
OpenMP backend implementation #endif - type(allocator_t), target :: omp_allocator + type(allocator_t), target :: omp_allocator !! Host/CPU memory allocator - real(dp) :: t_start, t_end + real(dp) :: t_start, t_end !! CPU timing for performance measurement - type(domain_config_t) :: domain_cfg - type(solver_config_t) :: solver_cfg - character(32) :: backend_name - integer :: dims(3), nrank, nproc, ierr - logical :: use_2decomp + type(domain_config_t) :: domain_cfg !! Domain configuration from input file + type(solver_config_t) :: solver_cfg !! Solver configuration from input file + character(32) :: backend_name !! Backend name string ("CUDA" or "OMP") + integer :: dims(3), nrank, nproc, ierr !! Dimensions, MPI rank/size, error code + logical :: use_2decomp !! Whether to use 2DECOMP&FFT library + ! Initialise MPI call MPI_Init(ierr) call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) @@ -74,7 +110,9 @@ program xcompact domain_cfg%nproc_dir = [1, 1, nproc] end if - ! Decide whether 2decomp is used or not + ! Select decomposition strategy: + ! - 2DECOMP&FFT: Used for FFT Poisson solver with OMP backend (optimised) + ! - Generic: Used for CUDA backend or non-FFT solvers (more flexible) use_2decomp = solver_cfg%poisson_solver_type == 'FFT' & .and. trim(backend_name) == 'OMP' From fe38405046962b8eb6038884831c4ab57291915b Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 11:16:01 +0000 Subject: [PATCH 04/12] docs: removed special characters introduced previously --- src/poisson_fft.f90 | 10 +++++----- src/vector_calculus.f90 | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index 89ef652f4..97fd11a32 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -266,9 +266,9 @@ subroutine poisson_000(self, f, temp) !! Solve Poisson equation with fully periodic (000) boundary conditions. !! !! 
For periodic BCs in all directions, the solution procedure is: - !! 1. Forward FFT: f → f_hat - !! 2. Spectral division: f_hat / k² → solution_hat - !! 3. Backward FFT: solution_hat → solution + !! 1. Forward FFT: f to f_hat + !! 2. Spectral division: \( \hat{f} / k^2 \) gives solution_hat + !! 3. Backward FFT: solution_hat to solution !! !! This is the simplest case requiring no special handling for BCs. implicit none @@ -287,9 +287,9 @@ subroutine poisson_010(self, f, temp) !! !! For periodic in X/Z, non-periodic in Y, the solution procedure is: !! 1. Enforce artificial periodicity in Y using symmetry extension - !! 2. Forward FFT: f → f_hat + !! 2. Forward FFT: f to f_hat !! 3. Spectral division with stretching corrections (if grid is stretched) - !! 4. Backward FFT: solution_hat → solution + !! 4. Backward FFT: solution_hat to solution !! 5. Undo artificial periodicity to recover physical solution !! !! The symmetry extension doubles the domain size in Y to handle diff --git a/src/vector_calculus.f90 b/src/vector_calculus.f90 index 324f482e4..ce1110f1e 100644 --- a/src/vector_calculus.f90 +++ b/src/vector_calculus.f90 @@ -302,7 +302,7 @@ subroutine gradient_c2v(self, dpdx, dpdy, dpdz, p, & !! are evaluated at vertices (VERT). This is the inverse operation of !! divergence_v2c and is used in projection methods for incompressible flow. !! - !! The algorithm proceeds in reverse order (Z→Y→X): + !! The algorithm proceeds in reverse order (Z to Y to X): !! 1. Compute dp/dz (staggered), interpolate p in Z direction (DIR_Z) !! 2. Reorder to DIR_Y, compute dp/dy (staggered), interpolate p and dpdz !! 3. Reorder to DIR_X, compute dp/dx (staggered), interpolate dpdy and dpdz @@ -399,9 +399,9 @@ subroutine laplacian(self, lapl_u, u, x_der2nd, y_der2nd, z_der2nd) !! Poisson equations. !! !! The algorithm computes second derivatives in each direction: - !! 1. Compute d²u/dx² directly in DIR_X - !! 2. Reorder to DIR_Y, compute d²u/dy², sum into result via sum_yintox - !! 3. 
Reorder to DIR_Z, compute d²u/dz², sum into result via sum_zintox + !! 1. Compute \( d^2u/dx^2 \) directly in DIR_X + !! 2. Reorder to DIR_Y, compute \( d^2u/dy^2 \), sum into result via sum_yintox + !! 3. Reorder to DIR_Z, compute \( d^2u/dz^2 \), sum into result via sum_zintox !! !! The sum_yintox and sum_zintox operations add directional derivatives !! directly into the DIR_X result field without additional reordering. From d5ec4fbc83ef7c4bfa34050bde6c4e76d834cac5 Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 11:20:39 +0000 Subject: [PATCH 05/12] docs: add documentation to cases --- src/case/base_case.f90 | 228 ++++++++++++++++++++++++++++++++--------- src/case/channel.f90 | 117 +++++++++++++++++---- src/case/generic.f90 | 56 +++++++--- src/case/tgv.f90 | 72 ++++++++++--- 4 files changed, 378 insertions(+), 95 deletions(-) diff --git a/src/case/base_case.f90 b/src/case/base_case.f90 index e7a673d81..3445a5345 100644 --- a/src/case/base_case.f90 +++ b/src/case/base_case.f90 @@ -1,8 +1,33 @@ module m_base_case - !! Provides the base case for running a simulation. New cases are - !! implemented by extending this to specify the initial and boundary - !! conditions, forcing terms and case-specific postprocessing and analysis. - + !! Base class for flow simulation cases. + !! + !! This abstract base class provides the framework for implementing specific + !! flow cases (channel, TGV, generic, etc.). New cases extend this class and + !! override deferred procedures to specify: + !! - **Initial conditions**: Set velocity and other field initial states + !! - **Boundary conditions**: Apply physical boundary conditions each timestep + !! - **Forcing terms**: Add body forces or model-specific source terms + !! - **Pre-correction**: Modify velocity before pressure correction (e.g., IBM) + !! - **Postprocessing**: Compute statistics, output diagnostics, etc. + !! + !! **Simulation Workflow:** + !! 
The `run()` method orchestrates the time integration loop: + !! 1. Apply boundary conditions + !! 2. Advance solution one timestep via solver%step() + !! 3. Write checkpoints/snapshots (via checkpoint_mgr) + !! 4. Perform case-specific postprocessing + !! 5. Repeat until final time reached + !! + !! **Time Integration:** + !! Each timestep involves multiple stages (for RK) or steps (for AB): + !! - Transport equation (transeq) computes velocity derivatives + !! - Forcing terms applied after transeq + !! - Pre-correction modifies velocity (e.g., for immersed boundaries) + !! - Pressure correction enforces incompressibility + !! + !! **Restart Capability:** + !! The checkpoint manager handles restart from saved states automatically + !! if a restart file is detected. use m_allocator, only: allocator_t use m_base_backend, only: base_backend_t use m_common, only: dp, DIR_X, DIR_Z, DIR_C, VERT @@ -15,82 +40,109 @@ module m_base_case implicit none type, abstract :: base_case_t - class(solver_t), allocatable :: solver - type(io_manager_t) :: checkpoint_mgr + !! Abstract base type for flow cases. + !! + !! Derived types must implement all deferred procedures to define + !! case-specific behaviour. + class(solver_t), allocatable :: solver !! Incompressible Navier-Stokes solver + type(io_manager_t) :: checkpoint_mgr !! Checkpoint and snapshot manager contains - procedure(boundary_conditions), deferred :: boundary_conditions - procedure(initial_conditions), deferred :: initial_conditions - procedure(forcings), deferred :: forcings - procedure(pre_correction), deferred :: pre_correction - procedure(postprocess), deferred :: postprocess - procedure :: case_init - procedure :: case_finalise - procedure :: set_init - procedure :: run - procedure :: print_enstrophy - procedure :: print_div_max_mean + procedure(boundary_conditions), deferred :: boundary_conditions !! Apply BCs (deferred) + procedure(initial_conditions), deferred :: initial_conditions !! 
Set ICs (deferred) + procedure(forcings), deferred :: forcings !! Add forcing terms (deferred) + procedure(pre_correction), deferred :: pre_correction !! Pre-pressure correction (deferred) + procedure(postprocess), deferred :: postprocess !! Case-specific analysis (deferred) + procedure :: case_init !! Initialise case and solver + procedure :: case_finalise !! Clean up and finalise + procedure :: set_init !! Set initial conditions and prepare for run + procedure :: run !! Main time integration loop + procedure :: print_enstrophy !! Print enstrophy diagnostic + procedure :: print_div_max_mean !! Print divergence diagnostics end type base_case_t abstract interface subroutine boundary_conditions(self) - !! Applies case-specific boundary coinditions + !! Abstract interface for applying boundary conditions. + !! + !! Called each timestep before computing derivatives. Implementations + !! should set velocity and scalar values at domain boundaries according + !! to the physical boundary conditions (Dirichlet, Neumann, periodic, etc.). import :: base_case_t implicit none - class(base_case_t) :: self + class(base_case_t) :: self !! Case instance end subroutine boundary_conditions subroutine initial_conditions(self) - !! Sets case-specific initial conditions + !! Abstract interface for setting initial conditions. + !! + !! Called once during initialisation to set the initial state of velocity + !! and scalar fields. Implementations should populate u, v, w (and species + !! if present) with case-appropriate initial values. import :: base_case_t implicit none - class(base_case_t) :: self + class(base_case_t) :: self !! Case instance end subroutine initial_conditions subroutine forcings(self, du, dv, dw, iter) - !! Applies case-specific or model realated forcings after transeq + !! Abstract interface for applying forcing terms. + !! + !! Called after transport equation (transeq) but before pressure correction. + !! 
Add body forces, source terms, or model-specific forcings (e.g., mean + !! pressure gradient for channel flow, immersed boundary forces, etc.). import :: base_case_t import :: field_t implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: du, dv, dw - integer, intent(in) :: iter + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: du, dv, dw !! Velocity derivatives to modify + integer, intent(in) :: iter !! Current iteration number end subroutine forcings subroutine pre_correction(self, u, v, w) - !! Applies case-specific pre-correction to the velocity fields before - !! pressure correction + !! Abstract interface for pre-pressure correction modifications. + !! + !! Called after forcings but before pressure correction. Used for operations + !! that need to modify the velocity field before enforcing incompressibility, + !! such as immersed boundary method (IBM) velocity corrections. import :: base_case_t import :: field_t implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: u, v, w + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: u, v, w !! Velocity fields to modify end subroutine pre_correction subroutine postprocess(self, iter, t) - !! Triggers case-specific postprocessings at user specified intervals + !! Abstract interface for case-specific postprocessing. + !! + !! Called at user-specified intervals during time integration. Implement + !! this to compute statistics, output diagnostics, write custom data files, + !! or perform any case-specific analysis. import :: base_case_t import :: dp implicit none - class(base_case_t) :: self - integer, intent(in) :: iter - real(dp), intent(in) :: t + class(base_case_t) :: self !! Case instance + integer, intent(in) :: iter !! Current iteration number + real(dp), intent(in) :: t !! Current simulation time end subroutine postprocess end interface contains subroutine case_init(self, backend, mesh, host_allocator) + !! 
Initialise case with solver and checkpoint manager. + !! + !! Creates the solver instance and initialises the checkpoint/snapshot + !! manager. If a restart file is detected, loads the saved state. implicit none - class(base_case_t) :: self - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator + class(base_case_t) :: self !! Case instance + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator self%solver = init(backend, mesh, host_allocator) @@ -104,7 +156,14 @@ subroutine case_init(self, backend, mesh, host_allocator) end subroutine case_init subroutine case_finalise(self) - class(base_case_t) :: self + !! Finalise the case and clean up resources. + !! + !! Performs cleanup operations at the end of a simulation run: + !! - Finalises the checkpoint manager (closes files, flushes buffers) + !! - Prints completion message on root process + !! + !! This should be called after the main time integration loop completes. + class(base_case_t) :: self !! Case instance to finalise if (self%solver%mesh%par%is_root()) print *, 'run end' @@ -112,18 +171,35 @@ subroutine case_finalise(self) end subroutine case_finalise subroutine set_init(self, field, field_func) + !! Initialise a field using an analytical function. + !! + !! This utility subroutine sets a field's values by evaluating a + !! user-provided pure function at each grid point. The function + !! is evaluated on the host, then transferred to the backend device + !! (if using GPU backend). + !! + !! **Usage Example:** + !! ```fortran + !! call self%set_init(self%solver%u, u_initial) + !! ``` + !! where `u_initial` is a pure function taking coordinates [x,y,z] + !! and returning the initial velocity value. + !! + !! 
This is commonly used in `initial_conditions()` implementations + !! to set velocity or scalar fields from analytical expressions. implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: field + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: field !! Field to initialise interface pure function field_func(coords) result(r) + !! Pure function defining field values at each point. import dp implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Spatial coordinates [x, y, z] + real(dp) :: r !! Field value at this location end function field_func end interface @@ -151,11 +227,24 @@ end function field_func end subroutine set_init subroutine print_enstrophy(self, u, v, w) - !! Reports the enstrophy + !! Compute and print the volume-averaged enstrophy. + !! + !! Enstrophy is a measure of the rotational kinetic energy density: + !! \[ E = \frac{1}{2V} \int_V |\omega|^2 \, dV = \frac{1}{2V} \int_V |\nabla \times \mathbf{u}|^2 \, dV \] + !! + !! where \( \omega = \nabla \times \mathbf{u} \) is the vorticity. + !! + !! This diagnostic is useful for monitoring: + !! - Flow transition to turbulence (enstrophy increases) + !! - Energy cascade to small scales + !! - Numerical stability (sudden spikes indicate problems) + !! - Comparison with theoretical predictions (e.g., TGV decay) + !! + !! Only the root MPI rank prints the result. implicit none - class(base_case_t), intent(in) :: self - class(field_t), intent(in) :: u, v, w + class(base_case_t), intent(in) :: self !! Case instance + class(field_t), intent(in) :: u, v, w !! Velocity components class(field_t), pointer :: du, dv, dw real(dp) :: enstrophy @@ -180,11 +269,29 @@ subroutine print_enstrophy(self, u, v, w) end subroutine print_enstrophy subroutine print_div_max_mean(self, u, v, w) - !! Reports the div(u) at cell centres + !! Compute and print maximum and mean divergence. + !! + !! 
For incompressible flow, the velocity divergence should be zero: + !! \[ \nabla \cdot \mathbf{u} = 0 \] + !! + !! This diagnostic reports: + !! - **Maximum divergence**: Largest local violation of incompressibility + !! - **Mean divergence**: Volume-averaged divergence (should be near machine zero) + !! + !! **Purpose:** + !! - Monitor quality of pressure correction (divergence should be ~ 1e-10 or smaller) + !! - Detect numerical issues (large divergence indicates solver problems) + !! - Verify proper boundary condition implementation + !! - Check convergence of iterative Poisson solvers + !! + !! Divergence is computed at cell centres from vertex velocities using + !! staggered derivatives and interpolation. + !! + !! Only the root MPI rank prints the result. implicit none - class(base_case_t), intent(in) :: self - class(field_t), intent(in) :: u, v, w + class(base_case_t), intent(in) :: self !! Case instance + class(field_t), intent(in) :: u, v, w !! Velocity components class(field_t), pointer :: div_u real(dp) :: div_u_max, div_u_mean @@ -202,11 +309,30 @@ subroutine print_div_max_mean(self, u, v, w) end subroutine print_div_max_mean subroutine run(self) - !! Runs the solver forwards in time from t=t_0 to t=T, performing - !! postprocessing/IO and reporting diagnostics. + !! Main time integration loop for the simulation. + !! + !! Advances the solution from initial time t=t_0 to final time t=T, + !! orchestrating all aspects of the simulation: + !! + !! **Each Timestep:** + !! 1. Apply boundary conditions + !! 2. Compute derivatives and advance via time_integrator%step() + !! 3. Handle checkpointing and snapshot output (via checkpoint_mgr) + !! 4. Perform case-specific postprocessing + !! 5. Print diagnostics (divergence, enstrophy) + !! + !! **Time Integration Stages:** + !! For multi-stage methods (RK), each timestep involves multiple stages. + !! The solver%step() method handles the stage-by-stage advancement, + !! 
calling transeq, forcings, pre_correction, and pressure_correction + !! at appropriate points. + !! + !! **Restart Support:** + !! If a restart file is detected, continues from the saved iteration + !! and time rather than starting from t=0. implicit none - class(base_case_t), intent(inout) :: self + class(base_case_t), intent(inout) :: self !! Case instance type(flist_t), allocatable :: curr(:) type(flist_t), allocatable :: deriv(:) diff --git a/src/case/channel.f90 b/src/case/channel.f90 index 846dcf93e..7010e94e4 100644 --- a/src/case/channel.f90 +++ b/src/case/channel.f90 @@ -1,4 +1,28 @@ module m_case_channel + !! Turbulent channel flow case with optional rotation. + !! + !! This module implements a turbulent channel flow simulation between + !! two parallel walls. The flow is driven by a mean pressure gradient + !! to maintain a target bulk velocity. + !! + !! **Flow Configuration:** + !! - Domain: Periodic in X and Z, wall-bounded in Y + !! - Walls at y = 0 and y = L_y with no-slip boundary conditions + !! - Mean pressure gradient maintains constant bulk velocity + !! - Optional rotation forcing (Coriolis-like terms) for rotating channel + !! + !! **Initial Conditions:** + !! - Parabolic base profile: \( u = 1 - y^2 \) + !! - Random perturbations with configurable amplitude (noise parameter) + !! - Perturbations concentrated near centreline for faster transition + !! + !! **Boundary Conditions:** + !! - No-slip walls: u = v = w = 0 at y = 0 and y = L_y + !! - Enforces mean bulk velocity via volume shift (simulates pressure gradient) + !! + !! **Forcing:** + !! - Mean pressure gradient (constant in time, via bulk velocity constraint) + !! - Optional Coriolis forcing for rotating channel flows use iso_fortran_env, only: stderr => error_unit use mpi @@ -13,13 +37,14 @@ module m_case_channel implicit none type, extends(base_case_t) :: case_channel_t - type(channel_config_t) :: channel_cfg + !! Channel flow case with optional rotation forcing. 
+ type(channel_config_t) :: channel_cfg !! Channel-specific configuration contains - procedure :: boundary_conditions => boundary_conditions_channel - procedure :: initial_conditions => initial_conditions_channel - procedure :: forcings => forcings_channel - procedure :: pre_correction => pre_correction_channel - procedure :: postprocess => postprocess_channel + procedure :: boundary_conditions => boundary_conditions_channel !! Apply bulk velocity constraint + procedure :: initial_conditions => initial_conditions_channel !! Set perturbed parabolic profile + procedure :: forcings => forcings_channel !! Apply rotation forcing (if enabled) + procedure :: pre_correction => pre_correction_channel !! Enforce wall boundary conditions + procedure :: postprocess => postprocess_channel !! Compute statistics end type case_channel_t interface case_channel_t @@ -29,12 +54,15 @@ module m_case_channel contains function case_channel_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise channel flow case. + !! + !! Reads channel-specific configuration and initialises the base case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_channel_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_channel_t) :: flow_case !! Initialised channel case call flow_case%channel_cfg%read(nml_file=get_argument(1)) @@ -43,9 +71,14 @@ function case_channel_init(backend, mesh, host_allocator) result(flow_case) end function case_channel_init subroutine boundary_conditions_channel(self) + !! Apply boundary conditions to enforce target bulk velocity. + !! + !! Computes the current bulk (volume-averaged) velocity and applies + !! 
a uniform shift to maintain the target value of 2/3. This simulates
+    !! the effect of a mean pressure gradient driving the flow.
     implicit none
 
-    class(case_channel_t) :: self
+    class(case_channel_t) :: self !! Channel case instance
 
     real(dp) :: can, ub
     integer :: ierr
@@ -63,9 +96,18 @@ subroutine boundary_conditions_channel(self)
   end subroutine boundary_conditions_channel
 
   subroutine initial_conditions_channel(self)
+    !! Set initial velocity field with perturbed parabolic profile.
+    !!
+    !! Creates a laminar parabolic profile \( u = 1 - y^2 \) and adds random
+    !! perturbations scaled by the noise parameter. Perturbations are
+    !! amplitude-modulated with a Gaussian centred at the channel centreline
+    !! to concentrate disturbances where they are most effective for
+    !! triggering turbulent transition.
+    !!
+    !! No-slip conditions (u = v = w = 0) are enforced at walls (y=0, y=L_y).
     implicit none
 
-    class(case_channel_t) :: self
+    class(case_channel_t) :: self !! Channel case instance
 
     class(field_t), pointer :: u_init, v_init, w_init
 
@@ -119,13 +161,32 @@ subroutine initial_conditions_channel(self)
   end subroutine initial_conditions_channel
 
   subroutine forcings_channel(self, du, dv, dw, iter)
+    !! Apply rotation forcing (Coriolis-like terms) if enabled.
+    !!
+    !! For rotating channel flows, adds Coriolis-like forcing terms that
+    !! couple the streamwise (u) and wall-normal (v) velocities:
+    !!
+    !! \[ \frac{du}{dt} = \ldots - \Omega v \]
+    !! \[ \frac{dv}{dt} = \ldots + \Omega u \]
+    !!
+    !! where \( \Omega \) is the rotation rate (omega_rot).
+    !!
+    !! **Configuration:**
+    !! - Activated via `channel_cfg%rotation = .true.`
+    !! - Rotation rate set by `channel_cfg%omega_rot`
+    !! - Applied only for first `n_rotate` iterations to allow spin-up
+    !!
+    !! **Physical Interpretation:**
+    !! Mimics effects of system rotation (e.g., rotating reference frame)
+    !! without explicitly implementing Coriolis force. Useful for studying
+    !! 
rotation effects on turbulent channel flows. implicit none - class(case_channel_t) :: self - class(field_t), intent(inout) :: du, dv, dw - integer, intent(in) :: iter + class(case_channel_t) :: self !! Channel case instance + class(field_t), intent(inout) :: du, dv, dw !! Velocity derivatives to modify + integer, intent(in) :: iter !! Current iteration number - real(dp) :: rot + real(dp) :: rot !! Rotation rate for current forcing application if (self%channel_cfg%rotation .and. iter < self%channel_cfg%n_rotate) then rot = self%channel_cfg%omega_rot @@ -136,10 +197,30 @@ subroutine forcings_channel(self, du, dv, dw, iter) end subroutine forcings_channel subroutine pre_correction_channel(self, u, v, w) + !! Enforce no-slip boundary conditions at channel walls. + !! + !! Sets all velocity components to zero at the wall boundaries (Y-faces): + !! - Lower wall: y = 0 + !! - Upper wall: y = L_y + !! + !! This implements the no-slip condition: + !! \[ u = v = w = 0 \quad \text{at walls} \] + !! + !! **Implementation:** + !! Uses `field_set_face` to directly set values on Y-direction faces + !! (boundaries perpendicular to Y-axis). This is applied after the + !! time integration step but before pressure correction, ensuring that + !! the corrected velocity field satisfies both incompressibility and + !! no-slip boundary conditions. + !! + !! **Note:** + !! This is the standard approach for wall-bounded flows. For periodic + !! or other boundary conditions, this subroutine would be modified or + !! left empty. implicit none - class(case_channel_t) :: self - class(field_t), intent(inout) :: u, v, w + class(case_channel_t) :: self !! Channel case instance + class(field_t), intent(inout) :: u, v, w !! 
Velocity components to correct call self%solver%backend%field_set_face(u, 0._dp, 0._dp, Y_FACE) call self%solver%backend%field_set_face(v, 0._dp, 0._dp, Y_FACE) diff --git a/src/case/generic.f90 b/src/case/generic.f90 index 193767999..a83fa3d47 100644 --- a/src/case/generic.f90 +++ b/src/case/generic.f90 @@ -1,6 +1,26 @@ module m_case_generic - !! An example case set up to run and sustain a freestream flow. - !! This is a good place to start for adding a new flow case. + !! Generic freestream flow case for general-purpose simulations. + !! + !! This module provides a minimal template for setting up custom flow + !! cases. It implements a simple uniform freestream flow (u=1, v=0, w=0) + !! with no forcing or boundary corrections. + !! + !! **Use Cases:** + !! - Starting point for implementing new flow cases + !! - Testing solver functionality with simple initial conditions + !! - Freestream simulations with immersed boundaries (add IBM via forcings) + !! - Custom flow setups requiring minimal default behaviour + !! + !! **Default Configuration:** + !! - Initial condition: Uniform flow u=1, v=w=0 + !! - No boundary condition corrections + !! - No forcing terms + !! - No pre-correction + !! - Minimal postprocessing + !! + !! **Customisation:** + !! Users can extend this case or modify the procedures directly to implement + !! specific flow physics, boundary conditions, or forcing terms. use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -14,12 +34,13 @@ module m_case_generic implicit none type, extends(base_case_t) :: case_generic_t + !! Generic case with minimal default behaviour. contains - procedure :: boundary_conditions => boundary_conditions_generic - procedure :: initial_conditions => initial_conditions_generic - procedure :: forcings => forcings_generic - procedure :: pre_correction => pre_correction_generic - procedure :: postprocess => postprocess_generic + procedure :: boundary_conditions => boundary_conditions_generic !! 
No action (use domain BCs) + procedure :: initial_conditions => initial_conditions_generic !! Uniform freestream + procedure :: forcings => forcings_generic !! No forcing + procedure :: pre_correction => pre_correction_generic !! No correction + procedure :: postprocess => postprocess_generic !! Minimal diagnostics end type case_generic_t interface case_generic_t @@ -29,12 +50,13 @@ module m_case_generic contains function case_generic_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise generic flow case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_generic_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_generic_t) :: flow_case !! Initialised generic case call flow_case%case_init(backend, mesh, host_allocator) @@ -48,9 +70,19 @@ subroutine boundary_conditions_generic(self) end subroutine boundary_conditions_generic subroutine initial_conditions_generic(self) + !! Set initial velocity field for generic freestream case. + !! + !! Initialises a uniform flow field with: + !! - \( u = 1 \) (streamwise velocity) + !! - \( v = 0 \) (cross-stream velocity) + !! - \( w = 0 \) (spanwise velocity) + !! + !! All velocity components are located at vertices (VERT). + !! This simple uniform flow serves as a starting point that users + !! can modify for their specific applications. implicit none - class(case_generic_t) :: self + class(case_generic_t) :: self !! 
Generic case instance call self%solver%u%fill(1._dp) call self%solver%v%fill(0._dp) diff --git a/src/case/tgv.f90 b/src/case/tgv.f90 index 971ee29fb..a9462ef08 100644 --- a/src/case/tgv.f90 +++ b/src/case/tgv.f90 @@ -1,4 +1,35 @@ module m_case_tgv + !! Taylor-Green vortex (TGV) case for validation and benchmarking. + !! + !! The Taylor-Green vortex is a canonical test case for incompressible + !! Navier-Stokes solvers. It features an analytically-defined initial + !! condition that transitions from laminar to turbulent flow, providing + !! a rigorous test of: + !! - Spatial discretisation accuracy + !! - Time integration stability + !! - Energy conservation properties + !! - Transition to turbulence physics + !! + !! **Initial Conditions:** + !! \[ u = \sin(x) \cos(y) \cos(z) \] + !! \[ v = -\cos(x) \sin(y) \cos(z) \] + !! \[ w = 0 \] + !! + !! This satisfies incompressibility (\( \nabla \cdot \mathbf{u} = 0 \)) exactly and is periodic + !! in all three directions. + !! + !! **Domain:** + !! Typically \( [0, 2\pi]^3 \) with periodic boundary conditions in all directions. + !! + !! **Validation Metrics:** + !! - Kinetic energy decay rate + !! - Enstrophy evolution + !! - Dissipation rate + !! - Vorticity dynamics + !! + !! **Reference:** + !! Taylor, G. I., & Green, A. E. (1937). Mechanism of the production of + !! small eddies from large ones. Proc. R. Soc. Lond. A, 158(895), 499-521. use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -12,12 +43,13 @@ module m_case_tgv implicit none type, extends(base_case_t) :: case_tgv_t + !! Taylor-Green vortex case (no additional state needed beyond base). contains - procedure :: boundary_conditions => boundary_conditions_tgv - procedure :: initial_conditions => initial_conditions_tgv - procedure :: forcings => forcings_tgv - procedure :: pre_correction => pre_correction_tgv - procedure :: postprocess => postprocess_tgv + procedure :: boundary_conditions => boundary_conditions_tgv !! 
No action (periodic BCs) + procedure :: initial_conditions => initial_conditions_tgv !! Set TGV velocity field + procedure :: forcings => forcings_tgv !! No forcing + procedure :: pre_correction => pre_correction_tgv !! No correction + procedure :: postprocess => postprocess_tgv !! Compute diagnostics end type case_tgv_t interface case_tgv_t @@ -27,21 +59,27 @@ module m_case_tgv contains function case_tgv_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise Taylor-Green vortex case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_tgv_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_tgv_t) :: flow_case !! Initialised TGV case call flow_case%case_init(backend, mesh, host_allocator) end function case_tgv_init subroutine initial_conditions_tgv(self) + !! Set Taylor-Green vortex initial velocity field. + !! + !! Initialises the three velocity components according to the TGV + !! analytical solution. The field is exactly divergence-free and + !! periodic, making it ideal for testing solver accuracy. implicit none - class(case_tgv_t) :: self + class(case_tgv_t) :: self !! TGV case instance call self%set_init(self%solver%u, u_func) call self%set_init(self%solver%v, v_func) @@ -54,19 +92,25 @@ subroutine initial_conditions_tgv(self) end subroutine initial_conditions_tgv pure function u_func(coords) result(r) + !! Compute x-velocity component of TGV at given coordinates. + !! + !! \[ u = \sin(x) \cos(y) \cos(z) \] implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Position [x, y, z] + real(dp) :: r !! 
Velocity component u r = sin(coords(1))*cos(coords(2))*cos(coords(3)) end function u_func pure function v_func(coords) result(r) + !! Compute y-velocity component of TGV at given coordinates. + !! + !! \[ v = -\cos(x) \sin(y) \cos(z) \] implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Position [x, y, z] + real(dp) :: r !! Velocity component v r = -cos(coords(1))*sin(coords(2))*cos(coords(3)) end function v_func From df987c3437be8b41b4d4e969f9929c4ea4c1b221 Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 12:12:04 +0000 Subject: [PATCH 06/12] docs: update IO documentation --- src/io/adios2/io.f90 | 134 +++++++++++-------- src/io/checkpoint_manager.f90 | 236 +++++++++++++++++++++++----------- src/io/dummy/io.f90 | 95 ++++++++------ src/io/io_base.f90 | 125 ++++++++++-------- src/io/io_manager.f90 | 106 +++++++++++---- src/io/snapshot_manager.f90 | 218 +++++++++++++++++++++---------- 6 files changed, 597 insertions(+), 317 deletions(-) diff --git a/src/io/adios2/io.f90 b/src/io/adios2/io.f90 index 26976b98d..d8b95c86a 100644 --- a/src/io/adios2/io.f90 +++ b/src/io/adios2/io.f90 @@ -1,24 +1,37 @@ module m_io_backend -!! @brief Provides ADIOS2-specific implementation of the I/O backend interface -!! -!! @details This module contains the concrete backend implementation for ADIOS2 -!! (ADaptive Input Output System v2) library. It acts as a translation layer -!! converting generic I/O calls from the session interface into specific calls -!! to the ADIOS2 API. -!! -!! The `adios2_reader_t` and `adios2_writer_t` types defined here extend the -!! abstract base types from `m_io_base` and implement required procedures -!! -!! This backend leverages several key features of the underlying ADIOS2 library -!! - engine abstraction - the same API can be used for different transport -!! methods (e.g. BP4, BP5, HDF5) -!! - Asynchronous I/O - by default ADIOS2 uses a deferred transport mode -!! 
which can improve performance by overlapping computation and I/O -!! - MPI integration - it is designed for large-scale paralle I/O and -!! integrates with MPI, though serial operation is also supported -!! -!! @note This is an internal backend module and should never be used directly. -!! All user interaction must go through `m_io_session`. + !! ADIOS2-specific implementation of the I/O backend interface. + !! + !! This module provides the concrete backend implementation for ADIOS2 + !! (Adaptable Input Output System v2), a high-performance parallel I/O + !! library. It acts as a translation layer converting generic I/O calls + !! from the session interface into specific ADIOS2 API calls. + !! + !! **Architecture:** + !! - Extends abstract base types from `m_io_base` + !! - Implements all required I/O procedures (init, open, read, write, etc.) + !! - Manages ADIOS2-specific objects (adios, io, engine) + !! - Handles step-based I/O for time-series data + !! + !! **ADIOS2 Features Leveraged:** + !! - **Engine Abstraction**: Same API for different formats (BP4, BP5, HDF5) + !! - **Asynchronous I/O**: Deferred transport mode overlaps computation and I/O + !! - **MPI Integration**: Designed for large-scale parallel I/O + !! - **Variable/Attribute Management**: Efficient metadata handling + !! - **Hyperslab Selection**: Parallel distributed array I/O + !! + !! **Type Hierarchy:** + !! ``` + !! io_base (abstract) + !! |-- io_reader_t (abstract) + !! | |-- io_adios2_reader_t (concrete) + !! |-- io_writer_t (abstract) + !! | |-- io_adios2_writer_t (concrete) + !! |-- io_file_t (abstract) + !! |-- io_adios2_file_t (concrete) + !! ``` + !! + !! **Note:** This is an internal backend module and should never be used + !! directly. All user interaction must go through `m_io_session`. 
use adios2, only: adios2_adios, adios2_io, adios2_engine, & adios2_variable, adios2_attribute, & adios2_mode_sync, adios2_mode_write, & @@ -45,56 +58,71 @@ module m_io_backend public :: allocate_io_reader, allocate_io_writer public :: get_default_backend, IO_BACKEND_DUMMY, IO_BACKEND_ADIOS2 - integer, parameter :: IO_BACKEND_DUMMY = 0 - integer, parameter :: IO_BACKEND_ADIOS2 = 1 + integer, parameter :: IO_BACKEND_DUMMY = 0 !! Dummy backend identifier + integer, parameter :: IO_BACKEND_ADIOS2 = 1 !! ADIOS2 backend identifier type, extends(io_reader_t) :: io_adios2_reader_t + !! ADIOS2 reader implementation for reading data from files. + !! + !! Manages ADIOS2 objects required for reading operations including + !! the global ADIOS handler, I/O object, and tracks step state for + !! time-series data reading. private type(adios2_adios) :: adios !! ADIOS2 global handler - type(adios2_io) :: io_handle !! ADIOS2 IO object for managing I/O - logical :: is_step_active = .false. !! Flag to track if a step is active - integer :: comm = MPI_COMM_NULL !! MPI communicator + type(adios2_io) :: io_handle !! ADIOS2 I/O object for managing variables + logical :: is_step_active = .false. !! Flag tracking if a step is active + integer :: comm = MPI_COMM_NULL !! MPI communicator for parallel I/O contains - procedure :: init => reader_init_adios2 - procedure :: open => reader_open_adios2 - procedure :: read_data_i8 => read_data_i8_adios2 - procedure :: read_data_integer => read_data_integer_adios2 - procedure :: read_data_real => read_data_real_adios2 - procedure :: read_data_array_3d => read_data_array_3d_adios2 - procedure :: finalise => finalise_reader_adios2 - procedure, private :: handle_error => handle_error_reader + procedure :: init => reader_init_adios2 !! Initialise reader + procedure :: open => reader_open_adios2 !! Open file for reading + procedure :: read_data_i8 => read_data_i8_adios2 !! Read 64-bit integer + procedure :: read_data_integer => read_data_integer_adios2 !! 
Read default integer + procedure :: read_data_real => read_data_real_adios2 !! Read double precision real + procedure :: read_data_array_3d => read_data_array_3d_adios2 !! Read 3D array with hyperslab + procedure :: finalise => finalise_reader_adios2 !! Finalise and clean up + procedure, private :: handle_error => handle_error_reader !! Error handling (internal) end type io_adios2_reader_t type, extends(io_writer_t) :: io_adios2_writer_t + !! ADIOS2 writer implementation for writing data to files. + !! + !! Manages ADIOS2 objects required for writing operations including + !! the global ADIOS handler, I/O object, and tracks step state for + !! time-series data writing. private type(adios2_adios) :: adios !! ADIOS2 global handler - type(adios2_io) :: io_handle !! ADIOS2 IO object for managing I/O - logical :: is_step_active = .false. !! Flag to track if a step is active - integer :: comm = MPI_COMM_NULL !! MPI communicator + type(adios2_io) :: io_handle !! ADIOS2 I/O object for managing variables + logical :: is_step_active = .false. !! Flag tracking if a step is active + integer :: comm = MPI_COMM_NULL !! MPI communicator for parallel I/O contains - procedure :: init => writer_init_adios2 - procedure :: open => writer_open_adios2 - procedure :: write_data_i8 => write_data_i8_adios2 - procedure :: write_data_integer => write_data_integer_adios2 - procedure :: write_data_real => write_data_real_adios2 - procedure :: write_data_array_3d => write_data_array_3d_adios2 - procedure :: write_attribute_string => write_attribute_string_adios2 + procedure :: init => writer_init_adios2 !! Initialise writer + procedure :: open => writer_open_adios2 !! Open file for writing + procedure :: write_data_i8 => write_data_i8_adios2 !! Write 64-bit integer + procedure :: write_data_integer => write_data_integer_adios2 !! Write default integer + procedure :: write_data_real => write_data_real_adios2 !! 
Write double precision real + procedure :: write_data_array_3d => write_data_array_3d_adios2 !! Write 3D array with hyperslab + procedure :: write_attribute_string => write_attribute_string_adios2 !! Write string attribute procedure :: write_attribute_array_1d_real => & - write_attribute_array_1d_real_adios2 - procedure :: finalise => finalise_writer_adios2 - procedure, private :: handle_error => handle_error_writer + write_attribute_array_1d_real_adios2 !! Write 1D real array attribute + procedure :: finalise => finalise_writer_adios2 !! Finalise and clean up + procedure, private :: handle_error => handle_error_writer !! Error handling (internal) end type io_adios2_writer_t type, extends(io_file_t) :: io_adios2_file_t + !! ADIOS2 file handle for open file operations. + !! + !! Wraps the ADIOS2 engine object and manages step-based I/O for + !! time-series data. Tracks whether file is opened for reading or + !! writing and current step state. private - type(adios2_engine) :: engine !! ADIOS2 engine for data reading/writing - logical :: is_step_active = .false. !! Flag to track if a step is active - logical :: is_writer = .false. !! Flag to track if this is for writing + type(adios2_engine) :: engine !! ADIOS2 engine for data transport + logical :: is_step_active = .false. !! Flag tracking if a step is active + logical :: is_writer = .false. !! True if file opened for writing contains - procedure :: close => file_close_adios2 - procedure :: begin_step => file_begin_step_adios2 - procedure :: end_step => file_end_step_adios2 - procedure, private :: handle_error => handle_error_file + procedure :: close => file_close_adios2 !! Close file and engine + procedure :: begin_step => file_begin_step_adios2 !! Begin new I/O step + procedure :: end_step => file_end_step_adios2 !! End current I/O step + procedure, private :: handle_error => handle_error_file !! 
Error handling (internal) end type io_adios2_file_t contains diff --git a/src/io/checkpoint_manager.f90 b/src/io/checkpoint_manager.f90 index 8bd2ed97f..0d4831be3 100644 --- a/src/io/checkpoint_manager.f90 +++ b/src/io/checkpoint_manager.f90 @@ -1,20 +1,31 @@ module m_checkpoint_manager -! @brief Manages the creation and restoration of simulation checkpoints -!! for restart capabilities. -!! -!! @details This module is responsible for periodically saving the full, unstrided -!! simulation state to a file. This allows a simulation to be stopped and resumed -!! from the exact state it was in. -!! -!! Key features include: -!! - Reading all checkpoint settings from a configuration file -!! - Periodically writing the full-resolution simulation state -!! - Handling the full logic for restarting a simulation from -!! a specified checkpoint file. -!! - A safe-write strategy that writes to a temporary file first, -!! then atomically renames it to the final filename to -!! prevent corrupted checkpoints. -!! - Optional cleanup of old checkpoint files to conserve disk space. + !! Manages creation and restoration of simulation checkpoints for restart. + !! + !! This module is responsible for periodically saving the full simulation + !! state to checkpoint files and restoring from them for restarts. This + !! allows simulations to be stopped and resumed from the exact state. + !! + !! **Key Features:** + !! - Configuration via namelist (checkpoint frequency, prefix, etc.) + !! - Periodic writing of full-resolution simulation state + !! - Complete restart logic from specified checkpoint file + !! - Safe-write strategy: temporary file then atomic rename + !! - Optional cleanup of old checkpoints to conserve disk space + !! - Stores velocity fields (u, v, w), timestep, and simulation time + !! + !! **Safe-Write Strategy:** + !! To prevent corrupted checkpoints from crashes during write: + !! 1. Write to temporary file (e.g., checkpoint_0001000.tmp.bp) + !! 2. 
Atomic rename to final name (checkpoint_0001000.bp) + !! 3. Optionally delete previous checkpoint if keep_checkpoint=false + !! + !! **Configuration:** + !! Controlled via `checkpoint_config_t` read from input namelist: + !! - checkpoint_freq: write interval (iterations) + !! - keep_checkpoint: retain all checkpoints vs overwrite old ones + !! - checkpoint_prefix: filename prefix + !! - restart_from_checkpoint: enable restart + !! - restart_file: checkpoint file to restart from use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank, MPI_Abort use m_common, only: dp, i8, DIR_X, get_argument use m_field, only: field_t @@ -30,38 +41,48 @@ module m_checkpoint_manager implicit none type :: raw_old_field_buffer_t - real(dp), allocatable :: data(:, :, :) + !! Temporary buffer for field data (used internally). + real(dp), allocatable :: data(:, :, :) !! 3D array storage end type raw_old_field_buffer_t private public :: checkpoint_manager_t type :: checkpoint_manager_t - type(checkpoint_config_t) :: config - integer :: last_checkpoint_step = -1 - integer, dimension(3) :: full_resolution = [1, 1, 1] - type(field_buffer_map_t), allocatable :: field_buffers(:) - integer(i8), dimension(3) :: last_shape_dims = 0 - integer, dimension(3) :: last_stride_factors = 0 - integer(i8), dimension(3) :: last_output_shape = 0 + !! Manager for checkpoint file operations (writing and reading). + !! + !! Handles all aspects of checkpoint I/O including periodic writes + !! during simulation and restoration during restart. Maintains state + !! needed for consistent checkpoint operations across multiple writes. + type(checkpoint_config_t) :: config !! Checkpoint configuration settings + integer :: last_checkpoint_step = -1 !! Timestep of last checkpoint written + integer, dimension(3) :: full_resolution = [1, 1, 1] !! Global domain resolution [nx, ny, nz] + type(field_buffer_map_t), allocatable :: field_buffers(:) !! Buffers for field data I/O + integer(i8), dimension(3) :: last_shape_dims = 0 !! 
Shape dimensions from last write + integer, dimension(3) :: last_stride_factors = 0 !! Stride factors from last write + integer(i8), dimension(3) :: last_output_shape = 0 !! Output shape from last write contains - procedure :: init - procedure :: handle_restart - procedure :: handle_checkpoint_step - procedure :: is_restart - procedure :: finalise - procedure, private :: write_checkpoint - procedure, private :: restart_checkpoint - procedure, private :: write_fields - procedure, private :: cleanup_output_buffers + procedure :: init !! Initialise checkpoint manager + procedure :: handle_restart !! Restore from checkpoint file + procedure :: handle_checkpoint_step !! Write checkpoint if needed at timestep + procedure :: is_restart !! Check if this is a restart run + procedure :: finalise !! Clean up and finalise + procedure, private :: write_checkpoint !! Write checkpoint file (internal) + procedure, private :: restart_checkpoint !! Read checkpoint file (internal) + procedure, private :: write_fields !! Write field data to file (internal) + procedure, private :: cleanup_output_buffers !! Free output buffers (internal) end type checkpoint_manager_t contains subroutine init(self, comm) - !! Initialise checkpoint manager - class(checkpoint_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise checkpoint manager from configuration. + !! + !! Reads checkpoint settings from input namelist and configures + !! output if checkpoint frequency is positive. Prints checkpoint + !! settings on root process. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + integer, intent(in) :: comm !! MPI communicator self%config = checkpoint_config_t() call self%config%read(nml_file=get_argument(1)) @@ -72,10 +93,13 @@ subroutine init(self, comm) end subroutine init subroutine configure_output(self, comm) - !! Configure checkpoint output settings + !! Configure and print checkpoint output settings. + !! + !! 
Displays checkpoint configuration on root process including + !! frequency, retention policy, and file prefix. use m_io_backend, only: get_default_backend, IO_BACKEND_DUMMY - class(checkpoint_manager_t), intent(inout) :: self - integer, intent(in) :: comm + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + integer, intent(in) :: comm !! MPI communicator integer :: myrank, ierr @@ -89,18 +113,25 @@ subroutine configure_output(self, comm) end subroutine configure_output function is_restart(self) result(restart) - !! Check if this is a restart run - class(checkpoint_manager_t), intent(in) :: self - logical :: restart + !! Check if this is a restart run. + !! + !! Queries configuration to determine if simulation should restart + !! from an existing checkpoint file. + class(checkpoint_manager_t), intent(in) :: self !! Checkpoint manager instance + logical :: restart !! True if restarting from checkpoint restart = self%config%restart_from_checkpoint end function is_restart subroutine handle_restart(self, solver, comm) - !! Handle restart from checkpoint - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - integer, intent(in), optional :: comm + !! Restore solver state from checkpoint file. + !! + !! Reads velocity fields, timestep, and time from the checkpoint file + !! specified in configuration. Updates solver's current iteration counter. + !! Prints restart information on root process. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(inout) :: solver !! Solver to restore state into + integer, intent(in), optional :: comm !! MPI communicator (optional) character(len=256) :: restart_file integer :: restart_timestep @@ -123,11 +154,15 @@ subroutine handle_restart(self, solver, comm) end subroutine handle_restart subroutine handle_checkpoint_step(self, solver, timestep, comm) - !! 
Handle checkpoint writing at a given timestep - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Write checkpoint if frequency condition is met. + !! + !! Checks if current timestep is a checkpoint interval (divisible by + !! checkpoint_freq) and writes checkpoint file if so. Called each + !! timestep from main simulation loop. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) integer :: comm_to_use @@ -138,11 +173,26 @@ subroutine handle_checkpoint_step(self, solver, timestep, comm) end subroutine handle_checkpoint_step subroutine write_checkpoint(self, solver, timestep, comm) - !! Write a checkpoint file for simulation restart - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in) :: comm + !! Write checkpoint file using safe-write strategy (internal). + !! + !! Implements the checkpoint writing logic with atomic file operations + !! to prevent corruption. The procedure: + !! 1. Check if checkpoint is due (frequency condition) + !! 2. Write to temporary file (_temp.bp) + !! 3. Write metadata (timestep, time, dt, data location) + !! 4. Write velocity fields (u, v, w) via write_fields + !! 5. Write time integrator state (AB scheme coefficients if applicable) + !! 6. Close temporary file + !! 7. Atomic rename: temp file to final name + !! 8. Optionally delete previous checkpoint if keep_checkpoint=false + !! + !! **Safe-Write Strategy:** Writing to a temporary file and then renaming + !! ensures that if a crash occurs during write, the previous valid + !! checkpoint remains intact. 
+ class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(in) :: solver !! Solver with state to save + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in) :: comm !! MPI communicator character(len=256) :: filename, temp_filename, old_filename integer :: ierr, myrank @@ -307,13 +357,29 @@ end subroutine write_checkpoint subroutine restart_checkpoint( & self, solver, filename, timestep, restart_time, comm & ) - !! Restart simulation state from checkpoint file - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - character(len=*), intent(in) :: filename - integer, intent(out) :: timestep - real(dp), intent(out) :: restart_time - integer, intent(in) :: comm + !! Restore simulation state from checkpoint file (internal). + !! + !! Reads all data from checkpoint file and restores solver state: + !! 1. Verify checkpoint file exists (abort if missing) + !! 2. Open checkpoint file for reading + !! 3. Read metadata (timestep, time, dt, data location) + !! 4. Read time integrator state (AB coefficients, order, step counters) + !! 5. Read velocity fields (u, v, w) with correct dimensions + !! 6. Restore time integrator state including history (olds arrays) + !! 7. Set solver data location to match checkpoint + !! + !! **Data Location:** Checkpoint records whether fields were stored at + !! vertices (VERT) or cell centers (CELL), and restoration preserves this. + !! + !! **Time Integrator State:** For Adams-Bashforth schemes, restores the + !! history of old field values (du_olds, dv_olds, dw_olds) needed for + !! multi-step time integration. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(inout) :: solver !! Solver to restore state into + character(len=*), intent(in) :: filename !! Checkpoint file path + integer, intent(out) :: timestep !! 
Timestep from checkpoint + real(dp), intent(out) :: restart_time !! Simulation time from checkpoint + integer, intent(in) :: comm !! MPI communicator type(reader_session_t) :: reader_session integer :: ierr, myrank, data_loc @@ -456,13 +522,28 @@ end subroutine restart_checkpoint subroutine write_fields( & self, field_names, host_fields, solver, writer_session, data_loc & ) - !! Write field data for checkpoints (no striding) - class(checkpoint_manager_t), intent(inout) :: self - character(len=*), dimension(:), intent(in) :: field_names - class(field_ptr_t), dimension(:), target, intent(in) :: host_fields - class(solver_t), intent(in) :: solver - type(writer_session_t), intent(inout) :: writer_session - integer, intent(in) :: data_loc + !! Write velocity field data to checkpoint file (internal). + !! + !! Writes field data at full resolution (no striding for checkpoints). + !! The procedure: + !! 1. Prepare field buffers for full resolution output + !! 2. Calculate output dimensions and hyperslab selection + !! 3. For each field (u, v, w): + !! - Copy field data to output buffer + !! - Write buffer to file with proper hyperslab parameters + !! + !! **Full Resolution:** Unlike snapshots (which can be strided), + !! checkpoints always write full-resolution data to enable exact restart. + !! + !! **Parallel I/O:** Each MPI rank writes its local subdomain using + !! hyperslab selection (output_start, output_count) to assemble the + !! global field in the file. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + character(len=*), dimension(:), intent(in) :: field_names !! Field names ["u", "v", "w"] + class(field_ptr_t), dimension(:), target, intent(in) :: host_fields !! Field pointers + class(solver_t), intent(in) :: solver !! Solver containing mesh info + type(writer_session_t), intent(inout) :: writer_session !! I/O writer session + integer, intent(in) :: data_loc !! 
Data location (VERT or CELL) integer :: i_field integer(i8), dimension(3) :: output_start, output_count @@ -505,15 +586,22 @@ subroutine write_fields( & end subroutine write_fields subroutine cleanup_output_buffers(self) - !! Clean up dynamic field buffers - class(checkpoint_manager_t), intent(inout) :: self + !! Clean up dynamically allocated field buffers (internal). + !! + !! Frees memory allocated for field I/O buffers. Called during + !! finalisation to prevent memory leaks. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance call cleanup_field_buffers(self%field_buffers) end subroutine cleanup_output_buffers subroutine finalise(self) - !! Clean up checkpoint manager - class(checkpoint_manager_t), intent(inout) :: self + !! Finalise checkpoint manager and free resources. + !! + !! Cleans up all dynamically allocated buffers. Should be called + !! at the end of simulation or when checkpoint manager is no longer + !! needed. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance call self%cleanup_output_buffers() end subroutine finalise diff --git a/src/io/dummy/io.f90 b/src/io/dummy/io.f90 index 69dbb3c73..b75bbc325 100644 --- a/src/io/dummy/io.f90 +++ b/src/io/dummy/io.f90 @@ -1,20 +1,28 @@ module m_io_backend -!! @brief Provides a dummy, non-functional I/O backend for when an I/O backend -!! is not available -!! -!! @details This module provides a fallback implementation of the I/O backend -!! interface. It is used when no real I/O backend (e.g. ADIOS2) is enabled at -!! compile time. -!! -!! The primary purpose of this dummy backend is to allow the full program to -!! compile and link against the session interface (`m_io_session`) without -!! requiring a functional I/O library. -!! -!! @warning This is a non-functional stub. Calling any of its I/O procedures -!! will immediately terminate the program with an error message. -!! -!! 
@note If you require file I/O, you must recompile the code with a functional -!! backend + !! Dummy (non-functional) I/O backend for when no real backend is available. + !! + !! This module provides a fallback implementation of the I/O backend + !! interface used when no real I/O backend (e.g., ADIOS2) is enabled at + !! compile time. It allows the code to compile and link without a functional + !! I/O library. + !! + !! **Purpose:** + !! - Enables compilation without external I/O library dependencies + !! - Provides informative error messages when I/O operations are attempted + !! - Allows code structure to remain consistent regardless of I/O backend + !! + !! **Behaviour:** + !! - Write operations are silently ignored (no-op) + !! - Read operations terminate with error message directing user to recompile + !! - File open/close operations are tracked but perform no actual I/O + !! + !! **Use Cases:** + !! - Testing/debugging without I/O overhead + !! - Systems where ADIOS2 is unavailable + !! - Dry runs to validate simulation setup + !! + !! **Warning:** This is a non-functional stub. If you require actual file I/O, + !! recompile with `-DWITH_ADIOS2=ON` to enable the ADIOS2 backend. use iso_fortran_env, only: stderr => error_unit use m_io_base, only: io_reader_t, io_writer_t, io_file_t, io_mode_read, & io_mode_write @@ -26,45 +34,48 @@ module m_io_backend public :: allocate_io_reader, allocate_io_writer public :: get_default_backend, IO_BACKEND_DUMMY, IO_BACKEND_ADIOS2 - logical, save :: write_warning_shown = .false. + logical, save :: write_warning_shown = .false. !! Track if warning has been displayed - integer, parameter :: IO_BACKEND_DUMMY = 0 - integer, parameter :: IO_BACKEND_ADIOS2 = 1 + integer, parameter :: IO_BACKEND_DUMMY = 0 !! Dummy backend identifier + integer, parameter :: IO_BACKEND_ADIOS2 = 1 !! ADIOS2 backend identifier type, extends(io_file_t) :: io_dummy_file_t - logical :: is_open = .false. + !! 
Dummy file handle (tracks state but performs no I/O). + logical :: is_open = .false. !! File open state flag contains - procedure :: close => file_close_dummy - procedure :: begin_step => file_begin_step_dummy - procedure :: end_step => file_end_step_dummy - procedure :: is_file_functional => is_file_functional_dummy + procedure :: close => file_close_dummy !! Close file (no-op) + procedure :: begin_step => file_begin_step_dummy !! Begin step (no-op) + procedure :: end_step => file_end_step_dummy !! End step (no-op) + procedure :: is_file_functional => is_file_functional_dummy !! Check if functional end type io_dummy_file_t type, extends(io_reader_t) :: io_dummy_reader_t - logical :: initialised = .false. + !! Dummy reader (errors on read attempts). + logical :: initialised = .false. !! Initialisation state flag contains - procedure :: init => reader_init_dummy - procedure :: open => reader_open_dummy - procedure :: finalise => reader_finalise_dummy - procedure :: read_data_i8 => read_data_i8_dummy - procedure :: read_data_integer => read_data_integer_dummy - procedure :: read_data_real => read_data_real_dummy - procedure :: read_data_array_3d => read_data_array_3d_dummy + procedure :: init => reader_init_dummy !! Initialise reader + procedure :: open => reader_open_dummy !! Open file (returns non-functional handle) + procedure :: finalise => reader_finalise_dummy !! Finalise (no-op) + procedure :: read_data_i8 => read_data_i8_dummy !! Read i8 (errors) + procedure :: read_data_integer => read_data_integer_dummy !! Read integer (errors) + procedure :: read_data_real => read_data_real_dummy !! Read real (errors) + procedure :: read_data_array_3d => read_data_array_3d_dummy !! Read 3D array (errors) end type io_dummy_reader_t type, extends(io_writer_t) :: io_dummy_writer_t - logical :: initialised = .false. + !! Dummy writer (silently ignores write operations). + logical :: initialised = .false. !! 
Initialisation state flag contains - procedure :: init => writer_init_dummy - procedure :: open => writer_open_dummy - procedure :: finalise => writer_finalise_dummy - procedure :: write_data_i8 => write_data_i8_dummy - procedure :: write_data_integer => write_data_integer_dummy - procedure :: write_data_real => write_data_real_dummy - procedure :: write_data_array_3d => write_data_array_3d_dummy - procedure :: write_attribute_string => write_attribute_string_dummy + procedure :: init => writer_init_dummy !! Initialise writer + procedure :: open => writer_open_dummy !! Open file (returns non-functional handle) + procedure :: finalise => writer_finalise_dummy !! Finalise (no-op) + procedure :: write_data_i8 => write_data_i8_dummy !! Write i8 (no-op) + procedure :: write_data_integer => write_data_integer_dummy !! Write integer (no-op) + procedure :: write_data_real => write_data_real_dummy !! Write real (no-op) + procedure :: write_data_array_3d => write_data_array_3d_dummy !! Write 3D array (no-op) + procedure :: write_attribute_string => write_attribute_string_dummy !! Write string attribute (no-op) procedure :: write_attribute_array_1d_real => & - write_attribute_array_1d_real_dummy + write_attribute_array_1d_real_dummy !! Write 1D real array attribute (no-op) end type io_dummy_writer_t contains diff --git a/src/io/io_base.f90 b/src/io/io_base.f90 index c6860409f..84537be06 100644 --- a/src/io/io_base.f90 +++ b/src/io/io_base.f90 @@ -1,32 +1,31 @@ module m_io_base -!! @brief Provides the abstract base types and interfaces for the session-based -!! I/O architecture. -!! -!! @details This internal module defines the fundamental building blocks of -!! the I/O system. It establishes a polymorphic layer that allows the -!! high-level user session to interact with various I/O backends through a -!! consistent interface. -!! -!! The architecture is designed in distinct layers: -!! User code -!! - interacts only with the Session layer -!! -!! 
Session layer (`m_io_session`) -!! - manages all I/O complexity (file handles, state, etc.) -!! - instantiates the I/O backend selected at compile-time -!! - provides `reader_session_t` and `writer_session_t` for users -!! -!! Backend layer (`m_io_backend`) -!! - concrete implementation of an I/O backed (e.g., ADIOS2) -!! - extends the abstract base types defined in this module -!! -!! Base layer (`m_io_base`, this module) -!! - provides abstract `reader_base_t` and `writer_base_t` types -!! - enforces a consistent interface for all backends -!! -!! @note This is an internal module and should not be used directly by users. -!! The sole public interface for I/O is the high-level session API provided in -!! `m_io_session`. + !! Abstract base types and interfaces for session-based I/O architecture. + !! + !! This internal module defines the fundamental building blocks of the I/O + !! system. It establishes a polymorphic layer that allows the high-level + !! user session to interact with various I/O backends (e.g., ADIOS2, dummy) + !! through a consistent interface. + !! + !! **Architecture Layers:** + !! + !! 1. **User Code** - interacts only with the Session layer + !! + !! 2. **Session Layer** (`m_io_session`) + !! - Manages all I/O complexity (file handles, state, etc.) + !! - Instantiates the I/O backend selected at compile-time + !! - Provides `reader_session_t` and `writer_session_t` for users + !! + !! 3. **Backend Layer** (`m_io_backend`) + !! - Concrete implementation of an I/O backend (e.g., ADIOS2) + !! - Extends the abstract base types defined in this module + !! + !! 4. **Base Layer** (`m_io_base`, this module) + !! - Provides abstract `io_reader_t` and `io_writer_t` types + !! - Enforces a consistent interface for all backends + !! + !! **Note:** This is an internal module and should not be used directly by + !! users. The sole public interface for I/O is the high-level session API + !! provided in `m_io_session`. 
use m_common, only: dp, i8 @@ -36,50 +35,64 @@ module m_io_base public :: io_reader_t, io_writer_t, io_file_t public :: io_mode_read, io_mode_write - integer, parameter :: io_mode_read = 1 - integer, parameter :: io_mode_write = 2 + integer, parameter :: io_mode_read = 1 !! Read mode flag for opening files + integer, parameter :: io_mode_write = 2 !! Write mode flag for opening files - !> Base file handle for I/O operations type :: io_file_t + !! Base file handle for I/O operations. + !! + !! This abstract type represents an open file handle. Concrete backends + !! extend this type to implement backend-specific file operations. + !! Provides step-based I/O for time-series data. contains - procedure :: close => base_close - procedure :: begin_step => base_begin_step - procedure :: end_step => base_end_step - procedure :: is_file_functional => base_is_file_functional + procedure :: close => base_close !! Close the file + procedure :: begin_step => base_begin_step !! Begin a new I/O step + procedure :: end_step => base_end_step !! End current I/O step + procedure :: is_file_functional => base_is_file_functional !! Check if file is operational end type io_file_t - !> Base I/O reader type for polymorphic usage type :: io_reader_t + !! Base I/O reader type for polymorphic usage. + !! + !! This abstract type provides the interface for reading data from files. + !! Concrete backends (e.g., ADIOS2) extend this type to implement + !! backend-specific reading operations. Supports reading scalars and + !! 3D arrays with optional hyperslab selection. contains - procedure :: init => base_reader_init - procedure :: open => base_reader_open - procedure :: finalise => base_reader_finalise + procedure :: init => base_reader_init !! Initialise reader + procedure :: open => base_reader_open !! Open file for reading + procedure :: finalise => base_reader_finalise !! Finalise and clean up ! 
Generic interfaces for session usage generic :: read_data => read_data_i8, read_data_integer, read_data_real, & - read_data_array_3d - procedure :: read_data_i8 - procedure :: read_data_integer - procedure :: read_data_real - procedure :: read_data_array_3d + read_data_array_3d !! Read data (generic interface) + procedure :: read_data_i8 !! Read 64-bit integer + procedure :: read_data_integer !! Read default integer + procedure :: read_data_real !! Read double precision real + procedure :: read_data_array_3d !! Read 3D array end type io_reader_t - !> Base I/O writer type for polymorphic usage type :: io_writer_t + !! Base I/O writer type for polymorphic usage. + !! + !! This abstract type provides the interface for writing data to files. + !! Concrete backends (e.g., ADIOS2) extend this type to implement + !! backend-specific writing operations. Supports writing scalars, + !! 3D arrays, and attributes. contains - procedure :: init => base_writer_init - procedure :: open => base_writer_open - procedure :: finalise => base_writer_finalise + procedure :: init => base_writer_init !! Initialise writer + procedure :: open => base_writer_open !! Open file for writing + procedure :: finalise => base_writer_finalise !! Finalise and clean up generic :: write_data => write_data_i8, write_data_integer, & write_data_real, & - write_data_array_3d - procedure :: write_data_i8 - procedure :: write_data_integer - procedure :: write_data_real - procedure :: write_data_array_3d + write_data_array_3d !! Write data (generic interface) + procedure :: write_data_i8 !! Write 64-bit integer + procedure :: write_data_integer !! Write default integer + procedure :: write_data_real !! Write double precision real + procedure :: write_data_array_3d !! Write 3D array generic :: write_attribute => write_attribute_string, & - write_attribute_array_1d_real - procedure :: write_attribute_string - procedure :: write_attribute_array_1d_real + write_attribute_array_1d_real !! 
Write attribute (generic interface) + procedure :: write_attribute_string !! Write string attribute + procedure :: write_attribute_array_1d_real !! Write 1D real array attribute end type io_writer_t contains diff --git a/src/io/io_manager.f90 b/src/io/io_manager.f90 index a9b50a72f..1093482a7 100644 --- a/src/io/io_manager.f90 +++ b/src/io/io_manager.f90 @@ -1,12 +1,27 @@ module m_io_manager -!! @brief Provides a high-level manager that orchestrates all checkpoint and -!! snapshot operations. -!! -!! @details This module acts as a facade to the I/O subsystem. -!! Its purpose is to simplify the main simulation loop by providing -!! a single point of contact for all I/O-related actions. The mainprogram only -!! needs to interact with the `io_manager_t` type, which then delegates tasks -!! to the specialised checkpoint and snapshot managers. + !! High-level manager orchestrating checkpoint and snapshot operations. + !! + !! This module acts as a facade to the I/O subsystem, simplifying the main + !! simulation loop by providing a single point of contact for all I/O-related + !! actions. The main program only needs to interact with `io_manager_t`, which + !! delegates tasks to specialised checkpoint and snapshot managers. + !! + !! **Responsibilities:** + !! - Initialise checkpoint and snapshot managers + !! - Coordinate restart from checkpoints + !! - Orchestrate periodic checkpoint and snapshot writes + !! - Finalise I/O operations and clean up resources + !! + !! **Usage Pattern:** + !! ```fortran + !! type(io_manager_t) :: io_mgr + !! call io_mgr%init(comm) + !! if (io_mgr%is_restart()) call io_mgr%handle_restart(solver, comm) + !! do timestep = 1, n_steps + !! call io_mgr%handle_io_step(solver, timestep, comm) + !! end do + !! call io_mgr%finalise() + !! 
``` use m_checkpoint_manager, only: checkpoint_manager_t use m_snapshot_manager, only: snapshot_manager_t use m_solver, only: solver_t @@ -17,53 +32,90 @@ module m_io_manager public :: io_manager_t type :: io_manager_t - type(checkpoint_manager_t) :: checkpoint_mgr - type(snapshot_manager_t) :: snapshot_mgr + !! Unified manager for checkpoint and snapshot operations. + !! + !! Contains both checkpoint and snapshot managers and provides + !! a simplified interface for the main simulation loop. + type(checkpoint_manager_t) :: checkpoint_mgr !! Manages restart and checkpoint files + type(snapshot_manager_t) :: snapshot_mgr !! Manages visualisation output files contains - procedure :: init => io_init - procedure :: handle_restart => io_handle_restart - procedure :: handle_io_step => io_handle_step - procedure :: finalise => io_finalise - procedure :: is_restart => io_is_restart + procedure :: init => io_init !! Initialise I/O managers + procedure :: handle_restart => io_handle_restart !! Load restart data if needed + procedure :: handle_io_step => io_handle_step !! Process checkpoints/snapshots for timestep + procedure :: finalise => io_finalise !! Finalise and clean up + procedure :: is_restart => io_is_restart !! Check if simulation is restarting end type io_manager_t contains subroutine io_init(self, comm) - class(io_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise checkpoint and snapshot managers. + !! + !! Sets up both managers by passing the MPI communicator. Each manager + !! reads its configuration and prepares for I/O operations. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + integer, intent(in) :: comm !! 
MPI communicator call self%checkpoint_mgr%init(comm) call self%snapshot_mgr%init(comm) end subroutine io_init subroutine io_handle_restart(self, solver, comm) - class(io_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - integer, intent(in), optional :: comm + !! Handle restart by loading checkpoint data. + !! + !! Delegates to the checkpoint manager to load solver state from + !! the most recent checkpoint file. Should only be called if + !! `is_restart()` returns true. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + class(solver_t), intent(inout) :: solver !! Solver to load state into + integer, intent(in), optional :: comm !! MPI communicator (optional) call self%checkpoint_mgr%handle_restart(solver, comm) end subroutine io_handle_restart subroutine io_handle_step(self, solver, timestep, comm) - class(io_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Handle I/O operations for current timestep. + !! + !! Checks if checkpoint or snapshot output is required at this timestep + !! and writes data accordingly. Typically called at the end of each + !! timestep in the main simulation loop. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) call self%checkpoint_mgr%handle_checkpoint_step(solver, timestep, comm) call self%snapshot_mgr%handle_snapshot_step(solver, timestep, comm) end subroutine io_handle_step function io_is_restart(self) result(is_restart) - class(io_manager_t), intent(in) :: self - logical :: is_restart + !! Check if simulation is restarting from checkpoint. + !! + !! Queries the checkpoint manager to determine if a restart file + !! 
exists and should be loaded. + implicit none + + class(io_manager_t), intent(in) :: self !! I/O manager instance + logical :: is_restart !! True if restarting from checkpoint is_restart = self%checkpoint_mgr%is_restart() end function io_is_restart subroutine io_finalise(self) - class(io_manager_t), intent(inout) :: self + !! Finalise I/O operations and clean up resources. + !! + !! Closes any open files and releases resources held by both + !! checkpoint and snapshot managers. Should be called at the end + !! of the simulation. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance call self%checkpoint_mgr%finalise() call self%snapshot_mgr%finalise() diff --git a/src/io/snapshot_manager.f90 b/src/io/snapshot_manager.f90 index 68ddfa841..ee2c2f8ae 100644 --- a/src/io/snapshot_manager.f90 +++ b/src/io/snapshot_manager.f90 @@ -1,11 +1,29 @@ module m_snapshot_manager -!! @brief Manages the creation of simulation snapshots for post-processing -!! and visualisation. -!! -!! @details This module is responsible for periodically writing simulation -!! data to files intended for analysis and visualisation -!! Unlike checkpoints, which are always full-resolution for exact restarts, -!! snapshots can be strided to reduce file size. + !! Manages creation of simulation snapshots for post-processing and visualisation. + !! + !! This module periodically writes simulation data to files intended for + !! analysis and visualisation. Unlike checkpoints (full-resolution for exact + !! restarts), snapshots can be strided to reduce file size while retaining + !! sufficient resolution for visualisation. + !! + !! **Key Differences from Checkpoints:** + !! - **Purpose**: Visualisation/analysis vs exact restart + !! - **Resolution**: Can be strided (e.g., every 2nd point) vs full resolution + !! - **Frequency**: Typically more frequent than checkpoints + !! - **File Management**: Single persistent file with multiple timesteps vs + !! 
separate files per checkpoint + !! + !! **Features:** + !! - Configurable spatial striding to reduce output size + !! - Persistent file handle (stays open across multiple writes) + !! - Generates VTK-compatible XML for ParaView visualisation + !! - Writes velocity fields at each snapshot interval + !! + !! **Configuration:** + !! Controlled via `checkpoint_config_t` read from input namelist: + !! - snapshot_freq: write interval (iterations) + !! - snapshot_prefix: filename prefix + !! - output_stride: spatial stride factors [sx, sy, sz] use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank use m_common, only: dp, i8, DIR_C, VERT, get_argument use m_field, only: field_t @@ -24,34 +42,43 @@ module m_snapshot_manager public :: snapshot_manager_t type :: snapshot_manager_t - type(checkpoint_config_t) :: config - integer, dimension(3) :: output_stride = [1, 1, 1] - type(field_buffer_map_t), allocatable :: field_buffers(:) - integer(i8), dimension(3) :: last_shape_dims = 0 - integer, dimension(3) :: last_stride_factors = 0 - integer(i8), dimension(3) :: last_output_shape = 0 - character(len=4096) :: vtk_xml = "" - logical :: is_snapshot_file_open = .false. - type(writer_session_t) :: snapshot_writer - logical :: convert_to_sp = .false. !! Flag for single precision snapshots + !! Manager for snapshot file operations (periodic visualisation output). + !! + !! Handles periodic writing of visualisation data with optional striding. + !! Maintains a persistent file handle that stays open across multiple + !! snapshot writes for efficient I/O. + type(checkpoint_config_t) :: config !! Configuration settings + integer, dimension(3) :: output_stride = [1, 1, 1] !! Spatial stride factors [sx, sy, sz] + type(field_buffer_map_t), allocatable :: field_buffers(:) !! Buffers for field data I/O + integer(i8), dimension(3) :: last_shape_dims = 0 !! Shape dimensions from last write + integer, dimension(3) :: last_stride_factors = 0 !! 
Stride factors from last write + integer(i8), dimension(3) :: last_output_shape = 0 !! Output shape from last write + character(len=4096) :: vtk_xml = "" !! VTK XML metadata for ParaView + logical :: is_snapshot_file_open = .false. !! File handle state flag + type(writer_session_t) :: snapshot_writer !! I/O session writer + logical :: convert_to_sp = .false. !! Flag for single precision snapshots contains - procedure :: init - procedure :: handle_snapshot_step - procedure :: finalise - procedure, private :: write_snapshot - procedure, private :: write_fields - procedure, private :: cleanup_output_buffers - procedure, private :: generate_vtk_xml - procedure, private :: open_snapshot_file - procedure, private :: close_snapshot_file + procedure :: init !! Initialise snapshot manager + procedure :: handle_snapshot_step !! Write snapshot if needed at timestep + procedure :: finalise !! Clean up and finalise + procedure, private :: write_snapshot !! Write snapshot file (internal) + procedure, private :: write_fields !! Write field data to file (internal) + procedure, private :: cleanup_output_buffers !! Free output buffers (internal) + procedure, private :: generate_vtk_xml !! Generate VTK XML metadata (internal) + procedure, private :: open_snapshot_file !! Open snapshot file (internal) + procedure, private :: close_snapshot_file !! Close snapshot file (internal) end type snapshot_manager_t contains subroutine init(self, comm) - !! Initialise snapshot manager - class(snapshot_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise snapshot manager from configuration. + !! + !! Reads snapshot settings from input namelist and configures + !! output if snapshot frequency is positive. Prints snapshot + !! settings including stride factors on root process. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer, intent(in) :: comm !! 
MPI communicator self%config = checkpoint_config_t() call self%config%read(nml_file=get_argument(1)) @@ -62,10 +89,13 @@ subroutine init(self, comm) end subroutine init subroutine configure_output(self, comm) - !! Configure snapshot output settings + !! Configure and print snapshot output settings. + !! + !! Displays snapshot configuration on root process including + !! frequency, file prefix, and output stride factors. use m_io_backend, only: get_default_backend, IO_BACKEND_DUMMY - class(snapshot_manager_t), intent(inout) :: self - integer, intent(in) :: comm + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer, intent(in) :: comm !! MPI communicator integer :: myrank, ierr @@ -84,11 +114,15 @@ subroutine configure_output(self, comm) end subroutine configure_output subroutine handle_snapshot_step(self, solver, timestep, comm) - !! Handle snapshot writing at a given timestep - class(snapshot_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Write snapshot if frequency condition is met. + !! + !! Checks if current timestep is a snapshot interval (divisible by + !! snapshot_freq) and writes snapshot if so. Called each timestep + !! from main simulation loop. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) integer :: comm_to_use @@ -99,13 +133,15 @@ subroutine handle_snapshot_step(self, solver, timestep, comm) end subroutine handle_snapshot_step subroutine write_snapshot(self, solver, timestep, comm) - !! Write a snapshot file for visualisation - !! Uses a persistent file that stays open across multiple snapshots - !! 
Each snapshot is written as a separate timestep in the file - class(snapshot_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in) :: comm + !! Write a snapshot file for visualisation. + !! + !! Uses a persistent file that stays open across multiple snapshots. + !! Each snapshot is written as a separate timestep within the file. + !! Data can be strided according to output_stride configuration. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + class(solver_t), intent(in) :: solver !! Solver containing field data + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in) :: comm !! MPI communicator character(len=*), parameter :: field_names(*) = ["u", "v", "w"] integer :: myrank, ierr @@ -179,11 +215,26 @@ subroutine write_snapshot(self, solver, timestep, comm) end subroutine write_snapshot subroutine generate_vtk_xml(self, dims, fields, origin, spacing) - !! Generate VTK XML string for ImageData format for ParaView's ADIOS2VTXReader - class(snapshot_manager_t), intent(inout) :: self - integer(i8), dimension(3), intent(in) :: dims - character(len=*), dimension(:), intent(in) :: fields - real(dp), dimension(3), intent(in) :: origin, spacing + !! Generate VTK XML metadata for ParaView visualization (internal). + !! + !! Creates VTK ImageData XML string that describes the structured grid + !! for ParaView's ADIOS2VTXReader. This enables direct visualization of + !! ADIOS2 files in ParaView without conversion. + !! + !! **VTK ImageData Format:** + !! - Defines structured rectilinear grid with uniform spacing + !! - Extent: grid dimensions from 0 to N-1 in (z,y,x) order + !! - Origin: physical coordinates of first grid point + !! - Spacing: grid resolution (dx, dy, dz) + !! - Point data: velocity fields (u, v, w) stored at grid points + !! + !! **Note:** VTK uses (x,y,z) order while X3D2 uses (z,y,x) internally, + !! 
requiring dimension reordering in the extent string. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer(i8), dimension(3), intent(in) :: dims !! Grid dimensions [nx, ny, nz] + character(len=*), dimension(:), intent(in) :: fields !! Field names ["u", "v", "w"] + real(dp), dimension(3), intent(in) :: origin !! Grid origin [x0, y0, z0] + real(dp), dimension(3), intent(in) :: spacing !! Grid spacing [dx, dy, dz] character(len=4096) :: xml character(len=96) :: extent_str, origin_str, spacing_str @@ -223,13 +274,28 @@ end subroutine generate_vtk_xml subroutine write_fields( & self, field_names, host_fields, solver, writer_session, data_loc & ) - !! Write field data with striding for snapshots - class(snapshot_manager_t), intent(inout) :: self - character(len=*), dimension(:), intent(in) :: field_names - class(field_ptr_t), dimension(:), target, intent(in) :: host_fields - class(solver_t), intent(in) :: solver - type(writer_session_t), intent(inout) :: writer_session - integer, intent(in) :: data_loc + !! Write field data with optional striding for snapshots (internal). + !! + !! Writes field data with spatial striding to reduce file size while + !! maintaining sufficient resolution for visualization. The procedure: + !! 1. Prepare field buffers with configured stride factors + !! 2. Calculate strided output dimensions and hyperslab selection + !! 3. For each field (u, v, w): + !! - Copy strided field data to output buffer + !! - Write buffer to file with proper hyperslab parameters + !! + !! **Striding:** Unlike checkpoints (full resolution), snapshots can + !! subsample data. For example, stride [2,2,2] writes every 2nd point + !! in each direction, reducing file size by factor of 8. + !! + !! **Parallel I/O:** Each MPI rank writes its strided local subdomain + !! using hyperslab selection to assemble the strided global field. + class(snapshot_manager_t), intent(inout) :: self !! 
Snapshot manager instance + character(len=*), dimension(:), intent(in) :: field_names !! Field names ["u", "v", "w"] + class(field_ptr_t), dimension(:), target, intent(in) :: host_fields !! Field pointers + class(solver_t), intent(in) :: solver !! Solver containing mesh info + type(writer_session_t), intent(inout) :: writer_session !! I/O writer session + integer, intent(in) :: data_loc !! Data location (VERT or CELL) integer :: i_field integer(i8), dimension(3) :: output_start, output_count @@ -272,26 +338,48 @@ subroutine write_fields( & end subroutine write_fields subroutine cleanup_output_buffers(self) - !! Clean up dynamic field buffers - class(snapshot_manager_t), intent(inout) :: self + !! Clean up dynamically allocated field buffers (internal). + !! + !! Frees memory allocated for field I/O buffers. Called during + !! finalisation to prevent memory leaks. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance call cleanup_field_buffers(self%field_buffers) end subroutine cleanup_output_buffers subroutine finalise(self) - !! Clean up snapshot manager - class(snapshot_manager_t), intent(inout) :: self + !! Finalise snapshot manager and free resources. + !! + !! Cleans up all dynamically allocated buffers and closes the + !! persistent snapshot file. Should be called at the end of + !! simulation or when snapshot manager is no longer needed. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance call self%cleanup_output_buffers() call self%close_snapshot_file() end subroutine finalise subroutine open_snapshot_file(self, filename, comm) - !! Open a persistent snapshot file - !! ADIOS2 handles both creating new files and appending to existing ones - class(snapshot_manager_t), intent(inout) :: self - character(len=*), intent(in) :: filename - integer, intent(in) :: comm + !! Open persistent snapshot file for appending timesteps (internal). + !! + !! 
Opens or creates a snapshot file that remains open across multiple + !! snapshot writes. Each snapshot is written as a new timestep within + !! the same file, enabling efficient time-series visualization. + !! + !! **Persistent File Strategy:** + !! - File opened once at first snapshot + !! - Remains open for subsequent snapshots (append mode) + !! - Each write adds a new timestep to the file + !! - Closed only during finalisation + !! + !! **Benefits:** Reduces file open/close overhead and keeps all snapshots + !! in a single file for easy ParaView animation. + !! + !! **ADIOS2 Behaviour:** Automatically handles both creating new files + !! and appending to existing ones based on file existence. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + character(len=*), intent(in) :: filename !! Snapshot file path + integer, intent(in) :: comm !! MPI communicator logical :: file_exists integer :: myrank, ierr From 9fbe9b1bf9cfdd1f389c6aac9a4f7b48dcecd819 Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 12:48:40 +0000 Subject: [PATCH 07/12] docs: add comments to omp backend --- src/backend/omp/backend.f90 | 564 +++++++++++++----- src/backend/omp/common.f90 | 11 +- src/backend/omp/exec_dist.f90 | 82 ++- src/backend/omp/exec_thom.f90 | 33 +- src/backend/omp/kernels/distributed.f90 | 45 +- .../omp/kernels/spectral_processing.f90 | 74 ++- src/backend/omp/kernels/thomas.f90 | 76 ++- src/backend/omp/poisson_fft.f90 | 109 +++- src/backend/omp/sendrecv.f90 | 36 +- 9 files changed, 784 insertions(+), 246 deletions(-) diff --git a/src/backend/omp/backend.f90 b/src/backend/omp/backend.f90 index 370108a5a..9afbeec41 100644 --- a/src/backend/omp/backend.f90 +++ b/src/backend/omp/backend.f90 @@ -1,4 +1,35 @@ module m_omp_backend + !! OpenMP/CPU backend implementation for X3D2 solver operations. + !! + !! This module provides the CPU-based backend using OpenMP for shared-memory + !! parallelism and MPI for distributed-memory parallelism. 
It implements all + !! abstract backend operations defined in `base_backend_t`. + !! + !! **Parallelisation Strategy:** + !! - **MPI**: Domain decomposition across nodes/processes + !! - **OpenMP**: Thread parallelism within each MPI rank + !! - **Hybrid MPI+OpenMP**: Enables efficient use of multi-core clusters + !! + !! **Key Features:** + !! - Compact finite difference operators (tridiagonal solves) + !! - Halo exchange for distributed derivatives + !! - FFT-based Poisson solver integration + !! - Vectorised array operations + !! - Optimised data reordering between decomposition directions + !! + !! **Memory Management:** + !! - Send/receive buffers for MPI halo exchange (u, v, w, du, dud, d2u) + !! - Buffers sized based on largest decomposition direction + !! - Persistent buffers to avoid repeated allocation + !! + !! **Solver Operations:** + !! - transeq: Transport equation terms with halo exchange + !! - tds_solve: Tridiagonal system solves (Thomas algorithm) + !! - reorder: Data layout transformations (DIR_X, DIR_Y, DIR_Z) + !! - Field operations: copy, add, multiply, integrate, etc. + !! + !! **Note:** This backend requires 2DECOMP&FFT library for FFT operations + !! when using the spectral Poisson solver. use mpi use m_allocator, only: allocator_t @@ -20,37 +51,53 @@ module m_omp_backend private :: transeq_halo_exchange, transeq_dist_component type, extends(base_backend_t) :: omp_backend_t - !character(len=*), parameter :: name = 'omp' + !! OpenMP/CPU backend for solver operations. + !! + !! Extends `base_backend_t` with CPU-specific implementations using + !! OpenMP for threading and MPI for distributed parallelism. Maintains + !! communication buffers for halo exchange operations. + !! + !! **Communication Buffers:** + !! Arrays sized (SZ, n_halo, n_groups) where: + !! - SZ: maximum pencil size for data reordering + !! - n_halo: halo region depth (typically 4 for compact schemes) + !! - n_groups: maximum number of groups across all directions + !! + !! 
Buffer naming convention: {field}_{send/recv}_{s/e} + !! - field: u, v, w (velocity), du, dud, d2u (derivatives) + !! - send/recv: data direction + !! - s/e: start/end of domain (neighbouring ranks) + !character(len=*), parameter :: name = 'omp' !! Backend identifier real(dp), allocatable, dimension(:, :, :) :: & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - v_recv_s, v_recv_e, v_send_s, v_send_e, & - w_recv_s, w_recv_e, w_send_s, w_send_e, & - du_send_s, du_send_e, du_recv_s, du_recv_e, & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + u_recv_s, u_recv_e, u_send_s, u_send_e, & !! Velocity u halo buffers + v_recv_s, v_recv_e, v_send_s, v_send_e, & !! Velocity v halo buffers + w_recv_s, w_recv_e, w_send_s, w_send_e, & !! Velocity w halo buffers + du_send_s, du_send_e, du_recv_s, du_recv_e, & !! First derivative buffers + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & !! Mixed derivative buffers + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e !! 
Second derivative buffers contains - procedure :: alloc_tdsops => alloc_omp_tdsops - procedure :: transeq_x => transeq_x_omp - procedure :: transeq_y => transeq_y_omp - procedure :: transeq_z => transeq_z_omp - procedure :: transeq_species => transeq_species_omp - procedure :: tds_solve => tds_solve_omp - procedure :: reorder => reorder_omp - procedure :: sum_yintox => sum_yintox_omp - procedure :: sum_zintox => sum_zintox_omp - procedure :: veccopy => veccopy_omp - procedure :: vecadd => vecadd_omp - procedure :: vecmult => vecmult_omp - procedure :: scalar_product => scalar_product_omp - procedure :: field_max_mean => field_max_mean_omp - procedure :: field_scale => field_scale_omp - procedure :: field_shift => field_shift_omp - procedure :: field_set_face => field_set_face_omp - procedure :: field_volume_integral => field_volume_integral_omp - procedure :: copy_data_to_f => copy_data_to_f_omp - procedure :: copy_f_to_data => copy_f_to_data_omp - procedure :: init_poisson_fft => init_omp_poisson_fft - procedure :: transeq_omp_dist + procedure :: alloc_tdsops => alloc_omp_tdsops !! Allocate tridiagonal operators + procedure :: transeq_x => transeq_x_omp !! Transport equation in X + procedure :: transeq_y => transeq_y_omp !! Transport equation in Y + procedure :: transeq_z => transeq_z_omp !! Transport equation in Z + procedure :: transeq_species => transeq_species_omp !! Transport for species/scalars + procedure :: tds_solve => tds_solve_omp !! Tridiagonal solve + procedure :: reorder => reorder_omp !! Data reordering + procedure :: sum_yintox => sum_yintox_omp !! Sum Y data into X + procedure :: sum_zintox => sum_zintox_omp !! Sum Z data into X + procedure :: veccopy => veccopy_omp !! Vector copy + procedure :: vecadd => vecadd_omp !! Vector add + procedure :: vecmult => vecmult_omp !! Vector multiply + procedure :: scalar_product => scalar_product_omp !! Scalar product + procedure :: field_max_mean => field_max_mean_omp !! 
Compute max and mean + procedure :: field_scale => field_scale_omp !! Scale field + procedure :: field_shift => field_shift_omp !! Shift field values + procedure :: field_set_face => field_set_face_omp !! Set face values + procedure :: field_volume_integral => field_volume_integral_omp !! Volume integral + procedure :: copy_data_to_f => copy_data_to_f_omp !! Copy data to field + procedure :: copy_f_to_data => copy_f_to_data_omp !! Copy field to data + procedure :: init_poisson_fft => init_omp_poisson_fft !! Initialise FFT Poisson + procedure :: transeq_omp_dist !! Distributed transeq (internal) end type omp_backend_t interface omp_backend_t @@ -60,11 +107,21 @@ module m_omp_backend contains function init(mesh, allocator) result(backend) + !! Initialise OpenMP backend with mesh and allocator. + !! + !! Sets up the backend by: + !! 1. Calling base initialisation + !! 2. Linking mesh and allocator + !! 3. Determining maximum number of groups across directions + !! 4. Allocating communication buffers for halo exchange + !! + !! **Buffer Sizing:** Buffers are sized based on the largest decomposition + !! direction to handle all reordering operations efficiently. implicit none - type(mesh_t), target, intent(inout) :: mesh - class(allocator_t), target, intent(inout) :: allocator - type(omp_backend_t) :: backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + class(allocator_t), target, intent(inout) :: allocator !! Memory allocator + type(omp_backend_t) :: backend !! Initialised backend instance integer :: n_groups @@ -113,19 +170,24 @@ subroutine alloc_omp_tdsops( & self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) + !! Allocate and initialise tridiagonal operator for OMP backend. + !! + !! Creates a `tdsops_t` object configured for the specified operation + !! (derivative, interpolation) with chosen compact scheme and boundary + !! conditions. 
Handles grid stretching and viscous corrections. implicit none - class(omp_backend_t) :: self - class(tdsops_t), allocatable, intent(inout) :: tdsops - integer, intent(in) :: n_tds - real(dp), intent(in) :: delta - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end - real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) - integer, optional, intent(in) :: n_halo - character(*), optional, intent(in) :: from_to - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(omp_backend_t) :: self !! Backend instance + class(tdsops_t), allocatable, intent(inout) :: tdsops !! Tridiagonal operator to allocate + integer, intent(in) :: n_tds !! Number of points in direction + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation, scheme !! Operation type and scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary condition codes + real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) !! Grid stretching + integer, optional, intent(in) :: n_halo !! Halo depth + character(*), optional, intent(in) :: from_to !! Data location transition + logical, optional, intent(in) :: sym !! Symmetry flag + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Viscous correction parameters allocate (tdsops_t :: tdsops) @@ -139,26 +201,40 @@ subroutine alloc_omp_tdsops( & end subroutine alloc_omp_tdsops subroutine transeq_x_omp(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation RHS in X direction. + !! + !! Evaluates convection and diffusion terms for momentum equations: + !! \( du/dt = -u \cdot \nabla u + \nu \nabla^2 u \) + !! + !! Delegates to `transeq_omp_dist` which handles halo exchange and + !! distributed compact schemes. 
implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du, dv, dw !! Output: velocity RHS + class(field_t), intent(in) :: u, v, w !! Input: velocity fields + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators call self%transeq_omp_dist(du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_x_omp subroutine transeq_y_omp(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation RHS in Y direction. + !! + !! Calculates convective and viscous terms for Y-pencil decomposition. + !! Velocity components are reordered (v, u, w) to align primary + !! direction with pencil orientation before calling distributed kernel. + !! + !! See [[transeq_x_omp]] for transport equation formulation. implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output) + class(field_t), intent(in) :: u, v, w !! Velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Spectral operators ! u, v, w is reordered so that we pass v, u, w call self%transeq_omp_dist(dv, du, dw, v, u, w, nu, dirps) @@ -166,13 +242,20 @@ subroutine transeq_y_omp(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_y_omp subroutine transeq_z_omp(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation RHS in Z direction. + !! + !! Calculates convective and viscous terms for Z-pencil decomposition. + !! Velocity components are reordered (w, u, v) to align primary + !! 
direction with pencil orientation before calling distributed kernel. + !! + !! See [[transeq_x_omp]] for transport equation formulation. implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output) + class(field_t), intent(in) :: u, v, w !! Velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Spectral operators ! u, v, w is reordered so that we pass w, u, v call self%transeq_omp_dist(dw, du, dv, w, u, v, nu, dirps) @@ -180,18 +263,24 @@ subroutine transeq_z_omp(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_z_omp subroutine transeq_species_omp(self, dspec, uvw, spec, nu, dirps, sync) - !! Compute the convection and diffusion for the given field - !! in the given direction. - !! Halo exchange for the given field is necessary - !! When sync is true, halo exchange of momentum is necessary + !! Compute transport equation RHS for scalar species. + !! + !! Calculates convective and diffusive terms for a passive scalar + !! (temperature, concentration, etc.) transported by velocity field. + !! + !! **Equation:** `$\partial\phi/\partial t = -\mathbf{u}\cdot\nabla\phi + \nu\nabla^2\phi$` where $\phi$ is the scalar species. + !! + !! **Synchronisation:** When `sync=.true.`, performs halo exchange + !! for velocity field before computation. Always exchanges scalar halos. implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: dspec - class(field_t), intent(in) :: uvw, spec - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - logical, intent(in) :: sync + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: dspec !! 
Time derivative of species (output) + class(field_t), intent(in) :: uvw !! Velocity component in pencil direction + class(field_t), intent(in) :: spec !! Species concentration/temperature + real(dp), intent(in) :: nu !! Diffusivity coefficient + type(dirps_t), intent(in) :: dirps !! Spectral operators + logical, intent(in) :: sync !! Perform velocity halo exchange if true integer :: n_groups @@ -229,13 +318,21 @@ subroutine transeq_species_omp(self, dspec, uvw, spec, nu, dirps, sync) end subroutine transeq_species_omp subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, nu, dirps) + !! Internal: Distributed transport equation implementation. + !! + !! Orchestrates the complete transport equation calculation for + !! all velocity components. First performs halo exchange for + !! distributed compact derivatives, then computes each component's + !! RHS using transeq_dist_component. + !! + !! **Called by:** transeq_x/y/z_omp after velocity reordering implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output) + class(field_t), intent(in) :: u, v, w !! Velocity components (reordered for pencil direction) + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Spectral operators call transeq_halo_exchange(self, u, v, w, dirps%dir) @@ -258,9 +355,17 @@ subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_omp_dist subroutine transeq_halo_exchange(self, u, v, w, dir) - class(omp_backend_t) :: self - class(field_t), intent(in) :: u, v, w - integer, intent(in) :: dir + !! Internal: Perform halo exchange for all velocity components. + !! + !! Exchanges 4-point halos between neighbouring MPI processes for + !! 
distributed compact finite difference stencils. Copies boundary + !! data into send buffers, performs MPI sendrecv, stores in receive + !! buffers for use in derivative calculations. + !! + !! **Operation:** Copy to buffers $\rightarrow$ MPI_Sendrecv $\rightarrow$ Store halos + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: u, v, w !! Velocity components + integer, intent(in) :: dir !! Communication direction integer :: n, nproc_dir, pprev, pnext integer :: n_groups @@ -296,20 +401,27 @@ subroutine transeq_dist_component(self, rhs_du, u, conv, nu, & u_recv_s, u_recv_e, & conv_recv_s, conv_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, dir) - !! Computes RHS_x^u following: + !! Internal: Compute single component of transport equation RHS. + !! + !! Calculates RHS for one velocity component using skew-symmetric form: + !! + !! **Formula:** `rhs = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2` + !! + !! Uses distributed compact FD kernels with halo data from neighbours. + !! Allocates temporary storage for derivatives and releases after use. !! - !! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2 - class(omp_backend_t) :: self + !! **Skew-symmetric:** Reduces aliasing errors in nonlinear convection. + class(omp_backend_t) :: self !! Backend instance !> The result field, it is also used as temporary storage - class(field_t), intent(inout) :: rhs_du - class(field_t), intent(in) :: u, conv - real(dp), intent(in) :: nu + class(field_t), intent(inout) :: rhs_du !! RHS output (also temp storage) + class(field_t), intent(in) :: u, conv !! Velocity component and convecting velocity + real(dp), intent(in) :: nu !! Kinematic viscosity real(dp), dimension(:, :, :), intent(in) :: u_recv_s, u_recv_e, & - conv_recv_s, conv_recv_e - class(tdsops_t), intent(in) :: tdsops_du - class(tdsops_t), intent(in) :: tdsops_dud - class(tdsops_t), intent(in) :: tdsops_d2u - integer, intent(in) :: dir + conv_recv_s, conv_recv_e !! 
Halo data from neighbours + class(tdsops_t), intent(in) :: tdsops_du !! First derivative operator + class(tdsops_t), intent(in) :: tdsops_dud !! Product derivative operator + class(tdsops_t), intent(in) :: tdsops_d2u !! Second derivative operator + integer, intent(in) :: dir !! Direction index class(field_t), pointer :: d2u, dud dud => self%allocator%get_block(dir) @@ -334,12 +446,20 @@ subroutine transeq_dist_component(self, rhs_du, u, conv, nu, & end subroutine transeq_dist_component subroutine tds_solve_omp(self, du, u, tdsops) + !! Solve tridiagonal system for compact finite difference operation. + !! + !! Applies compact scheme operator to field using Thomas algorithm. + !! Handles both local (single-process) and distributed (multi-process) + !! solves depending on decomposition configuration. + !! + !! **Data Location:** Updates output data location based on operator's + !! `move` specification (e.g., CELL to VERT for interpolation). implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du !! Output field + class(field_t), intent(in) :: u !! Input field + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator ! Check if direction matches for both in/out fields if (u%dir /= du%dir) then @@ -355,12 +475,22 @@ subroutine tds_solve_omp(self, du, u, tdsops) end subroutine tds_solve_omp subroutine tds_solve_dist(self, du, u, tdsops) + !! Internal: Distributed tridiagonal solve with halo exchange. + !! + !! Solves compact finite difference system across multiple MPI processes. + !! Performs halo exchange before calling distributed Thomas algorithm + !! kernel. Used when domain decomposition splits the pencil direction. + !! + !! **Algorithm:** + !! 1. Copy boundary data into send buffers + !! 2. MPI_Sendrecv for halo exchange + !! 3. 
Distributed Thomas algorithm with boundary coupling implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du !! Solution field (output) + class(field_t), intent(in) :: u !! RHS field + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator integer :: n_groups, dir dir = u%dir @@ -387,12 +517,24 @@ subroutine tds_solve_dist(self, du, u, tdsops) end subroutine tds_solve_dist subroutine reorder_omp(self, u_, u, direction) + !! Reorder field data between different pencil decompositions. + !! + !! Transforms field layout from one decomposition direction to another + !! (e.g., X-pencils to Y-pencils). Uses MPI All-to-All communication + !! to redistribute data across processes. + !! + !! **Directions:** DIR_X, DIR_Y, DIR_Z specify pencil orientations. + !! Each pencil is contiguous along its direction and distributed in + !! the other two dimensions. + !! + !! **Performance:** Critical operation for multi-dimensional algorithms. + !! Uses `get_index_reordering` for efficient cache-friendly reordering. implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - integer, intent(in) :: direction + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: u_ !! Output field (reordered) + class(field_t), intent(in) :: u !! Input field + integer, intent(in) :: direction !! Reordering direction code integer, dimension(3) :: dims, cart_padded integer :: i, j, k integer :: out_i, out_j, out_k @@ -420,33 +562,50 @@ subroutine reorder_omp(self, u_, u, direction) end subroutine reorder_omp subroutine sum_yintox_omp(self, u, u_) + !! Sum Y-pencils into X-pencils through reordering. + !! + !! Performs directional reduction by reordering from Y to X pencils + !! 
and summing the result into the destination field. Useful for
+    !! integrating quantities along the Y direction.
     implicit none
-    class(omp_backend_t) :: self
-    class(field_t), intent(inout) :: u
-    class(field_t), intent(in) :: u_
+    class(omp_backend_t) :: self !! Backend instance
+    class(field_t), intent(inout) :: u !! Destination field (X-pencils, accumulates result)
+    class(field_t), intent(in) :: u_ !! Source field (Y-pencils)
 
     call sum_intox_omp(self, u, u_, DIR_Y)
 
   end subroutine sum_yintox_omp
 
   subroutine sum_zintox_omp(self, u, u_)
+    !! Sum Z-pencils into X-pencils through reordering.
+    !!
+    !! Performs directional reduction by reordering from Z to X pencils
+    !! and summing the result into the destination field. Useful for
+    !! integrating quantities along the Z direction.
     implicit none
-    class(omp_backend_t) :: self
-    class(field_t), intent(inout) :: u
-    class(field_t), intent(in) :: u_
+    class(omp_backend_t) :: self !! Backend instance
+    class(field_t), intent(inout) :: u !! Destination field (X-pencils, accumulates result)
+    class(field_t), intent(in) :: u_ !! Source field (Z-pencils)
 
     call sum_intox_omp(self, u, u_, DIR_Z)
 
   end subroutine sum_zintox_omp
 
   subroutine sum_intox_omp(self, u, u_, dir_to)
+    !! Internal helper: Sum reordered field into X-pencils.
+    !!
+    !! Reorders the source field from the specified direction into
+    !! X-pencils, then accumulates into the destination field. Called
+    !! by sum_yintox_omp and sum_zintox_omp for directional integration.
+    !!
+    !! **Algorithm:** Reorder with index mapping, accumulate with +=
 
-    class(omp_backend_t) :: self
-    class(field_t), intent(inout) :: u
-    class(field_t), intent(in) :: u_
-    integer, intent(in) :: dir_to
+    class(omp_backend_t) :: self !! Backend instance
+    class(field_t), intent(inout) :: u !! Destination field (accumulates result)
+    class(field_t), intent(in) :: u_ !! Source field
+    integer, intent(in) :: dir_to !! 
Target direction (DIR_Y or DIR_Z) integer :: dir_from integer, dimension(3) :: dims, cart_padded @@ -473,12 +632,16 @@ subroutine sum_intox_omp(self, u, u_, dir_to) end subroutine sum_intox_omp subroutine veccopy_omp(self, dst, src) + !! Copy field data from source to destination. + !! + !! Element-wise copy with OpenMP parallelisation. Both fields + !! must have the same decomposition direction and dimensions. implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: dst - class(field_t), intent(in) :: src - integer :: i, j, k + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: dst !! Destination field + class(field_t), intent(in) :: src !! Source field + integer :: i, j, k !! Loop indices if (src%dir /= dst%dir) then error stop "Called vector copy with incompatible fields" @@ -501,13 +664,19 @@ subroutine veccopy_omp(self, dst, src) end subroutine veccopy_omp subroutine vecadd_omp(self, a, x, b, y) + !! Vector addition: y = a*x + b*y (in-place AXPBY). + !! + !! Scaled in-place vector addition with OpenMP parallelisation + !! and SIMD vectorisation. Implements the BLAS AXPBY operation. + !! + !! **Formula:** `y := a*x + b*y` where a, b are scalars. implicit none - class(omp_backend_t) :: self - real(dp), intent(in) :: a - class(field_t), intent(in) :: x - real(dp), intent(in) :: b - class(field_t), intent(inout) :: y + class(omp_backend_t) :: self !! Backend instance + real(dp), intent(in) :: a !! Scalar multiplier for x + class(field_t), intent(in) :: x !! First input field + real(dp), intent(in) :: b !! Scalar multiplier for y + class(field_t), intent(inout) :: y !! Second input field (overwritten with result) integer :: i, j, k if (x%dir /= y%dir) then @@ -531,13 +700,18 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp subroutine vecmult_omp(self, y, x) + !! Element-wise multiplication: y = y * x (in-place). + !! + !! In-place element-wise multiplication with OpenMP parallelisation + !! 
and SIMD vectorisation. Often used for applying masks or + !! multiplying solution components. !! [[m_base_backend(module):vecmult(interface)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: y - class(field_t), intent(in) :: x - integer :: i, j, k + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: y !! Field to multiply and store result + class(field_t), intent(in) :: x !! Multiplier field + integer :: i, j, k !! Loop indices if (x%dir /= y%dir) then error stop "Called vector multiply with incompatible fields" @@ -560,16 +734,24 @@ subroutine vecmult_omp(self, y, x) end subroutine vecmult_omp real(dp) function scalar_product_omp(self, x, y) result(s) + !! Compute global scalar product (dot product) of two fields. + !! + !! Calculates the dot product $\sum(x_i \times y_i)$ across all grid points + !! and all MPI processes. Uses OpenMP parallelisation with reduction + !! and MPI_Allreduce for global sum. + !! + !! **Algorithm:** Local parallel reduction $\rightarrow$ MPI_Allreduce + !! **Data location:** Both fields must be at the same location (CELL/VERT). !! [[m_base_backend(module):scalar_product(interface)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: x, y - class(field_t), pointer :: x_, y_ - integer, dimension(3) :: dims - integer :: i, j, k, ii - integer :: nvec, remstart - integer :: ierr + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: x, y !! Input fields + class(field_t), pointer :: x_, y_ !! Pointers for data access + integer, dimension(3) :: dims !! Field dimensions + integer :: i, j, k, ii !! Loop indices + integer :: nvec, remstart !! Vectorisation variables + integer :: ierr !! MPI error code if ((x%data_loc == NULL_LOC) .or. 
(y%data_loc == NULL_LOC)) then error stop "You must set the data_loc before calling scalar product" @@ -623,14 +805,22 @@ real(dp) function scalar_product_omp(self, x, y) result(s) end function scalar_product_omp subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_groups) + !! Internal helper: Copy halo data into send buffers. + !! + !! Extracts 4-point halos from start and end of domain for + !! MPI communication. Used in transeq_halo_exchange to prepare + !! boundary data for neighbour processes. + !! + !! **Buffer layout:** (SZ, 4, n_groups) for cache efficiency implicit none - real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e - real(dp), dimension(:, :, :), intent(in) :: u - integer, intent(in) :: n - integer, intent(in) :: n_groups - integer :: i, j, k - integer :: n_halo = 4 + real(dp), dimension(:, :, :), intent(out) :: u_send_s !! Send buffer for start boundary + real(dp), dimension(:, :, :), intent(out) :: u_send_e !! Send buffer for end boundary + real(dp), dimension(:, :, :), intent(in) :: u !! Field data + integer, intent(in) :: n !! Domain size in communication direction + integer, intent(in) :: n_groups !! Number of pencil groups + integer :: i, j, k !! Loop indices + integer :: n_halo = 4 !! Halo width (compact scheme stencil) !$omp parallel do do k = 1, n_groups @@ -648,13 +838,25 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_groups) end subroutine copy_into_buffers subroutine field_max_mean_omp(self, max_val, mean_val, f, enforced_data_loc) + !! Compute global maximum and mean of a field. + !! + !! Calculates maximum and mean values across all grid points and + !! MPI processes. Uses data location (CELL/VERT) to determine + !! valid domain extents, excluding padding and ghost cells. + !! + !! **Algorithm:** + !! 1. Local parallel max/sum reduction with OpenMP + !! 2. MPI_Allreduce for global max/sum + !! 3. Mean = global_sum / global_count + !! + !! **Data location:** Can be enforced or read from field metadata. !! 
[[m_base_backend(module):field_max_mean(interface)]] implicit none - class(omp_backend_t) :: self - real(dp), intent(out) :: max_val, mean_val - class(field_t), intent(in) :: f - integer, optional, intent(in) :: enforced_data_loc + class(omp_backend_t) :: self !! Backend instance + real(dp), intent(out) :: max_val, mean_val !! Global maximum and mean values + class(field_t), intent(in) :: f !! Input field + integer, optional, intent(in) :: enforced_data_loc !! Override data location if provided real(dp) :: val, max_p, sum_p, max_pncl, sum_pncl integer :: data_loc, dims(3), dims_padded(3), n, n_i, n_i_pad, n_j @@ -721,33 +923,48 @@ subroutine field_max_mean_omp(self, max_val, mean_val, f, enforced_data_loc) end subroutine field_max_mean_omp subroutine field_scale_omp(self, f, a) + !! Scale field by constant: f = a * f. + !! + !! Multiplies all field values by scalar a in-place. + !! Uses Fortran array syntax for simplicity. implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to scale (modified in-place) + real(dp), intent(in) :: a !! Scaling factor f%data = a*f%data end subroutine field_scale_omp subroutine field_shift_omp(self, f, a) + !! Shift field by constant: f = f + a. + !! + !! Adds scalar a to all field values in-place. + !! Uses Fortran array syntax for simplicity. implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to shift (modified in-place) + real(dp), intent(in) :: a !! Shift amount f%data = f%data + a end subroutine field_shift_omp subroutine field_set_face_omp(self, f, c_start, c_end, face) + !! Set boundary face values to specified constants. + !! + !! Sets values on a specified domain face (X/Y/Z start/end) + !! to given constants. 
Used for boundary condition enforcement. + !! + !! **Faces:** VERT_START_FACE, VERT_END_FACE, etc. !! [[m_base_backend(module):field_set_face(subroutine)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: f - real(dp), intent(in) :: c_start, c_end - integer, intent(in) :: face + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: f !! Field to modify + real(dp), intent(in) :: c_start !! Value for start side of face + real(dp), intent(in) :: c_end !! Value for end side of face + integer, intent(in) :: face !! Face identifier constant integer :: dims(3), k, j, i_mod, k_end @@ -784,11 +1001,18 @@ subroutine field_set_face_omp(self, f, c_start, c_end, face) end subroutine field_set_face_omp real(dp) function field_volume_integral_omp(self, f) result(s) - !! volume integral of a field + !! Compute volume integral of field over domain. + !! + !! Calculates $\int f \,dV$ by summing all field values (at cell centres) + !! and multiplying by grid cell volumes. Uses MPI_Allreduce for + !! global sum across all processes. + !! + !! **Formula:** $\int f \,dV = \sum(f_i \times \Delta V_i)$ where $\Delta V$ from mesh + !! **Assumption:** Field at cell centres (data_loc = CELL) implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to integrate real(dp) :: sum_p, sum_pncl integer :: dims(3), stacked, i, j, k, k_i, k_j, ierr @@ -829,32 +1053,48 @@ real(dp) function field_volume_integral_omp(self, f) result(s) end function field_volume_integral_omp subroutine copy_data_to_f_omp(self, f, data) - class(omp_backend_t), intent(inout) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(in) :: data + !! Copy raw array into field structure. + !! + !! Simple wrapper for field initialisation from external data. + !! Uses Fortran array assignment for efficiency. 
+ class(omp_backend_t), intent(inout) :: self !! Backend instance + class(field_t), intent(inout) :: f !! Target field + real(dp), dimension(:, :, :), intent(in) :: data !! Source data array f%data = data end subroutine copy_data_to_f_omp subroutine copy_f_to_data_omp(self, data, f) - class(omp_backend_t), intent(inout) :: self - real(dp), dimension(:, :, :), intent(out) :: data - class(field_t), intent(in) :: f + !! Copy field structure into raw array. + !! + !! Simple wrapper for field extraction to external data. + !! Uses Fortran array assignment for efficiency. + class(omp_backend_t), intent(inout) :: self !! Backend instance + real(dp), dimension(:, :, :), intent(out) :: data !! Destination data array + class(field_t), intent(in) :: f !! Source field data = f%data end subroutine copy_f_to_data_omp subroutine init_omp_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise FFT-based Poisson solver for OMP backend. + !! + !! Creates and configures omp_poisson_fft_t solver for pressure + !! correction step. Uses 2DECOMP&FFT library for parallel FFTs + !! in pencil decomposition. + !! + !! **Requirement:** WITH_2DECOMPFFT must be defined at compile time. + !! **Low-memory mode:** Optional flag to reduce memory footprint. #ifdef WITH_2DECOMPFFT use m_omp_poisson_fft, only: omp_poisson_fft_t #endif implicit none - class(omp_backend_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + class(omp_backend_t) :: self !! Backend instance + type(mesh_t), intent(in) :: mesh !! Mesh with grid spacing + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Spectral operators for each direction + logical, optional, intent(in) :: lowmem !! 
Enable low-memory mode #ifdef WITH_2DECOMPFFT allocate (omp_poisson_fft_t :: self%poisson_fft) diff --git a/src/backend/omp/common.f90 b/src/backend/omp/common.f90 index 6d3df179a..b687782d8 100644 --- a/src/backend/omp/common.f90 +++ b/src/backend/omp/common.f90 @@ -1,6 +1,15 @@ module m_omp_common + !! Common constants for OpenMP backend implementation. + !! + !! Defines compile-time constants used throughout the OMP backend + !! for performance tuning and buffer sizing. + !! + !! **SZ (pencil size):** Maximum pencil dimension for data reordering + !! operations. Set to 16 for optimal cache utilisation and vectorisation + !! on typical CPU architectures. Larger values may improve performance + !! for very large problems but increase memory overhead. implicit none - integer, parameter :: SZ = 16 + integer, parameter :: SZ = 16 !! Maximum pencil size for reordering buffers end module m_omp_common diff --git a/src/backend/omp/exec_dist.f90 b/src/backend/omp/exec_dist.f90 index d1334512e..9c4018def 100644 --- a/src/backend/omp/exec_dist.f90 +++ b/src/backend/omp/exec_dist.f90 @@ -1,4 +1,21 @@ module m_omp_exec_dist + !! Distributed compact finite difference execution for OMP backend. + !! + !! Orchestrates parallel execution of distributed compact schemes across + !! MPI processes. Manages OpenMP threading, halo exchanges, forward/backward + !! sweeps, and boundary system solves for multi-process compact operators. + !! + !! **Key features:** + !! - Forward/backward elimination with boundary coupling + !! - Non-blocking MPI communication for 2x2 boundary systems + !! - OpenMP parallelisation over pencil groups + !! - Fused kernels for transport equation efficiency + !! + !! **Distributed algorithm:** + !! 1. Forward/backward sweep on local domain $\rightarrow$ generate boundary systems + !! 2. MPI exchange boundary data between neighbours + !! 3. Solve coupled 2x2 systems at process interfaces + !! 4. 
Substitution sweep to complete solution use mpi use m_common, only: dp @@ -15,21 +32,34 @@ module m_omp_exec_dist subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & tdsops, nproc, pprev, pnext, n_groups) + !! Execute distributed compact finite difference operation. + !! + !! Applies compact scheme operator across multiple MPI processes using + !! distributed Thomas algorithm. Performs forward/backward elimination, + !! exchanges boundary systems via MPI, then completes with substitution. + !! + !! **Algorithm:** + !! 1. `der_univ_dist`: Forward/backward sweep $\rightarrow$ boundary 2x2 systems + !! 2. `sendrecv_fields`: MPI exchange boundary data with neighbours + !! 3. `der_univ_subs`: Solve boundaries $\rightarrow$ back-substitution + !! + !! **Parallelisation:** OpenMP over pencil groups, MPI across processes implicit none ! du = d(u) - real(dp), dimension(:, :, :), intent(out) :: du - real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e + real(dp), dimension(:, :, :), intent(out) :: du !! Derivative output + real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Field and halos ! The ones below are intent(out) just so that we can write data in them, ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care about the data they pass back real(dp), dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e !! Boundary system buffers (scratch) - type(tdsops_t), intent(in) :: tdsops - integer, intent(in) :: nproc, pprev, pnext - integer, intent(in) :: n_groups + type(tdsops_t), intent(in) :: tdsops !! Compact scheme operator + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: pprev, pnext !! Previous/next neighbour ranks + integer, intent(in) :: n_groups !! 
Number of pencil groups integer :: n_data integer :: k @@ -71,31 +101,49 @@ subroutine exec_dist_transeq_compact( & u, u_recv_s, u_recv_e, & v, v_recv_s, v_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, n_groups) + !! Execute distributed transport equation RHS calculation. + !! + !! Computes three compact derivative operations required for transport + !! equation in skew-symmetric form, then fuses final RHS assembly. + !! All three derivatives (du, d(u*v), d2u) computed in parallel with + !! single halo exchange pass. + !! + !! **Derivatives computed:** + !! - `du`: First derivative of u + !! - `dud`: First derivative of u*v (product computed locally with halos) + !! - `d2u`: Second derivative of u (viscous term) + !! + !! **Fused assembly:** Final RHS combines all three derivatives with + !! viscosity scaling in single kernel (der_univ_fused_subs). + !! + !! **Optimisation:** Product u*v computed on-the-fly to avoid storing + !! extra field. Reduces memory footprint. implicit none !> The result array, it is also used as temporary storage - real(dp), dimension(:, :, :), intent(out) :: rhs_du + real(dp), dimension(:, :, :), intent(out) :: rhs_du !! Transport equation RHS output !> Temporary storage arrays - real(dp), dimension(:, :, :), intent(out) :: dud, d2u + real(dp), dimension(:, :, :), intent(out) :: dud, d2u !! Product derivative and second derivative ! The ones below are intent(out) just so that we can write data in them, ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care about the data they pass back real(dp), dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e !! Boundary buffers for du (scratch) real(dp), dimension(:, :, :), intent(out) :: & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e !! 
Boundary buffers for dud (scratch) real(dp), dimension(:, :, :), intent(out) :: & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e !! Boundary buffers for d2u (scratch) - real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e + real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Velocity component and halos + real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e !! Convecting velocity and halos - type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - real(dp), intent(in) :: nu - integer, intent(in) :: nproc, pprev, pnext - integer, intent(in) :: n_groups + type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for each derivative + real(dp), intent(in) :: nu !! Kinematic viscosity + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: pprev, pnext !! Previous/next neighbour ranks + integer, intent(in) :: n_groups !! Number of pencil groups real(dp), dimension(:, :), allocatable :: ud, ud_recv_s, ud_recv_e diff --git a/src/backend/omp/exec_thom.f90 b/src/backend/omp/exec_thom.f90 index b1f0c6028..fb61485c1 100644 --- a/src/backend/omp/exec_thom.f90 +++ b/src/backend/omp/exec_thom.f90 @@ -1,4 +1,18 @@ module m_exec_thom + !! Local Thomas algorithm execution for OMP backend. + !! + !! Provides parallel execution of compact finite difference schemes using + !! standard Thomas algorithm (tridiagonal solver). Used when domain is not + !! decomposed in the derivative direction (all data local to process). + !! + !! **Two variants:** + !! - **Non-periodic:** Standard Thomas with arbitrary boundary conditions + !! - **Periodic:** Modified Thomas for cyclic tridiagonal systems + !! + !! **Parallelisation:** OpenMP over pencil groups (no MPI needed) + !! + !! **Contrast with distributed:** exec_dist handles multi-process case, + !! 
this module handles single-process-per-direction case. use m_common, only: dp use m_tdsops, only: tdsops_t @@ -13,11 +27,22 @@ module m_exec_thom contains subroutine exec_thom_tds_compact(du, u, tdsops, n_groups) + !! Execute local Thomas algorithm for compact scheme. + !! + !! Applies compact finite difference operator using tridiagonal solver. + !! Chooses periodic or non-periodic variant based on operator configuration. + !! All computation local to process (no MPI communication). + !! + !! **Algorithm selection:** + !! - `periodic=.true.`: Sherman-Morrison formula for cyclic system + !! - `periodic=.false.`: Standard forward/backward Thomas algorithm + !! + !! **Parallelisation:** OpenMP parallel loop over pencil groups - real(dp), dimension(:, :, :), intent(out) :: du - real(dp), dimension(:, :, :), intent(in) :: u - type(tdsops_t), intent(in) :: tdsops - integer, intent(in) :: n_groups + real(dp), dimension(:, :, :), intent(out) :: du !! Derivative output + real(dp), dimension(:, :, :), intent(in) :: u !! Input field + type(tdsops_t), intent(in) :: tdsops !! Compact scheme operator + integer, intent(in) :: n_groups !! Number of pencil groups integer :: k diff --git a/src/backend/omp/kernels/distributed.f90 b/src/backend/omp/kernels/distributed.f90 index a02f39f8d..acb1e8024 100644 --- a/src/backend/omp/kernels/distributed.f90 +++ b/src/backend/omp/kernels/distributed.f90 @@ -1,4 +1,28 @@ module m_omp_kernels_dist + !! Distributed compact finite difference kernels for OpenMP backend. + !! + !! This module implements high-performance kernels for distributed compact + !! finite difference operators. These operators require halo exchange across + !! MPI ranks to compute derivatives near subdomain boundaries. + !! + !! **Key Features:** + !! - 9-point stencil compact schemes (4th-6th order accuracy) + !! - Explicit vectorisation with OpenMP SIMD directives + !! - Near and far boundary treatments for non-periodic domains + !! 
- Forward and backward elimination phases for distributed solves + !! + !! **Kernels:** + !! - `der_univ_dist`: Universal derivative (1st/2nd) with halo exchange + !! - `interpl_dist`: Interpolation from cell to vertices or vice versa + !! + !! **Distributed Algorithm:** + !! Compact schemes couple neighbouring points via implicit systems. + !! In distributed memory: + !! 1. Near-boundary points use special coefficients incorporating halo data + !! 2. Interior points use standard bulk coefficients + !! 3. Modified Thomas algorithm handles cross-process dependencies + !! + !! **Performance:** Explicitly vectorized inner loops for SIMD execution. use omp_lib use m_common, only: dp @@ -12,15 +36,24 @@ subroutine der_univ_dist( & du, send_u_s, send_u_e, u, u_s, u_e, & n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, ffr, fbc, faf & ) + !! Compute distributed compact derivative (1st or 2nd order). + !! + !! Evaluates derivative using compact finite difference scheme across + !! distributed domain. Handles boundary points with halo data and applies + !! appropriate scaling factors. + !! + !! **Stencil:** 9-point compact scheme requiring 4-point halo on each side. + !! Near boundaries (first/last 4 points): use boundary-specific coefficients. + !! Interior: use uniform bulk coefficients for efficiency. implicit none ! Arguments - real(dp), intent(out), dimension(:, :) :: du, send_u_s, send_u_e - real(dp), intent(in), dimension(:, :) :: u, u_s, u_e - integer, intent(in) :: n_tds, n_rhs - real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e ! start/end - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in), dimension(:) :: ffr, fbc, faf + real(dp), intent(out), dimension(:, :) :: du, send_u_s, send_u_e !! Output derivative and send buffers + real(dp), intent(in), dimension(:, :) :: u, u_s, u_e !! Input field and halo data (start/end) + integer, intent(in) :: n_tds, n_rhs !! System sizes + real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! 
Boundary coefficients + real(dp), intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients + real(dp), intent(in), dimension(:) :: ffr, fbc, faf !! Scaling factors ! Local variables integer :: i, j diff --git a/src/backend/omp/kernels/spectral_processing.f90 b/src/backend/omp/kernels/spectral_processing.f90 index 75d3bbd1a..411024e88 100644 --- a/src/backend/omp/kernels/spectral_processing.f90 +++ b/src/backend/omp/kernels/spectral_processing.f90 @@ -1,4 +1,19 @@ module m_omp_spectral + !! Spectral space processing for FFT-based Poisson solver. + !! + !! Provides kernels for solving Poisson equation in Fourier space with + !! spectral equivalence transformations. Handles different boundary + !! condition combinations: fully periodic (000) and Dirichlet in Y (010). + !! + !! **Spectral equivalence:** Modified wavenumbers for finite-difference + !! grid (Lele 1992). Ensures spectral solver matches compact FD schemes. + !! + !! **Reference:** JCP 228 (2009), 5989-6015, Section 4 + !! + !! **Processing steps:** + !! 1. Forward spectral equivalence transform (physical $\rightarrow$ modified wavenumbers) + !! 2. Solve: $\hat{\phi}_k = -\hat{f}_k / k^2$ + !! 3. Backward spectral equivalence transform (modified wavenumbers $\rightarrow$ physical) use m_common, only: dp implicit none @@ -8,22 +23,34 @@ subroutine process_spectral_000( & div_u, waves, nx_spec, ny_spec, nz_spec, x_sp_st, y_sp_st, z_sp_st, & nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-process div U* in spectral space for all periodic BCs. + !! Solve Poisson in spectral space for (0,0,0) boundary conditions. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Processes fully periodic case. Applies spectral equivalence transforms + !! in all three directions, divides by squared wavenumber, then applies + !! inverse transforms. + !! + !! **Algorithm:** + !! 1. Normalise by grid size (FFT convention) + !! 2. Forward spectral equivalence: physical $\rightarrow$ modified waves (Z, Y, X order) + !! 3. 
Solve: $\phi_k = -f_k / k^2$ (handle zero mode specially) + !! 4. Backward spectral equivalence: modified waves $\rightarrow$ physical + !! + !! **Special case:** Zero wavenumber (k=0) set to zero to remove constant mode. + !! + !! **Ref.** JCP 228 (2009), 5989–6015, Sec 4 implicit none !> Divergence of velocity in spectral space - complex(dp), intent(inout), dimension(:, :, :) :: div_u + complex(dp), intent(inout), dimension(:, :, :) :: div_u !! In: RHS, Out: Solution !> Spectral equivalence constants - complex(dp), intent(in), dimension(:, :, :) :: waves - real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), intent(in), dimension(:, :, :) :: waves !! Modified wavenumbers squared + real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Spectral equivalence coefficients !> Grid size in spectral space - integer, intent(in) :: nx_spec, ny_spec, nz_spec + integer, intent(in) :: nx_spec, ny_spec, nz_spec !! Local spectral dimensions !> Offsets in the permuted pencils in spectral space - integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st + integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st !! Global offsets !> Global cell size - integer, intent(in) :: nx, ny, nz + integer, intent(in) :: nx, ny, nz !! Global grid dimensions integer :: i, j, k, ix, iy, iz real(dp) :: tmp_r, tmp_c, div_r, div_c @@ -109,22 +136,37 @@ subroutine process_spectral_010( & div_u, waves, nx_spec, ny_spec, nz_spec, x_sp_st, y_sp_st, z_sp_st, & nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-process div U* in spectral space, for non-periodic BC in y-dir. + !! Solve Poisson in spectral space for (0,1,0) boundary conditions. + !! + !! Processes Dirichlet in Y, periodic in X and Z. Uses sine series + !! in Y-direction (symmetry/antisymmetry transform) combined with + !! Fourier in X and Z. + !! + !! **Algorithm:** + !! 1. Normalise by grid size + !! 2. Forward spectral equivalence in Z and X (not Y, handled separately) + !! 3. 
Apply Y symmetry transform (combine left/right halves) + !! 4. Solve: $\phi_k = -f_k / k^2$ + !! 5. Inverse Y symmetry transform + !! 6. Backward spectral equivalence in X and Z + !! + !! **Y-direction:** Sine series requires special symmetric processing + !! to maintain real-valued solution with Dirichlet BCs. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! **Ref.** JCP 228 (2009), 5989–6015, Sec 4 implicit none !> Divergence of velocity in spectral space - complex(dp), intent(inout), dimension(:, :, :) :: div_u + complex(dp), intent(inout), dimension(:, :, :) :: div_u !! In: RHS, Out: Solution !> Spectral equivalence constants - complex(dp), intent(in), dimension(:, :, :) :: waves - real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), intent(in), dimension(:, :, :) :: waves !! Modified wavenumbers squared + real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Spectral equivalence coefficients !> Grid size in spectral space - integer, intent(in) :: nx_spec, ny_spec, nz_spec + integer, intent(in) :: nx_spec, ny_spec, nz_spec !! Local spectral dimensions !> Offsets in the permuted pencils in spectral space - integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st + integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st !! Global offsets !> Global cell size - integer, intent(in) :: nx, ny, nz + integer, intent(in) :: nx, ny, nz !! Global grid dimensions integer :: i, j, k, ix, iy, iz, iy_r real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c diff --git a/src/backend/omp/kernels/thomas.f90 b/src/backend/omp/kernels/thomas.f90 index 88ec8d771..bc0d5c69f 100644 --- a/src/backend/omp/kernels/thomas.f90 +++ b/src/backend/omp/kernels/thomas.f90 @@ -1,4 +1,18 @@ module m_omp_kernels_thom + !! Thomas algorithm kernels for local compact finite differences. + !! + !! Implements tridiagonal solvers for compact schemes when domain is + !! not decomposed in derivative direction. Provides both standard + !! 
(non-periodic) and cyclic (periodic) Thomas algorithm variants. + !! + !! **Thomas algorithm:** Standard forward elimination and backward + !! substitution for tridiagonal systems, O(n) complexity. + !! + !! **Periodic Thomas:** Sherman-Morrison formula to handle cyclic + !! tridiagonal systems arising from periodic boundary conditions. + !! + !! **Vectorisation:** Explicit SIMD directives for SZ-wide vectors, + !! processing multiple pencils simultaneously. use m_common, only: dp use m_omp_common, only: SZ @@ -8,14 +22,32 @@ module m_omp_kernels_thom subroutine der_univ_thom(du, u, n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, & thom_f, thom_s, thom_w, strch) + !! Thomas algorithm for non-periodic compact finite differences. + !! + !! Solves tridiagonal system arising from compact scheme with arbitrary + !! boundary conditions. Uses standard forward elimination followed by + !! backward substitution. + !! + !! **Algorithm:** + !! 1. Forward pass: Eliminate lower diagonal, form modified RHS + !! 2. Backward pass: Back-substitution with grid stretching correction + !! + !! **Boundary treatment:** Special stencils at start (j=1..4) and + !! end (j=n-3..n) to handle non-periodic boundaries. + !! + !! **Stretching:** Applied during backward pass via `strch` array. implicit none - real(dp), dimension(:, :), intent(out) :: du - real(dp), dimension(:, :), intent(in) :: u - integer, intent(in) :: n_tds, n_rhs - real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e ! start/end - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in), dimension(:) :: thom_f, thom_s, thom_w, strch + real(dp), dimension(:, :), intent(out) :: du !! Solution (derivative) + real(dp), dimension(:, :), intent(in) :: u !! Input field + integer, intent(in) :: n_tds !! Number of unknowns (tridiagonal size) + integer, intent(in) :: n_rhs !! Number of RHS points (stencil size) + real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! 
Start/end stencil coefficients + real(dp), intent(in), dimension(:) :: coeffs !! Interior stencil coefficients (9-point) + real(dp), intent(in), dimension(:) :: thom_f !! Forward elimination factors + real(dp), intent(in), dimension(:) :: thom_s !! Subdiagonal elimination factors + real(dp), intent(in), dimension(:) :: thom_w !! Diagonal weights for back-substitution + real(dp), intent(in), dimension(:) :: strch !! Grid stretching correction factors integer :: i, j real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 @@ -132,14 +164,34 @@ end subroutine der_univ_thom subroutine der_univ_thom_per( & du, u, n, coeffs, alpha, thom_f, thom_s, thom_w, thom_p, strch & ) + !! Periodic Thomas algorithm for cyclic tridiagonal systems. + !! + !! Solves compact scheme with periodic boundary conditions using + !! Sherman-Morrison formula. Handles wraparound coupling between + !! first and last grid points. + !! + !! **Algorithm:** + !! 1. Forward pass: Standard elimination with periodic indexing + !! 2. Backward pass: Standard back-substitution + !! 3. Periodic correction: Sherman-Morrison adjustment for cyclic coupling + !! + !! **Periodic indexing:** Uses modulo arithmetic for stencil access + !! to handle wraparound at domain boundaries. + !! + !! **Sherman-Morrison:** Adds rank-1 correction to handle tridiagonal + !! system modified by periodic coupling terms. implicit none - real(dp), dimension(:, :), intent(out) :: du - real(dp), dimension(:, :), intent(in) :: u - integer, intent(in) :: n - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in) :: alpha - real(dp), intent(in), dimension(:) :: thom_f, thom_s, thom_w, thom_p, strch + real(dp), dimension(:, :), intent(out) :: du !! Solution (derivative) + real(dp), dimension(:, :), intent(in) :: u !! Input field + integer, intent(in) :: n !! Number of grid points + real(dp), intent(in), dimension(:) :: coeffs !! Stencil coefficients (9-point) + real(dp), intent(in) :: alpha !! 
Tridiagonal sub/super-diagonal value + real(dp), intent(in), dimension(:) :: thom_f !! Forward elimination factors + real(dp), intent(in), dimension(:) :: thom_s !! Subdiagonal elimination factors + real(dp), intent(in), dimension(:) :: thom_w !! Diagonal weights + real(dp), intent(in), dimension(:) :: thom_p !! Periodic correction vector + real(dp), intent(in), dimension(:) :: strch !! Grid stretching correction factors integer :: i, j integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 diff --git a/src/backend/omp/poisson_fft.f90 b/src/backend/omp/poisson_fft.f90 index 72a94521c..4266c5f5f 100644 --- a/src/backend/omp/poisson_fft.f90 +++ b/src/backend/omp/poisson_fft.f90 @@ -1,4 +1,22 @@ module m_omp_poisson_fft + !! FFT-based Poisson solver for OMP backend. + !! + !! Solves $\nabla^2 \phi = f$ using spectral methods with 2DECOMP&FFT library. + !! Transforms to Fourier space, solves diagonal system in spectral space, + !! then transforms back to physical space. + !! + !! **Algorithm:** + !! 1. Forward FFT: physical $\rightarrow$ spectral space + !! 2. Spectral solve: $\phi_k = f_k / k^2$ (with modifications for boundary conditions) + !! 3. Backward FFT: spectral $\rightarrow$ physical space + !! + !! **Boundary conditions:** + !! - (0,0,0): Periodic in all directions + !! - (0,1,0): Dirichlet in Y, periodic in X/Z (uses symmetry transform) + !! + !! **Parallelisation:** MPI via 2DECOMP&FFT pencil decomposition + !! + !! **Limitation:** Does not support Y-direction grid stretching use decomp_2d_constants, only: PHYSICAL_IN_X use decomp_2d_fft, only: decomp_2d_fft_init, decomp_2d_fft_3d, & @@ -16,14 +34,14 @@ module m_omp_poisson_fft type, extends(poisson_fft_t) :: omp_poisson_fft_t !! FFT based Poisson solver - complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z + complex(dp), allocatable, dimension(:, :, :) :: c_x !! 
Spectral space buffer (X-pencil oriented) contains - procedure :: fft_forward => fft_forward_omp - procedure :: fft_backward => fft_backward_omp - procedure :: fft_postprocess_000 => fft_postprocess_000_omp - procedure :: fft_postprocess_010 => fft_postprocess_010_omp - procedure :: enforce_periodicity_y => enforce_periodicity_y_omp - procedure :: undo_periodicity_y => undo_periodicity_y_omp + procedure :: fft_forward => fft_forward_omp !! Transform to spectral space + procedure :: fft_backward => fft_backward_omp !! Transform to physical space + procedure :: fft_postprocess_000 => fft_postprocess_000_omp !! Spectral solve for (0,0,0) BCs + procedure :: fft_postprocess_010 => fft_postprocess_010_omp !! Spectral solve for (0,1,0) BCs + procedure :: enforce_periodicity_y => enforce_periodicity_y_omp !! Symmetry transform for Y Dirichlet + procedure :: undo_periodicity_y => undo_periodicity_y_omp !! Inverse symmetry transform end type omp_poisson_fft_t interface omp_poisson_fft_t @@ -35,15 +53,22 @@ module m_omp_poisson_fft contains function init(mesh, xdirps, ydirps, zdirps, lowmem) result(poisson_fft) + !! Initialise FFT-based Poisson solver. + !! + !! Sets up 2DECOMP&FFT library and allocates spectral space buffers. + !! Computes wavenumbers and coefficients for spectral solve. + !! + !! **Error checking:** Fails if Y-direction grid stretching requested + !! (not supported by FFT method). implicit none - type(mesh_t), intent(in) :: mesh - class(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem - integer, dimension(3) :: istart, iend, isize - integer :: dims(3) + type(mesh_t), intent(in) :: mesh !! Mesh with grid spacing + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Spectral operators + logical, optional, intent(in) :: lowmem !! Low-memory flag (ignored for OMP) + integer, dimension(3) :: istart, iend, isize !! Local spectral dimensions + integer :: dims(3) !! 
Global grid dimensions - type(omp_poisson_fft_t) :: poisson_fft + type(omp_poisson_fft_t) :: poisson_fft !! Initialised solver if (mesh%par%is_root()) then print *, "Initialising 2decomp&fft" @@ -75,29 +100,43 @@ function init(mesh, xdirps, ydirps, zdirps, lowmem) result(poisson_fft) end function init subroutine fft_forward_omp(self, f_in) + !! Forward FFT: physical space to spectral space. + !! + !! Transforms input field from physical (real) to spectral (complex) + !! representation using 2DECOMP&FFT. Result stored in `self%c_x`. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(in) :: f_in !! Physical space field (RHS) call decomp_2d_fft_3d(f_in%data, self%c_x) end subroutine fft_forward_omp subroutine fft_backward_omp(self, f_out) + !! Backward FFT: spectral space to physical space. + !! + !! Transforms spectral solution back to physical (real) space using + !! inverse FFT. Reads from `self%c_x`, writes to output field. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Physical space solution call decomp_2d_fft_3d(self%c_x, f_out%data) end subroutine fft_backward_omp subroutine fft_postprocess_000_omp(self) + !! Spectral solve for (0,0,0) boundary conditions. + !! + !! Solves Poisson equation in spectral space for fully periodic domain. + !! Divides each Fourier mode by its corresponding $k^2$ eigenvalue. + !! + !! **Formula:** $\hat{\phi}_k = \hat{f}_k / (k_x^2 + k_y^2 + k_z^2)$ implicit none - class(omp_poisson_fft_t) :: self + class(omp_poisson_fft_t) :: self !! 
Solver instance call process_spectral_000( & self%c_x, self%waves, self%nx_spec, self%ny_spec, self%nz_spec, & @@ -109,9 +148,16 @@ subroutine fft_postprocess_000_omp(self) end subroutine fft_postprocess_000_omp subroutine fft_postprocess_010_omp(self) + !! Spectral solve for (0,1,0) boundary conditions. + !! + !! Solves Poisson equation with Dirichlet BCs in Y-direction, + !! periodic in X and Z. Uses modified wavenumbers accounting for + !! symmetry transformation (sine series in Y). + !! + !! **Formula:** Modified $k_y$ for sine series representation implicit none - class(omp_poisson_fft_t) :: self + class(omp_poisson_fft_t) :: self !! Solver instance call process_spectral_010( & self%c_x, self%waves, self%nx_spec, self%ny_spec, self%nz_spec, & @@ -123,11 +169,18 @@ subroutine fft_postprocess_010_omp(self) end subroutine fft_postprocess_010_omp subroutine enforce_periodicity_y_omp(self, f_out, f_in) + !! Apply symmetry transform for Y Dirichlet boundary conditions. + !! + !! Converts physical field to symmetric/antisymmetric representation + !! suitable for sine series FFT. Used before forward FFT when Y-direction + !! has Dirichlet (non-periodic) BCs. + !! + !! **Transformation:** Maps domain to symmetric extension for sine basis. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Transformed field + class(field_t), intent(in) :: f_in !! Original field integer :: i, j, k @@ -149,11 +202,17 @@ subroutine enforce_periodicity_y_omp(self, f_out, f_in) end subroutine enforce_periodicity_y_omp subroutine undo_periodicity_y_omp(self, f_out, f_in) + !! Inverse symmetry transform for Y Dirichlet boundary conditions. + !! + !! Converts symmetric/antisymmetric representation back to physical + !! field. Used after backward FFT when Y-direction has Dirichlet BCs. + !! + !! 
**Transformation:** Extracts physical domain from symmetric extension. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Physical field + class(field_t), intent(in) :: f_in !! Transformed field integer :: i, j, k diff --git a/src/backend/omp/sendrecv.f90 b/src/backend/omp/sendrecv.f90 index 3aba6e82c..39d8c6caa 100644 --- a/src/backend/omp/sendrecv.f90 +++ b/src/backend/omp/sendrecv.f90 @@ -1,4 +1,14 @@ module m_omp_sendrecv + !! MPI halo exchange utilities for OMP backend. + !! + !! Provides non-blocking point-to-point communication for exchanging + !! boundary halos between neighbouring MPI processes. Used in distributed + !! compact finite difference schemes that require off-process data. + !! + !! **Communication pattern:** Bidirectional simultaneous send/recv with + !! neighbours in one decomposition direction. + !! + !! **Single-process optimisation:** Direct copy when no MPI communication needed. use mpi use m_common, only: dp, MPI_X3D2_DP @@ -9,11 +19,31 @@ module m_omp_sendrecv subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & n_data, nproc, prev, next) + !! Exchange boundary halos with neighbouring MPI processes. + !! + !! Performs bidirectional halo exchange using non-blocking MPI + !! communication (MPI_Isend/MPI_Irecv). Sends data to both neighbours + !! simultaneously and receives from both, then waits for all operations + !! to complete. + !! + !! **Special case:** Single-process (nproc=1) uses direct memory copy + !! for periodic boundaries without MPI overhead. + !! + !! **Communication pattern:** + !! - Send start halo to previous process + !! - Receive end halo from next process + !! - Send end halo to next process + !! - Receive start halo from previous process + !! + !! **Non-blocking:** All 4 operations initiated before waiting for completion. 
implicit none - real(dp), dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e - real(dp), dimension(:, :, :), intent(in) :: f_send_s, f_send_e - integer, intent(in) :: n_data, nproc, prev, next + real(dp), dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e !! Receive buffers (start/end halos) + real(dp), dimension(:, :, :), intent(in) :: f_send_s, f_send_e !! Send buffers (start/end halos) + integer, intent(in) :: n_data !! Number of data elements to transfer + integer, intent(in) :: nproc !! Number of processes in this direction + integer, intent(in) :: prev !! Rank of previous neighbour + integer, intent(in) :: next !! Rank of next neighbour integer :: req(4), err(4), ierr, tag = 1234 From f1479c968578443acdcf38c85a2504e1008c2fad Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 14:02:20 +0000 Subject: [PATCH 08/12] docs: add ford documentation to cuda backend files --- src/backend/cuda/allocator.f90 | 90 ++++-- src/backend/cuda/backend.f90 | 306 ++++++++++++------ src/backend/cuda/common.f90 | 12 +- src/backend/cuda/exec_dist.f90 | 47 ++- src/backend/cuda/exec_thom.f90 | 15 +- src/backend/cuda/kernels/distributed.f90 | 104 +++--- src/backend/cuda/kernels/fieldops.f90 | 161 ++++++--- src/backend/cuda/kernels/reorder.f90 | 134 +++++--- .../cuda/kernels/spectral_processing.f90 | 162 +++++----- src/backend/cuda/kernels/thomas.f90 | 55 ++-- src/backend/cuda/poisson_fft.f90 | 122 ++++--- src/backend/cuda/sendrecv.f90 | 45 ++- src/backend/cuda/tdsops.f90 | 26 +- 13 files changed, 850 insertions(+), 429 deletions(-) diff --git a/src/backend/cuda/allocator.f90 b/src/backend/cuda/allocator.f90 index 1d21de6e3..16d4014cb 100644 --- a/src/backend/cuda/allocator.f90 +++ b/src/backend/cuda/allocator.f90 @@ -1,4 +1,17 @@ module m_cuda_allocator + !! GPU memory allocator for CUDA backend. + !! + !! GPU memory (device memory) is physically separate from CPU memory (host). + !! This allocator manages device-side storage, ensuring field data resides + !! 
in GPU memory for kernel execution. Explicit device allocation avoids + !! expensive implicit host-device transfers that would kill performance. + !! + !! **Design rationale:** + !! - cuda_field_t extends field_t with device pointers (p_data_d, data_d) + !! - Maintains both 1D and 3D views of same memory for flexibility + !! - Reference counting prevents premature deallocation + !! - Block-based allocation reduces allocation overhead + !! use m_allocator, only: allocator_t use m_common, only: dp use m_field, only: field_t @@ -7,8 +20,9 @@ module m_cuda_allocator implicit none type, extends(allocator_t) :: cuda_allocator_t + !! GPU memory allocator extending base allocator contains - procedure :: create_block => create_cuda_block + procedure :: create_block => create_cuda_block !! Allocate GPU field block end type cuda_allocator_t interface cuda_allocator_t @@ -16,12 +30,13 @@ module m_cuda_allocator end interface cuda_allocator_t type, extends(field_t) :: cuda_field_t - real(dp), device, pointer, private :: p_data_d(:) - real(dp), device, pointer, contiguous :: data_d(:, :, :) + !! Field residing in GPU device memory + real(dp), device, pointer, private :: p_data_d(:) !! 1D device memory pointer (raw allocation) + real(dp), device, pointer, contiguous :: data_d(:, :, :) !! 3D device view (for kernel access) contains - procedure :: fill => fill_cuda - procedure :: get_shape => get_shape_cuda - procedure :: set_shape => set_shape_cuda + procedure :: fill => fill_cuda !! Fill with constant value + procedure :: get_shape => get_shape_cuda !! Query 3D dimensions + procedure :: set_shape => set_shape_cuda !! Reshape 3D view end type cuda_field_t interface cuda_field_t @@ -31,9 +46,15 @@ module m_cuda_allocator contains function cuda_field_init(ngrid, next, id) result(f) - integer, intent(in) :: ngrid, id - type(cuda_field_t), pointer, intent(in) :: next - type(cuda_field_t) :: f + !! Initialise GPU field with device memory allocation. + !! + !! 
Device memory must be explicitly allocated before use. This constructor + !! allocates the 1D device array and sets up metadata for later reshaping + !! to 3D when dimensions are known. + integer, intent(in) :: ngrid !! Total number of grid points + integer, intent(in) :: id !! Unique field identifier + type(cuda_field_t), pointer, intent(in) :: next !! Next field in linked list + type(cuda_field_t) :: f !! Initialised field allocate (f%p_data_d(ngrid)) f%refcount = 0 @@ -42,47 +63,74 @@ function cuda_field_init(ngrid, next, id) result(f) end function cuda_field_init subroutine fill_cuda(self, c) + !! Fill entire field with constant value on GPU. + !! + !! Initialising fields directly on GPU avoids transferring initialisation + !! data from host. Single assignment to device array leverages GPU's + !! memory controllers for efficient broadcast to all elements. implicit none - class(cuda_field_t) :: self - real(dp), intent(in) :: c + class(cuda_field_t) :: self !! Field to fill + real(dp), intent(in) :: c !! Constant value self%p_data_d = c end subroutine fill_cuda function get_shape_cuda(self) result(dims) + !! Query current 3D dimensions of field. + !! + !! Fields are allocated with total size but reshaped dynamically based + !! on decomposition. This query enables algorithms to adapt to actual + !! current dimensions without hard-coding sizes. implicit none - class(cuda_field_t) :: self - integer :: dims(3) + class(cuda_field_t) :: self !! Field to query + integer :: dims(3) !! Current dimensions dims = shape(self%data_d) end function get_shape_cuda subroutine set_shape_cuda(self, dims) + !! Reshape 3D view of device memory. + !! + !! Same 1D device allocation is reused for different pencil orientations + !! (X-pencils, Y-pencils, Z-pencils). Reshaping avoids reallocating GPU + !! memory, which is expensive. Fortran pointer remapping is essentially + !! free, just changing metadata not data. 
implicit none - class(cuda_field_t) :: self - integer, intent(in) :: dims(3) + class(cuda_field_t) :: self !! Field to reshape + integer, intent(in) :: dims(3) !! New dimensions self%data_d(1:dims(1), 1:dims(2), 1:dims(3)) => self%p_data_d end subroutine set_shape_cuda function cuda_allocator_init(dims, sz) result(allocator) - integer, intent(in) :: dims(3), sz - type(cuda_allocator_t) :: allocator + !! Initialise CUDA allocator with grid dimensions. + !! + !! Base allocator handles dimension calculations and block management + !! logic. CUDA allocator only needs to override block creation to use + !! device memory, avoiding code duplication. + integer, intent(in) :: dims(3) !! Grid dimensions + integer, intent(in) :: sz !! Pencil size (SZ) + type(cuda_allocator_t) :: allocator !! Initialised allocator allocator%allocator_t = allocator_t(dims, sz) end function cuda_allocator_init function create_cuda_block(self, next) result(ptr) - class(cuda_allocator_t), intent(inout) :: self - type(cuda_field_t), pointer, intent(in) :: next - type(cuda_field_t), pointer :: newblock - class(field_t), pointer :: ptr + !! Create new field block in GPU memory. + !! + !! Central allocation point ensures consistent initialisation and enables + !! tracking (via IDs) for debugging memory issues. Returning base class + !! pointer maintains polymorphism for generic algorithm code. + class(cuda_allocator_t), intent(inout) :: self !! Allocator instance + type(cuda_field_t), pointer, intent(in) :: next !! Next in linked list + type(cuda_field_t), pointer :: newblock !! Newly allocated block + class(field_t), pointer :: ptr !! Polymorphic return pointer allocate (newblock) self%next_id = self%next_id + 1 newblock = cuda_field_t(self%ngrid, next, id=self%next_id) diff --git a/src/backend/cuda/backend.f90 b/src/backend/cuda/backend.f90 index 8efa5c041..7c67f09b2 100644 --- a/src/backend/cuda/backend.f90 +++ b/src/backend/cuda/backend.f90 @@ -1,4 +1,14 @@ module m_cuda_backend + !! 
CUDA backend implementing GPU-accelerated solver operations. + !! + !! Extends base_backend_t with GPU kernel launches and device memory + !! management. Transport equations, tridiagonal solves, FFT operations, + !! and field manipulations execute on GPU. + !! + !! **MPI Communication:** Halo exchange passes device pointers directly to + !! MPI calls. With GPU-aware MPI implementations (OpenMPI with CUDA support, + !! MVAPICH2-GDR), data transfers directly between GPU memories. Without + !! GPU-aware MPI, the implementation stages through host memory automatically. use iso_fortran_env, only: stderr => error_unit use cudafor use mpi @@ -35,6 +45,10 @@ module m_cuda_backend private :: transeq_halo_exchange, transeq_dist_component type, extends(base_backend_t) :: cuda_backend_t + !! GPU backend with device communication buffers and kernel configurations. + !! + !! Extends [[m_base_backend(module):base_backend_t(type)]] with CUDA-specific + !! implementations and device memory buffers for halo exchange. !character(len=*), parameter :: name = 'cuda' real(dp), device, allocatable, dimension(:, :, :) :: & u_recv_s_dev, u_recv_e_dev, u_send_s_dev, u_send_e_dev, & @@ -78,11 +92,16 @@ module m_cuda_backend contains function init(mesh, allocator) result(backend) + !! Initialise CUDA backend with kernel configurations and communication buffers. + !! + !! Sets up CUDA thread blocks ([[m_cuda_common(module):SZ(variable)]] threads per + !! warp-aligned block) and allocates device buffers for halo exchange. Buffer size + !! accommodates largest pencil direction to support all three orientations. implicit none - type(mesh_t), target, intent(inout) :: mesh - class(allocator_t), target, intent(inout) :: allocator - type(cuda_backend_t) :: backend + type(mesh_t), target, intent(inout) :: mesh !! Computational mesh + class(allocator_t), target, intent(inout) :: allocator !! GPU memory allocator + type(cuda_backend_t) :: backend !! 
Initialised CUDA backend type(cuda_poisson_fft_t) :: cuda_poisson_fft integer :: n_groups @@ -140,19 +159,25 @@ subroutine alloc_cuda_tdsops( & self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) + !! Allocate and initialise CUDA tridiagonal operators. + !! + !! Implements [[m_base_backend(module):alloc_tdsops(interface)]] for GPU. + !! Allocates [[m_cuda_tdsops(module):cuda_tdsops_t(type)]] with device-resident + !! coefficient arrays. implicit none class(cuda_backend_t) :: self - class(tdsops_t), allocatable, intent(inout) :: tdsops - integer, intent(in) :: n_tds - real(dp), intent(in) :: delta - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end - real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) - integer, optional, intent(in) :: n_halo - character(*), optional, intent(in) :: from_to - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(tdsops_t), allocatable, intent(inout) :: tdsops !! Output: allocated CUDA operators + integer, intent(in) :: n_tds !! Number of tridiagonal systems + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation !! Operation type (derivative/interpolation) + character(*), intent(in) :: scheme !! Scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary condition flags + real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) !! Grid stretching factors + integer, optional, intent(in) :: n_halo !! Halo width for distributed schemes + character(*), optional, intent(in) :: from_to !! Interpolation direction + logical, optional, intent(in) :: sym !! Symmetry flag + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Viscosity parameters allocate (cuda_tdsops_t :: tdsops) @@ -166,13 +191,18 @@ subroutine alloc_cuda_tdsops( & end subroutine alloc_cuda_tdsops subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! 
Compute transport equation in x-direction using CUDA. + !! + !! Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Routes to distributed or Thomas algorithm based on + !! [[m_tdsops(module):dirps_t(type)]] configuration. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators call self%transeq_cuda_dist(du, dv, dw, u, v, w, nu, dirps, & self%xblocks, self%xthreads) @@ -180,13 +210,17 @@ subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_x_cuda subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation in y-direction using CUDA. + !! + !! Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Arguments reordered (v, u, w) to match y-pencil orientation. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators ! u, v, w is reordered so that we pass v, u, w call self%transeq_cuda_dist(dv, du, dw, v, u, w, nu, dirps, & @@ -195,13 +229,17 @@ subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_y_cuda subroutine transeq_z_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation in z-direction using CUDA. + !! + !! 
Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Arguments reordered (w, u, v) to match z-pencil orientation. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators ! u, v, w is reordered so that we pass w, u, v call self%transeq_cuda_dist(dw, du, dv, w, u, v, nu, dirps, & @@ -212,16 +250,19 @@ end subroutine transeq_z_cuda subroutine transeq_species_cuda(self, dspec, uvw, spec, nu, dirps, sync) !! Compute the convection and diffusion for the given field !! in the given direction. - !! Halo exchange for the given field is necessary - !! When sync is true, halo exchange of momentum is necessary + !! + !! Implements [[m_base_backend(module):transeq_ders_spec(interface)]]. + !! Halo exchange for the given field is necessary. + !! When sync is true, halo exchange of momentum is necessary. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: dspec - class(field_t), intent(in) :: uvw, spec - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - logical, intent(in) :: sync + class(field_t), intent(inout) :: dspec !! Output: RHS contribution for species + class(field_t), intent(in) :: uvw !! Input: velocity component in transport direction + class(field_t), intent(in) :: spec !! Input: species concentration field + real(dp), intent(in) :: nu !! Diffusivity (kinematic viscosity) + type(dirps_t), intent(in) :: dirps !! Directional operators + logical, intent(in) :: sync !! 
If true, also exchange momentum halos integer :: n_groups type(cuda_tdsops_t), pointer :: der1st, der1st_sym, der2nd, der2nd_sym @@ -282,14 +323,19 @@ end subroutine transeq_species_cuda subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, nu, dirps, & blocks, threads) + !! Compute transport equation using distributed compact scheme on GPU. + !! + !! Handles halo exchange with [[m_cuda_sendrecv(module):sendrecv_3fields(interface)]], + !! launches [[m_cuda_exec_dist(module):exec_dist_transeq_3fused(interface)]] kernel, + !! and gathers derivatives. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - type(dim3), intent(in) :: blocks, threads + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration real(dp), device, pointer, dimension(:, :, :) :: u_dev, v_dev, w_dev, & du_dev, dv_dev, dw_dev @@ -342,9 +388,13 @@ subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, nu, dirps, & end subroutine transeq_cuda_dist subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dir) + !! Exchange velocity field halos using MPI with device pointers. + !! + !! Packs boundary data into communication buffers and exchanges with + !! neighbouring ranks. Uses sendrecv_3fields for batched communication. class(cuda_backend_t) :: self - real(dp), device, dimension(:, :, :), intent(in) :: u_dev, v_dev, w_dev - integer, intent(in) :: dir + real(dp), device, dimension(:, :, :), intent(in) :: u_dev, v_dev, w_dev !! Velocity components on device + integer, intent(in) :: dir !! 
Direction for halo exchange integer :: n, nproc_dir, pprev, pnext integer :: n_groups @@ -376,20 +426,21 @@ subroutine transeq_dist_component(self, rhs_du_dev, u_dev, conv_dev, nu, & conv_recv_s_dev, conv_recv_e_dev, & tdsops_du, tdsops_dud, tdsops_d2u, & dir, blocks, threads) - !! Computes RHS_x^u following: + !! Compute transport equation RHS component using distributed compact schemes. !! - !! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2 + !! Computes: $\text{rhs} = -\frac{1}{2}(\text{conv} \frac{\partial u}{\partial x} + \frac{\partial (u \cdot \text{conv})}{\partial x}) + \nu \frac{\partial^2 u}{\partial x^2}$ class(cuda_backend_t) :: self - !> The result field, it is also used as temporary storage - real(dp), device, dimension(:, :, :), intent(out) :: rhs_du_dev - real(dp), device, dimension(:, :, :), intent(in) :: u_dev, conv_dev - real(dp), intent(in) :: nu + real(dp), device, dimension(:, :, :), intent(out) :: rhs_du_dev !! Output: transport equation RHS + real(dp), device, dimension(:, :, :), intent(in) :: u_dev !! Input: velocity component field + real(dp), device, dimension(:, :, :), intent(in) :: conv_dev !! Input: convecting velocity field + real(dp), intent(in) :: nu !! Kinematic viscosity real(dp), device, dimension(:, :, :), intent(in) :: & - u_recv_s_dev, u_recv_e_dev, & - conv_recv_s_dev, conv_recv_e_dev - class(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - integer, intent(in) :: dir - type(dim3), intent(in) :: blocks, threads + u_recv_s_dev, u_recv_e_dev !! Halo data for u from neighbours + real(dp), device, dimension(:, :, :), intent(in) :: & + conv_recv_s_dev, conv_recv_e_dev !! Halo data for conv from neighbours + class(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for derivatives + integer, intent(in) :: dir !! Direction index + type(dim3), intent(in) :: blocks, threads !! 
CUDA kernel configuration class(field_t), pointer :: dud, d2u @@ -425,25 +476,31 @@ subroutine transeq_dist_component(self, rhs_du_dev, u_dev, conv_dev, nu, & end subroutine transeq_dist_component subroutine transeq_cuda_thom(self, du, dv, dw, u, v, w, dirps) - !! Thomas algorithm implementation. So much more easier than the - !! distributed algorithm. It is intended to work only on a single rank - !! so there is no MPI communication. + !! Compute transport equation using Thomas algorithm. + !! + !! Simpler than distributed scheme - no MPI communication, uses + !! [[m_cuda_exec_thom(module):exec_thom_tds_compact(interface)]] kernel. + !! Intended for single-rank execution only. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + type(dirps_t), intent(in) :: dirps !! Directional operators end subroutine transeq_cuda_thom subroutine tds_solve_cuda(self, du, u, tdsops) + !! Solve tridiagonal systems using CUDA kernels. + !! + !! Implements [[m_base_backend(module):tds_solve(interface)]]. + !! Dispatches to appropriate CUDA kernel based on pencil direction. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(field_t), intent(inout) :: du !! Output: solution + class(field_t), intent(in) :: u !! Input: RHS + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operators type(dim3) :: blocks, threads @@ -464,13 +521,17 @@ subroutine tds_solve_cuda(self, du, u, tdsops) end subroutine tds_solve_cuda subroutine tds_solve_dist(self, du, u, tdsops, blocks, threads) + !! Solve distributed tridiagonal systems using CUDA kernels and MPI. + !! + !! 
Performs forward sweep, exchanges boundary data via MPI (using device
+    !! pointers for potential GPU-aware MPI benefit), then backward substitution.
     implicit none
 
     class(cuda_backend_t) :: self
-    class(field_t), intent(inout) :: du
-    class(field_t), intent(in) :: u
-    class(tdsops_t), intent(in) :: tdsops
-    type(dim3), intent(in) :: blocks, threads
+    class(field_t), intent(inout) :: du !! Output: solution
+    class(field_t), intent(in) :: u !! Input: RHS
+    class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operators
+    type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration
 
     real(dp), device, pointer, dimension(:, :, :) :: du_dev, u_dev
 
@@ -512,12 +573,16 @@ subroutine tds_solve_dist(self, du, u, tdsops, blocks, threads)
   end subroutine tds_solve_dist
 
   subroutine reorder_cuda(self, u_o, u_i, direction)
+    !! Reorder field data between pencil orientations using CUDA kernels.
+    !!
+    !! Implements [[m_base_backend(module):reorder(interface)]].
+    !! Calls appropriate [[m_cuda_kernels_reorder(module)]] kernel based on direction.
     implicit none
 
     class(cuda_backend_t) :: self
-    class(field_t), intent(inout) :: u_o
-    class(field_t), intent(in) :: u_i
-    integer, intent(in) :: direction
+    class(field_t), intent(inout) :: u_o !! Output: reordered field
+    class(field_t), intent(in) :: u_i !! Input: source field
+    integer, intent(in) :: direction !! Reordering direction (RDR_X2Y, RDR_Y2Z, etc)
 
     real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d, u_temp_d
     class(field_t), pointer :: u_temp
@@ -632,9 +697,10 @@ subroutine reorder_cuda
   end subroutine reorder_cuda
 
   subroutine sum_yintox_cuda(self, u, u_y)
+    !! Sum y-pencil field into x-pencil using CUDA kernel.
     implicit none
 
     class(cuda_backend_t) :: self
+    class(field_t), intent(inout) :: u !! Output: x-pencil result
+    class(field_t), intent(in) :: u_y !! Input: y-pencil field to sum
-    class(field_t), intent(inout) :: u
-    class(field_t), intent(in) :: u_y
 
@@ -654,9 +722,10 @@ end subroutine sum_yintox_cuda
 
   subroutine sum_zintox_cuda(self, u, u_z)
+    !! Sum z-pencil field into x-pencil using CUDA kernel.
     implicit none
 
     class(cuda_backend_t) :: self
+    class(field_t), intent(inout) :: u !! Output: x-pencil result
+    class(field_t), intent(in) :: u_z !! Input: z-pencil field to sum
-    class(field_t), intent(inout) :: u
-    class(field_t), intent(in) :: u_z
 
@@ -676,11 +747,15 @@ end subroutine sum_zintox_cuda
   end subroutine sum_zintox_cuda
 
   subroutine veccopy_cuda(self, dst, src)
+    !! Copy field data using CUDA kernel.
+    !!
+    !! Implements [[m_base_backend(module):veccopy(interface)]].
+    !! Uses [[m_cuda_kernels_fieldops(module):buffer_copy(interface)]] kernel.
     implicit none
 
     class(cuda_backend_t) :: self
-    class(field_t), intent(inout) :: dst
-    class(field_t), intent(in) :: src
+    class(field_t), intent(inout) :: dst !! Output: destination field
+    class(field_t), intent(in) :: src !! Input: source field
 
     real(dp), device, pointer, dimension(:, :, :) :: dst_d, src_d
     type(dim3) :: blocks, threads
@@ -697,10 +772,14 @@ end subroutine veccopy_cuda
   end subroutine veccopy_cuda
 
   subroutine vecadd_cuda(self, a, x, b, y)
+    !! Compute linear combination $y = ax + by$ using CUDA kernel.
+    !!
+    !! Implements [[m_base_backend(module):vecadd(interface)]].
+    !! Uses [[m_cuda_kernels_fieldops(module):axpby(interface)]] kernel.
     implicit none
 
     class(cuda_backend_t) :: self
-    real(dp), intent(in) :: a
+    real(dp), intent(in) :: a !! Scalar coefficient for x
     class(field_t), intent(in) :: x
     real(dp), intent(in) :: b
     class(field_t), intent(inout) :: y
@@ -720,10 +799,13 @@ end subroutine vecadd_cuda
   end subroutine vecadd_cuda
 
   subroutine vecmult_cuda(self, y, x)
-    !! [[m_base_backend(module):vecmult(interface)]]
+    !! Compute element-wise product $y = x \cdot y$ using CUDA kernel.
+    !!
+    !! Implements [[m_base_backend(module):vecmult(interface)]].
+    !! Uses [[m_cuda_kernels_fieldops(module):pwmul(interface)]] kernel.
     implicit none
 
     class(cuda_backend_t) :: self
+    class(field_t), intent(inout) :: y !! Input/Output: multiplied in-place
+    class(field_t), intent(in) :: x !! Input: multiplier
-    class(field_t), intent(inout) :: y
-    class(field_t), intent(in) :: x
     real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d
@@ -741,11 +825,14 @@ end subroutine vecmult_cuda
   end subroutine vecmult_cuda
 
   real(dp) function scalar_product_cuda(self, x, y) result(s)
-    !! [[m_base_backend(module):scalar_product(interface)]]
+    !! Compute global scalar product $\langle x, y \rangle$ using CUDA kernel and MPI reduction.
+    !!
+    !! Implements [[m_base_backend(module):scalar_product(interface)]].
+    !! Uses [[m_cuda_kernels_fieldops(module):scalar_product(interface)]] kernel.
     implicit none
 
     class(cuda_backend_t) :: self
-    class(field_t), intent(in) :: x, y
+    class(field_t), intent(in) :: x, y !! Input fields
 
     real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d
     real(dp), device, allocatable :: sum_d
@@ -791,12 +878,12 @@ end subroutine scalar_product_cuda
   end function scalar_product_cuda
 
   subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
+    !! Copy boundary data into MPI send buffers using CUDA kernel.
    implicit none
 
-    real(dp), device, dimension(:, :, :), intent(out) :: u_send_s_dev, &
-      u_send_e_dev
-    real(dp), device, dimension(:, :, :), intent(in) :: u_dev
-    integer, intent(in) :: n
+    real(dp), device, dimension(:, :, :), intent(out) :: u_send_s_dev, u_send_e_dev !! Send buffers
+    real(dp), device, dimension(:, :, :), intent(in) :: u_dev !! Source field
+    integer, intent(in) :: n !! 
Grid dimension type(dim3) :: blocks, threads integer :: n_halo = 4 @@ -809,13 +896,16 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) end subroutine copy_into_buffers subroutine field_max_mean_cuda(self, max_val, mean_val, f, enforced_data_loc) - !! [[m_base_backend(module):field_max_mean(interface)]] + !! Compute field maximum and mean using CUDA kernel and MPI reductions. + !! + !! Implements [[m_base_backend(module):field_max_mean(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):field_max_sum(interface)]] kernel. implicit none class(cuda_backend_t) :: self - real(dp), intent(out) :: max_val, mean_val - class(field_t), intent(in) :: f - integer, optional, intent(in) :: enforced_data_loc + real(dp), intent(out) :: max_val, mean_val !! Output: global maximum and mean + class(field_t), intent(in) :: f !! Input field + integer, optional, intent(in) :: enforced_data_loc !! Override field data location real(dp), device, pointer, dimension(:, :, :) :: f_d real(dp), device, allocatable :: max_d, sum_d @@ -871,11 +961,15 @@ subroutine field_max_mean_cuda(self, max_val, mean_val, f, enforced_data_loc) end subroutine field_max_mean_cuda subroutine field_scale_cuda(self, f, a) + !! Scale field by constant $f = a \cdot f$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):field_ops(interface)]] (field_scale binding). + !! Uses [[m_cuda_kernels_fieldops(module):field_scale(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! Field to scale in-place + real(dp), intent(in) :: a !! Scaling factor real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -891,11 +985,15 @@ subroutine field_scale_cuda(self, f, a) end subroutine field_scale_cuda subroutine field_shift_cuda(self, f, a) + !! Shift field by constant $f = f + a$ using CUDA kernel. + !! + !! 
Implements [[m_base_backend(module):field_ops(interface)]] (field_shift binding). + !! Uses [[m_cuda_kernels_fieldops(module):field_shift(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! Field to shift in-place + real(dp), intent(in) :: a !! Shift amount real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -911,13 +1009,13 @@ subroutine field_shift_cuda(self, f, a) end subroutine field_shift_cuda subroutine field_set_face_cuda(self, f, c_start, c_end, face) - !! [[m_base_backend(module):field_set_face(subroutine)]] + !! Set boundary face values using CUDA kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: f - real(dp), intent(in) :: c_start, c_end - integer, intent(in) :: face + class(field_t), intent(inout) :: f !! Field to modify + real(dp), intent(in) :: c_start, c_end !! Values for start and end faces + integer, intent(in) :: face !! Face identifier (X_FACE, Y_FACE, Z_FACE) real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -952,11 +1050,14 @@ subroutine field_set_face_cuda(self, f, c_start, c_end, face) end subroutine field_set_face_cuda real(dp) function field_volume_integral_cuda(self, f) result(s) - !! volume integral of a field + !! Compute volume integral using CUDA kernel and MPI reduction. + !! + !! Implements [[m_base_backend(module):field_reduce(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):volume_integral(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! Input field real(dp), device, pointer, dimension(:, :, :) :: f_d real(dp), device, allocatable :: integral_d @@ -991,28 +1092,34 @@ real(dp) function field_volume_integral_cuda(self, f) result(s) end function field_volume_integral_cuda subroutine copy_data_to_f_cuda(self, f, data) + !! 
Copy host array to device field. class(cuda_backend_t), intent(inout) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(inout) :: data + class(field_t), intent(inout) :: f !! Target device field + real(dp), dimension(:, :, :), intent(inout) :: data !! Source host array select type (f); type is (cuda_field_t); f%data_d = data; end select end subroutine copy_data_to_f_cuda subroutine copy_f_to_data_cuda(self, data, f) + !! Copy device field to host array. class(cuda_backend_t), intent(inout) :: self - real(dp), dimension(:, :, :), intent(out) :: data - class(field_t), intent(in) :: f + real(dp), dimension(:, :, :), intent(out) :: data !! Target host array + class(field_t), intent(in) :: f !! Source device field select type (f); type is (cuda_field_t); data = f%data_d; end select end subroutine copy_f_to_data_cuda subroutine init_cuda_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise CUDA FFT Poisson solver. + !! + !! Implements [[m_base_backend(module):init_poisson_fft(interface)]]. + !! Allocates [[m_cuda_poisson_fft(module):cuda_poisson_fft_t(type)]] instance. implicit none class(cuda_backend_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + type(mesh_t), intent(in) :: mesh !! Computational mesh + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + logical, optional, intent(in) :: lowmem !! Low memory mode flag allocate (cuda_poisson_fft_t :: self%poisson_fft) @@ -1024,8 +1131,9 @@ subroutine init_cuda_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) end subroutine init_cuda_poisson_fft subroutine resolve_field_t(u_dev, u) - real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_dev - class(field_t), intent(in) :: u + !! Helper to extract device pointer from cuda_field_t. + real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_dev !! 
Device pointer + class(field_t), intent(in) :: u !! Field object select type (u) type is (cuda_field_t) diff --git a/src/backend/cuda/common.f90 b/src/backend/cuda/common.f90 index 6165c38a7..d67dc477a 100644 --- a/src/backend/cuda/common.f90 +++ b/src/backend/cuda/common.f90 @@ -1,6 +1,16 @@ module m_cuda_common + !! Common constants for CUDA backend. + !! + !! CUDA GPUs execute threads in groups of 32 called warps. Setting the + !! pencil size to 32 ensures coalesced memory access patterns, where all + !! threads in a warp access consecutive memory locations simultaneously. + !! This is critical for GPU memory bandwidth efficiency. + !! + !! **Performance impact:** Matching the hardware warp size eliminates + !! divergence and maximises memory throughput, typically improving + !! performance by 2-3x compared to non-coalesced access. implicit none - integer, parameter :: SZ = 32 + integer, parameter :: SZ = 32 !! Pencil size matching GPU warp width end module m_cuda_common diff --git a/src/backend/cuda/exec_dist.f90 b/src/backend/cuda/exec_dist.f90 index 5a71bcc76..08048481c 100644 --- a/src/backend/cuda/exec_dist.f90 +++ b/src/backend/cuda/exec_dist.f90 @@ -1,4 +1,9 @@ module m_cuda_exec_dist + !! Distributed compact scheme execution on GPU. + !! + !! Orchestrates CUDA kernel launches and MPI halo exchange for distributed + !! compact finite difference schemes. Handles both generic derivative operations + !! and fused transport equation computation. use cudafor use mpi @@ -17,21 +22,28 @@ subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & tdsops, nproc, pprev, pnext, blocks, threads & ) + !! Execute distributed compact scheme derivative $du = d(u)$ on GPU. + !! + !! Calls distributed kernel, exchanges halo data for $2 \times 2$ boundary + !! systems, then applies substitution kernel. implicit none ! 
du = d(u) - real(dp), device, dimension(:, :, :), intent(out) :: du - real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e ! The ones below are intent(out) just so that we can write data in them, ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care the data they pass back + real(dp), device, dimension(:, :, :), intent(out) :: du !! Output: derivative + real(dp), device, dimension(:, :, :), intent(in) :: u !! Input: field with local data + real(dp), device, dimension(:, :, :), intent(in) :: u_recv_s, u_recv_e !! Halo data from neighbours + + ! Temporary buffers for halo exchange (overwritten during computation) real(dp), device, dimension(:, :, :), intent(out) :: & du_send_s, du_send_e, du_recv_s, du_recv_e - type(cuda_tdsops_t), intent(in) :: tdsops - integer, intent(in) :: nproc, pprev, pnext - type(dim3), intent(in) :: blocks, threads + type(cuda_tdsops_t), intent(in) :: tdsops !! Tridiagonal operators + integer, intent(in) :: nproc, pprev, pnext !! MPI ranks (total, previous, next) + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration integer :: n_data @@ -64,27 +76,28 @@ subroutine exec_dist_transeq_3fused( & tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, & blocks, threads & ) + !! Execute fused transport equation computation on GPU with distributed compact scheme. + !! + !! Computes $r\_du = -\frac{1}{2}(v \frac{\partial u}{\partial x} + \frac{\partial (uv)}{\partial x}) + \nu \frac{\partial^2 u}{\partial x^2}$ + !! Launches distributed kernel for three operators (du, dud, d2u), exchanges halo data for all + !! boundary systems in one batch, then applies substitution kernel. implicit none - ! 
r_du = -1/2*(v*d1(u) + d1(u*v)) + nu*d2(u) - !> The result array, it is also used as temporary storage - real(dp), device, dimension(:, :, :), intent(out) :: r_du - real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - real(dp), device, dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e + real(dp), device, dimension(:, :, :), intent(out) :: r_du !! Output: transport equation RHS + real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Field u with halos + real(dp), device, dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e !! Field v with halos - ! The ones below are intent(out) just so that we can write data in them, - ! not because we actually need the data they store later where this - ! subroutine is called. We absolutely don't care the data they pass back + ! Temporary storage for derivatives and halo exchange buffers real(dp), device, dimension(:, :, :), intent(out) :: dud, d2u real(dp), device, dimension(:, :, :), intent(out) :: & du_send_s, du_send_e, du_recv_s, du_recv_e, & dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e - type(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - real(dp), intent(in) :: nu - integer, intent(in) :: nproc, pprev, pnext - type(dim3), intent(in) :: blocks, threads + type(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for each derivative + real(dp), intent(in) :: nu !! Kinematic viscosity + integer, intent(in) :: nproc, pprev, pnext !! MPI ranks + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration integer :: n_data diff --git a/src/backend/cuda/exec_thom.f90 b/src/backend/cuda/exec_thom.f90 index 50e757579..3834a20ac 100644 --- a/src/backend/cuda/exec_thom.f90 +++ b/src/backend/cuda/exec_thom.f90 @@ -1,4 +1,8 @@ module m_cuda_exec_thom + !! Thomas algorithm execution on GPU for local tridiagonal systems. + !! + !! 
Dispatches to periodic or non-periodic Thomas kernels based on + !! boundary conditions. No MPI communication required. use cudafor use m_common, only: dp @@ -10,12 +14,15 @@ module m_cuda_exec_thom contains subroutine exec_thom_tds_compact(du, u, tdsops, blocks, threads) + !! Execute Thomas algorithm for compact scheme derivative $du = d(u)$ on GPU. + !! + !! Selects periodic or non-periodic kernel variant based on operator configuration. implicit none - real(dp), device, dimension(:, :, :), intent(out) :: du - real(dp), device, dimension(:, :, :), intent(in) :: u - type(cuda_tdsops_t), intent(in) :: tdsops - type(dim3), intent(in) :: blocks, threads + real(dp), device, dimension(:, :, :), intent(out) :: du !! Output: derivative + real(dp), device, dimension(:, :, :), intent(in) :: u !! Input: field + type(cuda_tdsops_t), intent(in) :: tdsops !! Tridiagonal operators + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration if (tdsops%periodic) then call der_univ_thom_per<<>>( & !& diff --git a/src/backend/cuda/kernels/distributed.f90 b/src/backend/cuda/kernels/distributed.f90 index 8e7a1ba94..4cf97fc19 100644 --- a/src/backend/cuda/kernels/distributed.f90 +++ b/src/backend/cuda/kernels/distributed.f90 @@ -1,4 +1,9 @@ module m_cuda_kernels_dist + !! CUDA kernels for distributed compact finite difference schemes. + !! + !! GPU kernels implementing forward and backward sweeps for compact schemes + !! across MPI domain boundaries. Handles stencil application using halo data, + !! forward elimination, and backward substitution for distributed tridiagonal systems. use cudafor use m_common, only: dp @@ -11,16 +16,20 @@ attributes(global) subroutine der_univ_dist( & du, send_u_s, send_u_e, u, u_s, u_e, & n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, ffr, fbc, faf & ) + !! CUDA kernel for distributed compact scheme forward sweep and boundary setup. + !! + !! Applies compact stencils using local data (u) and halo data (u_s, u_e) from + !! neighbours. 
Performs forward elimination and prepares boundary data for MPI exchange. implicit none - ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_s, & - send_u_e - real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e - integer, value, intent(in) :: n_tds, n_rhs - real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: derivatives with forward elimination + real(dp), device, intent(out), dimension(:, :, :) :: send_u_s, send_u_e !! Boundary data for MPI exchange + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: local field data + real(dp), device, intent(in), dimension(:, :, :) :: u_s, u_e !! Halo data from start/end neighbours + integer, value, intent(in) :: n_tds, n_rhs !! Grid and RHS dimensions + real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Boundary stencil coefficients + real(dp), device, intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients + real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf !! Forward elimination factors ! Local variables integer :: i, j, b, k, lj @@ -148,17 +157,22 @@ end subroutine der_univ_dist attributes(global) subroutine der_univ_subs(du, recv_u_s, recv_u_e, & n, dist_sa, dist_sc, strch) + !! Backward substitution for distributed compact scheme. + !! + !! Completes the tridiagonal solve using boundary solutions received from + !! neighbouring MPI ranks. Applies Sherman-Morrison-like correction for + !! distributed system using Toeplitz matrix symmetry properties. implicit none ! 
Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e - real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc, strch - integer, value, intent(in) :: n + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Final derivative solution + real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e !! Boundary solutions from neighbours + real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc, strch !! Distributed coefficients and stretching + integer, value, intent(in) :: n !! Number of local grid points ! Local variables - integer :: i, j, b - real(dp) :: ur, bl, recp, du_s, du_e + integer :: i, j, b !! Thread, loop, and block indices + real(dp) :: ur, bl, recp, du_s, du_e !! Upper-right, bottom-left, reciprocal, boundary solutions i = threadIdx%x b = blockIdx%x @@ -201,39 +215,44 @@ attributes(global) subroutine transeq_3fused_dist( & dud_coeffs_s, dud_coeffs_e, dud_coeffs, dud_fw, dud_bw, dud_af, & d2u_coeffs_s, d2u_coeffs_e, d2u_coeffs, d2u_fw, d2u_bw, d2u_af & ) + !! Distributed forward sweep for 3 fused transport equation derivatives. + !! + !! Computes du, dud (convective), and d2u simultaneously using independent + !! compact stencils. Performs forward elimination and prepares boundary data + !! for MPI exchange. Optimised for transport equation with convective terms. implicit none ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, dud, d2u + real(dp), device, intent(out), dimension(:, :, :) :: du, dud, d2u !! Output: Three derivative fields real(dp), device, intent(out), dimension(:, :, :) :: & - send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e + send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e !! Boundary data for MPI exchange real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e, & - v, v_s, v_e - integer, value, intent(in) :: n_tds, n_rhs + v, v_s, v_e !! 
Input fields and halos + integer, value, intent(in) :: n_tds, n_rhs !! Grid dimensions real(dp), device, intent(in) :: du_coeffs_s(:, :), du_coeffs_e(:, :), & - du_coeffs(:) - real(dp), device, intent(in) :: du_fw(:), du_bw(:), du_af(:) + du_coeffs(:) !! du stencil coefficients + real(dp), device, intent(in) :: du_fw(:), du_bw(:), du_af(:) !! du forward/backward/alpha factors real(dp), device, intent(in) :: dud_coeffs_s(:, :), dud_coeffs_e(:, :), & - dud_coeffs(:) - real(dp), device, intent(in) :: dud_fw(:), dud_bw(:), dud_af(:) + dud_coeffs(:) !! dud stencil coefficients + real(dp), device, intent(in) :: dud_fw(:), dud_bw(:), dud_af(:) !! dud forward/backward/alpha factors real(dp), device, intent(in) :: d2u_coeffs_s(:, :), d2u_coeffs_e(:, :), & - d2u_coeffs(:) - real(dp), device, intent(in) :: d2u_fw(:), d2u_bw(:), d2u_af(:) + d2u_coeffs(:) !! d2u stencil coefficients + real(dp), device, intent(in) :: d2u_fw(:), d2u_bw(:), d2u_af(:) !! d2u forward/backward/alpha factors ! Local variables - integer :: i, j, b + integer :: i, j, b !! Thread, loop, and block indices real(dp) :: du_c_m4, du_c_m3, du_c_m2, du_c_m1, du_c_j, & du_c_p1, du_c_p2, du_c_p3, du_c_p4, & - du_alpha, du_last_r + du_alpha, du_last_r !! du stencil coefficients and factors real(dp) :: dud_c_m4, dud_c_m3, dud_c_m2, dud_c_m1, dud_c_j, & dud_c_p1, dud_c_p2, dud_c_p3, dud_c_p4, & - dud_alpha, dud_last_r + dud_alpha, dud_last_r !! dud stencil coefficients and factors real(dp) :: d2u_c_m4, d2u_c_m3, d2u_c_m2, d2u_c_m1, d2u_c_j, & d2u_c_p1, d2u_c_p2, d2u_c_p3, d2u_c_p4, & - d2u_alpha, d2u_last_r - real(dp) :: temp_du, temp_dud, temp_d2u - real(dp) :: u_m4, u_m3, u_m2, u_m1, u_j, u_p1, u_p2, u_p3, u_p4 + d2u_alpha, d2u_last_r !! d2u stencil coefficients and factors + real(dp) :: temp_du, temp_dud, temp_d2u !! Temporary derivative values + real(dp) :: u_m4, u_m3, u_m2, u_m1, u_j, u_p1, u_p2, u_p3, u_p4 !! 
Reused field values real(dp) :: v_m4, v_m3, v_m2, v_m1, v_j, v_p1, v_p2, v_p3, v_p4 real(dp) :: old_du, old_dud, old_d2u @@ -593,26 +612,31 @@ attributes(global) subroutine transeq_3fused_subs( & n, nu, du_sa, du_sc, du_strch, dud_sa, dud_sc, dud_strch, & d2u_sa, d2u_sc, d2u_strch, d2u_strch_cor & ) + !! Backward substitution for 3 fused transport equation derivatives. + !! + !! Completes distributed tridiagonal solves for du, dud, d2u using boundary + !! solutions from neighbours. Combines results to form RHS of transport equation: + !! r_du = -conv*dud + nu*d2u. Applies Sherman-Morrison corrections for all three fields. implicit none ! Arguments !> The result array, it stores 'du' first then its overwritten - real(dp), device, intent(inout), dimension(:, :, :) :: r_du - real(dp), device, intent(in), dimension(:, :, :) :: conv, dud, d2u + real(dp), device, intent(inout), dimension(:, :, :) :: r_du !! In/out: Stores du then overwritten with RHS + real(dp), device, intent(in), dimension(:, :, :) :: conv, dud, d2u !! Input: Convection velocity and derivatives real(dp), device, intent(in), dimension(:, :, :) :: & - recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e - integer, value, intent(in) :: n - real(dp), value, intent(in) :: nu + recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e !! Boundary solutions from neighbours + integer, value, intent(in) :: n !! Number of local grid points + real(dp), value, intent(in) :: nu !! Kinematic viscosity real(dp), device, intent(in), dimension(:) :: du_sa, du_sc, du_strch, & dud_sa, dud_sc, dud_strch, & d2u_sa, d2u_sc, d2u_strch, & - d2u_strch_cor + d2u_strch_cor !! Distributed coefficients for all three fields ! Local variables - integer :: i, j, b - real(dp) :: ur, bl, recp - real(dp) :: du_temp, dud_temp, d2u_temp - real(dp) :: du_s, du_e, dud_s, dud_e, d2u_s, d2u_e + integer :: i, j, b !! Thread, loop, and block indices + real(dp) :: ur, bl, recp !! 
Upper-right, bottom-left, reciprocal for Sherman-Morrison + real(dp) :: du_temp, dud_temp, d2u_temp !! Temporary derivative values + real(dp) :: du_s, du_e, dud_s, dud_e, d2u_s, d2u_e !! Boundary solutions for all three fields i = threadIdx%x b = blockIdx%x diff --git a/src/backend/cuda/kernels/fieldops.f90 b/src/backend/cuda/kernels/fieldops.f90 index 949bc6ab6..d3147fa5a 100644 --- a/src/backend/cuda/kernels/fieldops.f90 +++ b/src/backend/cuda/kernels/fieldops.f90 @@ -1,4 +1,10 @@ module m_cuda_kernels_fieldops + !! CUDA kernels for field operations (copy, scale, vector arithmetic, reductions). + !! + !! Provides GPU kernels for basic field manipulation: copying, scaling, shifting, + !! linear combinations (AXPBY), pointwise multiplication, scalar products, and + !! reductions (max, sum, volume integral). All kernels use thread-per-pencil-point + !! parallelisation with [[m_cuda_common(module):SZ(variable)]] threads per block. use cudafor use m_common, only: dp @@ -7,13 +13,16 @@ module m_cuda_kernels_fieldops contains attributes(global) subroutine copy(n, dst, src) + !! Copy field data: dst = src. implicit none - integer, value, intent(in) :: n - real(dp), device, intent(out), dimension(:, :, :) :: dst - real(dp), device, intent(in), dimension(:, :, :) :: src + integer, value, intent(in) :: n !! Pencil length + real(dp), device, intent(out), dimension(:, :, :) :: dst !! Destination array + real(dp), device, intent(in), dimension(:, :, :) :: src !! Source array - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -25,14 +34,17 @@ attributes(global) subroutine copy(n, dst, src) end subroutine copy attributes(global) subroutine axpby(n, alpha, x, beta, y) + !! Compute linear combination: y = alpha*x + beta*y. 
implicit none - integer, value, intent(in) :: n - real(dp), value, intent(in) :: alpha, beta - real(dp), device, intent(in), dimension(:, :, :) :: x - real(dp), device, intent(inout), dimension(:, :, :) :: y + integer, value, intent(in) :: n !! Pencil length + real(dp), value, intent(in) :: alpha, beta !! Scalar coefficients + real(dp), device, intent(in), dimension(:, :, :) :: x !! Input array + real(dp), device, intent(inout), dimension(:, :, :) :: y !! Input/Output array - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -44,13 +56,16 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y) end subroutine axpby attributes(global) subroutine pwmul(y, x, n) + !! Pointwise multiplication: y = y * x. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: y - real(dp), device, intent(in), dimension(:, :, :) :: x - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: y !! Input/Output array + real(dp), device, intent(in), dimension(:, :, :) :: x !! Multiplier array + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -62,13 +77,20 @@ attributes(global) subroutine pwmul(y, x, n) end subroutine pwmul attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) + !! Copy halo regions into send buffers. + !! + !! Extracts first and last n_halo planes into separate buffers for MPI communication. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s, u_send_e - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n, n_halo + real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s !! 
Start buffer + real(dp), device, intent(inout), dimension(:, :, :) :: u_send_e !! End buffer + real(dp), device, intent(in), dimension(:, :, :) :: u !! Source field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_halo !! Halo width - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Halo plane index + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -81,13 +103,16 @@ attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) end subroutine buffer_copy attributes(global) subroutine field_scale(f, alpha, n) + !! Scale field by constant: f = alpha * f. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: alpha - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to scale + real(dp), value, intent(in) :: alpha !! Scaling factor + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -99,13 +124,16 @@ attributes(global) subroutine field_scale(f, alpha, n) end subroutine field_scale attributes(global) subroutine field_shift(f, const, n) + !! Shift field by constant: f = f + const. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: const - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to shift + real(dp), value, intent(in) :: const !! Shift constant + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! 
Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -117,14 +145,24 @@ attributes(global) subroutine field_shift(f, const, n) end subroutine field_shift attributes(global) subroutine scalar_product(s, x, y, n, n_i_pad, n_j) + !! Compute scalar product with atomic reduction: s += sum(x * y). + !! + !! Uses atomic addition to accumulate partial sums from each pencil. implicit none - real(dp), device, intent(inout) :: s - real(dp), device, intent(in), dimension(:, :, :) :: x, y - integer, value, intent(in) :: n, n_i_pad, n_j + real(dp), device, intent(inout) :: s !! Accumulated scalar product + real(dp), device, intent(in), dimension(:, :, :) :: x !! First field + real(dp), device, intent(in), dimension(:, :, :) :: y !! Second field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count - real(dp) :: s_pncl !! pencil sum - integer :: i, j, b, b_i, b_j, ierr + real(dp) :: s_pncl !! Pencil sum + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x @@ -142,14 +180,26 @@ attributes(global) subroutine scalar_product(s, x, y, n, n_i_pad, n_j) end subroutine scalar_product attributes(global) subroutine field_max_sum(max_f, sum_f, f, n, n_i_pad, n_j) + !! Compute field maximum and sum with atomic reductions. + !! + !! Uses atomic max and add operations to accumulate pencil-wise results. implicit none - real(dp), device, intent(inout) :: max_f, sum_f - real(dp), device, intent(in), dimension(:, :, :) :: f - integer, value, intent(in) :: n, n_i_pad, n_j - - real(dp) :: max_pncl, sum_pncl, val - integer :: i, j, b, b_i, b_j, ierr + real(dp), device, intent(inout) :: max_f !! Accumulated maximum + real(dp), device, intent(inout) :: sum_f !! 
Accumulated sum + real(dp), device, intent(in), dimension(:, :, :) :: f !! Input field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count + + real(dp) :: max_pncl !! Pencil maximum + real(dp) :: sum_pncl !! Pencil sum + real(dp) :: val !! Absolute value + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x @@ -171,15 +221,21 @@ attributes(global) subroutine field_max_sum(max_f, sum_f, f, n, n_i_pad, n_j) end subroutine field_max_sum attributes(global) subroutine field_set_y_face(f, c_start, c_end, nx, ny, nz) - !! Set domain Y_FACE to a constant - !! c_start at the bottom and c_end at the top + !! Set Y-face boundary values to constants. + !! + !! Sets bottom face (y=0) to c_start and top face (y=L) to c_end. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: c_start, c_end - integer, value, intent(in) :: nx, ny, nz + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to modify + real(dp), value, intent(in) :: c_start !! Bottom boundary value + real(dp), value, intent(in) :: c_end !! Top boundary value + integer, value, intent(in) :: nx, ny, nz !! Grid dimensions - integer :: i, j, b, n_mod, b_end + integer :: i !! Thread index + integer :: j !! X-coordinate + integer :: b !! Z-coordinate block + integer :: n_mod !! Modulo for top boundary indexing + integer :: b_end !! Top boundary block index j = threadIdx%x + (blockIdx%x - 1)*blockDim%x ! from 1 to nx b = blockIdx%y ! from 1 to nz @@ -195,14 +251,23 @@ attributes(global) subroutine field_set_y_face(f, c_start, c_end, nx, ny, nz) end subroutine field_set_y_face attributes(global) subroutine volume_integral(s, f, n, n_i_pad, n_j) + !! 
Compute volume integral with atomic reduction: s += sum(f). + !! + !! Uses atomic addition to accumulate partial sums from each pencil. implicit none - real(dp), device, intent(inout) :: s - real(dp), device, intent(in), dimension(:, :, :) :: f - integer, value, intent(in) :: n, n_i_pad, n_j - - real(dp) :: s_pncl !! pencil sum - integer :: i, j, b, b_i, b_j, ierr + real(dp), device, intent(inout) :: s !! Accumulated integral + real(dp), device, intent(in), dimension(:, :, :) :: f !! Input field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count + + real(dp) :: s_pncl !! Pencil sum + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x diff --git a/src/backend/cuda/kernels/reorder.f90 b/src/backend/cuda/kernels/reorder.f90 index 4065a2595..cd96a029b 100644 --- a/src/backend/cuda/kernels/reorder.f90 +++ b/src/backend/cuda/kernels/reorder.f90 @@ -1,4 +1,10 @@ module m_cuda_kernels_reorder + !! CUDA kernels for pencil reordering and accumulation between X/Y/Z orientations. + !! + !! Provides GPU kernels for rearranging field data between different pencil decompositions + !! (X-pencils, Y-pencils, Z-pencils, and Cartesian). Most kernels use shared memory tiles + !! for coalesced memory access. Thread blocks use [[m_cuda_common(module):SZ(variable)]] + !! configuration (32x1 or 32x32 depending on operation). use cudafor use m_common, only: dp @@ -7,14 +13,18 @@ module m_cuda_kernels_reorder contains attributes(global) subroutine reorder_c2x(u_x, u_c, nz) + !! Reorder from Cartesian to X-pencil orientation. + !! + !! Uses shared memory transpose for efficient reordering. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_c - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_c !! Input: Cartesian data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -42,14 +52,18 @@ attributes(global) subroutine reorder_c2x(u_x, u_c, nz) end subroutine reorder_c2x attributes(global) subroutine reorder_x2c(u_c, u_x, nz) + !! Reorder from X-pencil to Cartesian orientation. + !! + !! Inverse of reorder_c2x. Uses shared memory transpose. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_c - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_c !! Output: Cartesian data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -77,14 +91,18 @@ attributes(global) subroutine reorder_x2c(u_c, u_x, nz) end subroutine reorder_x2c attributes(global) subroutine reorder_x2y(u_y, u_x, nz) + !! Reorder from X-pencil to Y-pencil orientation. + !! + !! Uses shared memory transpose for efficient reordering. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_y - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_y !! Output: Y-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -112,13 +130,19 @@ attributes(global) subroutine reorder_x2y(u_y, u_x, nz) end subroutine reorder_x2y attributes(global) subroutine reorder_x2z(u_z, u_x, nz) + !! Reorder from X-pencil to Z-pencil orientation. + !! + !! No shared memory needed - memory access pattern is already favourable. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_z - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_z !! Output: Z-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i !! Thread index + integer :: j !! Loop index + integer :: b_i, b_j !! Block indices + integer :: nx !! Grid X-dimension i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x @@ -132,14 +156,15 @@ attributes(global) subroutine reorder_x2z(u_z, u_x, nz) end subroutine reorder_x2z attributes(global) subroutine reorder_y2x(u_x, u_y, nz) + !! Reorder from Y-pencil to X-pencil orientation. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j, b_i, b_j, b_k !! Thread and block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -167,14 +192,15 @@ attributes(global) subroutine reorder_y2x(u_x, u_y, nz) end subroutine reorder_y2x attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) + !! Reorder from Y-pencil to Z-pencil orientation. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_z - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nx, nz + real(dp), device, intent(out), dimension(:, :, :) :: u_z !! Output: Z-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data + integer, value, intent(in) :: nx, nz !! Grid dimensions - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j, b_i, b_j, b_k !! Thread and block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -202,13 +228,16 @@ attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) end subroutine reorder_y2z attributes(global) subroutine reorder_z2x(u_x, u_z, nz) + !! Reorder from Z-pencil to X-pencil orientation. + !! + !! No shared memory needed - favourable memory access pattern. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i, j, b_i, b_j, nx !! Thread, loop, block indices and grid size i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x @@ -220,14 +249,17 @@ attributes(global) subroutine reorder_z2x(u_x, u_z, nz) end subroutine reorder_z2x attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) + !! Reorder from Z-pencil to Y-pencil orientation. + !! + !! Uses shared memory tile for coalesced access pattern. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_y - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nx, nz + real(dp), device, intent(out), dimension(:, :, :) :: u_y !! Output: Y-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data + integer, value, intent(in) :: nx, nz !! X and Z dimension sizes - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory tile for transpose + integer :: i, j, b_i, b_j, b_k !! Thread, block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -255,14 +287,18 @@ attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) end subroutine reorder_z2y attributes(global) subroutine sum_yintox(u_x, u_y, nz) + !! Accumulate Y-pencil contributions into X-pencil data. + !! + !! Performs u_x += u_y with reordering. Uses shared memory tile + !! for efficient transpose and coalesced memory access. 
implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz + real(dp), device, intent(inout), dimension(:, :, :) :: u_x !! In/out: X-pencil data to accumulate into + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data to add + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory tile for transpose + integer :: i, j, b_i, b_j, b_k !! Thread, block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -294,14 +330,18 @@ attributes(global) subroutine sum_yintox(u_x, u_y, nz) end subroutine sum_yintox attributes(global) subroutine sum_zintox(u_x, u_z, nz) + !! Accumulate Z-pencil contributions into X-pencil data. + !! + !! Performs u_x += u_z with reordering. No shared memory needed + !! due to favourable memory access pattern. implicit none ! Arguments - real(dp), device, intent(inout), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nz + real(dp), device, intent(inout), dimension(:, :, :) :: u_x !! In/out: X-pencil data to accumulate into + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data to add + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i, j, b_i, b_j, nx !! Thread, loop, block indices and grid size i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x diff --git a/src/backend/cuda/kernels/spectral_processing.f90 b/src/backend/cuda/kernels/spectral_processing.f90 index 54b27b364..c8bd4256c 100644 --- a/src/backend/cuda/kernels/spectral_processing.f90 +++ b/src/backend/cuda/kernels/spectral_processing.f90 @@ -1,4 +1,13 @@ module m_cuda_spectral + !! 
CUDA kernels for spectral space processing and FFT post-processing. + !! + !! This module contains kernels for: + !! - Post-processing spectral transforms (forward/backward) + !! - Solving Poisson equations in spectral space + !! - Enforcing and undoing periodicity in Y-direction + !! + !! Implements spectral equivalence method from JCP 228 (2009), 5989-6015, Sec 4. + !! Handles both periodic (000) and non-periodic (010) boundary conditions. use cudafor use m_common, only: dp @@ -8,14 +17,16 @@ module m_cuda_spectral contains attributes(global) subroutine memcpy3D(dst, src, nx, ny, nz) - !! Copy data between x3d2 padded arrays and cuFFTMp descriptors + !! Copy data between x3d2 padded arrays and cuFFTMp descriptors. + !! + !! Each thread handles one Y-Z plane position, looping over X. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: dst - real(dp), device, intent(in), dimension(:, :, :) :: src - integer, value, intent(in) :: nx, ny, nz + real(dp), device, intent(inout), dimension(:, :, :) :: dst !! Output: Destination array + real(dp), device, intent(in), dimension(:, :, :) :: src !! Input: Source array + integer, value, intent(in) :: nx, ny, nz !! Grid dimensions - integer :: i, j, k + integer :: i, j, k !! Loop and thread indices j = threadIdx%x + (blockIdx%x - 1)*blockDim%x !ny k = blockIdx%y !nz @@ -34,23 +45,19 @@ attributes(global) subroutine process_spectral_000( & !! Post-processes the divergence of velocity in spectral space, including !! scaling w.r.t. grid size. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs forward post-processing, Poisson solve, and backward post-processing + !! using spectral equivalence method. Ref: JCP 228 (2009), 5989-6015, Sec 4. 
implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - complex(dp), device, intent(in), dimension(:, :, :) :: waves - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec - !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st - !> Grid size - integer, value, intent(in) :: nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence of velocity in spectral space + complex(dp), device, intent(in), dimension(:, :, :) :: waves !! Input: Spectral wavenumbers for Poisson solve + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size + integer, value, intent(in) :: y_sp_st !! Y-direction offset in the permuted slabs in spectral space + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz - real(dp) :: tmp_r, tmp_c, div_r, div_c + integer :: i, j, k, ix, iy, iz !! Loop and spectral mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c !! Temporary real/imaginary components j = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -130,26 +137,22 @@ attributes(global) subroutine process_spectral_010( & div_u, waves, nx_spec, ny_spec, y_sp_st, nx, ny, nz, & ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Post-process divergence field and solve Poisson equation in spectral space + !! for non-periodic boundary conditions in Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs forward post-processing with odd/even mode handling, Poisson solve, + !! and backward post-processing. 
Ref: JCP 228 (2009), 5989-6015, Sec 4. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - complex(dp), device, intent(in), dimension(:, :, :) :: waves - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec - !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st - !> Grid size - integer, value, intent(in) :: nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence field / pressure solution + complex(dp), device, intent(in), dimension(:, :, :) :: waves !! Input: Spectral wavenumbers for Poisson solve + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary components for left/right modes i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -288,25 +291,23 @@ end subroutine process_spectral_010 attributes(global) subroutine process_spectral_010_fw( & div_u, nx_spec, ny_spec, y_sp_st, nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Forward post-processing only for non-periodic Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! 
Performs normalisation, post-processing in X and Z, and odd/even mode handling + !! in Y. Used when Poisson solve and backward processing are separate steps. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence field to post-process + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs !> Grid size - integer, value, intent(in) :: nx, ny, nz + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary real/imaginary components i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -368,22 +369,19 @@ end subroutine process_spectral_010_fw attributes(global) subroutine process_spectral_010_poisson( & div_u, a_re, a_im, off, inc, nx_spec, n, nx, ny, nz & ) - !! Solve the Poisson equation at cell centres with non-perioic BC along y + !! Solve Poisson equation for non-periodic Y-direction using pentadiagonal solver. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Handles odd/even mode separation using offset and increment parameters. + !! Modifies pentadiagonal coefficients in-place during forward/backward passes. 
implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(inout), dimension(:, :, :, :) :: a_re, a_im - !> offset and increment. increment is 2 when considering only odd or even - integer, value, intent(in) :: off, inc - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, n, nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: RHS / Solution + real(dp), device, intent(inout), dimension(:, :, :, :) :: a_re, a_im !! In/out: Pentadiagonal coefficients (real/imag) + integer, value, intent(in) :: off, inc !! Offset and increment for odd/even modes + integer, value, intent(in) :: nx_spec, n, nx, ny, nz !! Grid dimensions - integer :: i, j, k, jm, nm - real(dp) :: tmp_r, tmp_c, div_r, div_c, epsilon + integer :: i, j, k, jm, nm !! Loop indices and mapped indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, epsilon !! Temporary variables and tolerance i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -527,25 +525,23 @@ end subroutine process_spectral_010_poisson attributes(global) subroutine process_spectral_010_bw( & div_u, nx_spec, ny_spec, y_sp_st, nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Backward post-processing only for non-periodic Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs odd/even mode recombination and post-processing in X and Z directions. + !! Completes the spectral-to-physical transformation after Poisson solve. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! 
In/out: Solution field to post-process + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs !> Grid size - integer, value, intent(in) :: nx, ny, nz + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary real/imaginary components i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -605,13 +601,17 @@ attributes(global) subroutine process_spectral_010_bw( & end subroutine process_spectral_010_bw attributes(global) subroutine enforce_periodicity_y(f_out, f_in, ny) + !! Enforce Y-direction periodicity by reordering data for non-periodic transforms. + !! + !! Maps full domain [1:ny] to symmetric layout required by non-periodic FFT. + !! First half: odd points, second half: even points in reverse order. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: f_out - real(dp), device, intent(in), dimension(:, :, :) :: f_in - integer, value, intent(in) :: ny + real(dp), device, intent(out), dimension(:, :, :) :: f_out !! Output: Reordered field + real(dp), device, intent(in), dimension(:, :, :) :: f_in !! Input: Original field + integer, value, intent(in) :: ny !! Y-dimension size - integer :: i, j, k + integer :: i, j, k !! 
Thread and loop indices i = threadIdx%x k = blockIdx%x @@ -626,13 +626,17 @@ attributes(global) subroutine enforce_periodicity_y(f_out, f_in, ny) end subroutine enforce_periodicity_y attributes(global) subroutine undo_periodicity_y(f_out, f_in, ny) + !! Undo Y-direction periodicity reordering after non-periodic transforms. + !! + !! Inverse of enforce_periodicity_y: reconstructs original domain layout + !! from symmetric FFT ordering. Restores odd/even point positions. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: f_out - real(dp), device, intent(in), dimension(:, :, :) :: f_in - integer, value, intent(in) :: ny + real(dp), device, intent(out), dimension(:, :, :) :: f_out !! Output: Restored field + real(dp), device, intent(in), dimension(:, :, :) :: f_in !! Input: Reordered field + integer, value, intent(in) :: ny !! Y-dimension size - integer :: i, j, k + integer :: i, j, k !! Thread and loop indices i = threadIdx%x k = blockIdx%x diff --git a/src/backend/cuda/kernels/thomas.f90 b/src/backend/cuda/kernels/thomas.f90 index b5bf81169..79ab698b1 100644 --- a/src/backend/cuda/kernels/thomas.f90 +++ b/src/backend/cuda/kernels/thomas.f90 @@ -1,4 +1,13 @@ module m_cuda_kernels_thom + !! CUDA kernels for Thomas algorithm-based tridiagonal solvers. + !! + !! Implements compact finite difference schemes using Thomas algorithm + !! for both periodic and non-periodic boundary conditions. Each thread + !! handles one pencil line through the domain. + !! + !! Variants: + !! - der_univ_thom: Non-periodic boundaries with explicit near-boundary stencils + !! - der_univ_thom_per: Periodic boundaries with cyclic reduction use cudafor use m_common, only: dp @@ -11,18 +20,23 @@ attributes(global) subroutine der_univ_thom( & du, u, n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, & thom_f, thom_s, thom_w, strch & ) + !! Compute derivatives using Thomas algorithm with non-periodic boundaries. + !! + !! Forward pass: Apply compact stencil and eliminate sub-diagonal. + !! 
Backward pass: Back-substitution to solve tridiagonal system. + !! Near-boundary points use explicit stencils from coeffs_s/coeffs_e. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n_tds, n_rhs - real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, strch + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Derivative field + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: Field to differentiate + integer, value, intent(in) :: n_tds, n_rhs !! Number of unknowns and RHS points + real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Start/end explicit stencil coefficients + real(dp), device, intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients (9-point) + real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, strch !! Thomas algorithm coefficients and stretching - integer :: i, j, b + integer :: i, j, b !! Thread, loop, and block indices - real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, temp_du + real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, temp_du !! Stencil coefficients and temporary i = threadIdx%x b = blockIdx%x @@ -120,21 +134,26 @@ end subroutine der_univ_thom attributes(global) subroutine der_univ_thom_per( & du, u, n, coeffs, alpha, thom_f, thom_s, thom_w, thom_p, strch & ) + !! Compute derivatives using Thomas algorithm with periodic boundaries. + !! + !! Forward pass: Apply periodic compact stencil with modulo indexing. + !! Backward pass: Standard back-substitution. + !! Periodic correction: Sherman-Morrison formula for cyclic system. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), value, intent(in) :: alpha + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Derivative field + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: Field to differentiate + integer, value, intent(in) :: n !! Number of points in periodic direction + real(dp), device, intent(in), dimension(:) :: coeffs !! Stencil coefficients (9-point) + real(dp), value, intent(in) :: alpha !! Periodic coupling coefficient real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, & - thom_p, strch + thom_p, strch !! Thomas and periodic correction coefficients - integer :: i, j, b - integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 + integer :: i, j, b !! Thread, loop, and block indices + integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 !! Periodic neighbor indices - real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 - real(dp) :: temp_du, ss + real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 !! Stencil coefficients + real(dp) :: temp_du, ss !! Temporary derivative and Sherman-Morrison correction i = threadIdx%x b = blockIdx%x diff --git a/src/backend/cuda/poisson_fft.f90 b/src/backend/cuda/poisson_fft.f90 index 32a362f10..43b062997 100644 --- a/src/backend/cuda/poisson_fft.f90 +++ b/src/backend/cuda/poisson_fft.f90 @@ -1,4 +1,9 @@ module m_cuda_poisson_fft + !! FFT-based Poisson solver on GPU using cuFFT. + !! + !! Extends poisson_fft_t with device-resident spectral data and cuFFT plans. + !! Handles forward/backward transforms, spectral post-processing for different + !! boundary conditions, and periodic extensions. 
use iso_c_binding, only: c_loc, c_ptr, c_f_pointer, c_int, c_float, & c_double_complex, c_float_complex use iso_fortran_env, only: stderr => error_unit @@ -24,7 +29,7 @@ module m_cuda_poisson_fft implicit none type, extends(poisson_fft_t) :: cuda_poisson_fft_t - !! FFT based Poisson solver + !! GPU-accelerated FFT-based Poisson solver with device-resident spectral data. !> Local domain sized array storing the spectral equivalence constants complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev @@ -149,20 +154,28 @@ end subroutine create_fft_plan function init(mesh, xdirps, ydirps, zdirps, lowmem) & result(poisson_fft) + !! Initialise CUDA Poisson FFT solver with cuFFT plans and spectral arrays. + !! + !! Sets up 3D FFT plans, allocates device storage for wave numbers and + !! stretching operators, and configures 1D decomposition (Z in real space, + !! Y in spectral space). implicit none - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + type(mesh_t), intent(in) :: mesh !! Computational mesh + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + logical, optional, intent(in) :: lowmem !! Low memory mode flag - type(cuda_poisson_fft_t) :: poisson_fft + type(cuda_poisson_fft_t) :: poisson_fft !! Initialised solver - integer :: nx, ny, nz + integer :: nx, ny, nz !! Global grid dimensions - integer :: ierr - integer(int_ptr_kind()) :: worksize + integer :: ierr !! Error code + integer(int_ptr_kind()) :: worksize !! cuFFT workspace size - integer :: dims_glob(3), dims_loc(3), n_spec(3), n_sp_st(3) + integer :: dims_glob(3) !! Global domain dimensions + integer :: dims_loc(3) !! Local domain dimensions + integer :: n_spec(3) !! Spectral space dimensions + integer :: n_sp_st(3) !! Spectral space start indices ! 
1D decomposition along Z in real domain, and along Y in spectral space if (mesh%par%nproc_dir(2) /= 1) print *, 'nproc_dir in y-dir must be 1' @@ -282,19 +295,25 @@ function init(mesh, xdirps, ydirps, zdirps, lowmem) & end function init subroutine fft_forward_cuda(self, f) + !! Execute forward 3D FFT on device field. + !! + !! Copies padded field data into cuFFT descriptor storage and performs + !! forward transform using cuFFTMp. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! Input field in real space - real(dp), device, pointer :: padded_dev(:, :, :), d_dev(:, :, :) - real(dp), device, pointer :: f_ptr - type(c_ptr) :: f_c_ptr + real(dp), device, pointer :: padded_dev(:, :, :) !! Padded field data + real(dp), device, pointer :: d_dev(:, :, :) !! cuFFT descriptor data + real(dp), device, pointer :: f_ptr !! Workaround device pointer for cuFFT + type(c_ptr) :: f_c_ptr !! Intermediate C pointer for workaround - type(cudaXtDesc), pointer :: descriptor + type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor - integer :: tsize, ierr - type(dim3) :: blocks, threads + integer :: tsize !! Thread block size + integer :: ierr !! Error code + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f) type is (cuda_field_t) @@ -340,19 +359,25 @@ subroutine fft_forward_cuda(self, f) end subroutine fft_forward_cuda subroutine fft_backward_cuda(self, f) + !! Execute backward 3D FFT and copy result to device field. + !! + !! Performs inverse transform using cuFFTMp and copies result from + !! descriptor storage back to field's device array. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f + class(field_t), intent(inout) :: f !! Output field in real space - real(dp), device, pointer :: padded_dev(:, :, :), d_dev(:, :, :) - real(dp), device, pointer :: f_ptr - type(c_ptr) :: f_c_ptr + real(dp), device, pointer :: padded_dev(:, :, :) !! 
Padded field data
+    real(dp), device, pointer :: d_dev(:, :, :) !! cuFFT descriptor data
+    real(dp), device, pointer :: f_ptr !! Workaround device pointer for cuFFT
+    type(c_ptr) :: f_c_ptr !! Intermediate C pointer for workaround
 
-    type(cudaXtDesc), pointer :: descriptor
+    type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor
 
-    integer :: tsize, ierr
-    type(dim3) :: blocks, threads
+    integer :: tsize !! Thread block size
+    integer :: ierr !! Error code
+    type(dim3) :: blocks, threads !! CUDA kernel configuration
 
     select type (f)
     type is (cuda_field_t)
@@ -399,15 +424,19 @@ end subroutine fft_backward_cuda
 
   subroutine fft_postprocess_000_cuda(self)
+    !! Post-process spectral data for fully periodic (000) boundaries.
+    !!
+    !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with periodic
+    !! boundaries in all directions.
     implicit none
 
    class(cuda_poisson_fft_t) :: self
 
-    type(cudaXtDesc), pointer :: descriptor
+    type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor
 
-    complex(dp), device, dimension(:, :, :), pointer :: c_dev
-    type(dim3) :: blocks, threads
-    integer :: tsize
+    complex(dp), device, dimension(:, :, :), pointer :: c_dev !! Spectral data
+    type(dim3) :: blocks, threads !! CUDA kernel configuration
+    integer :: tsize !! Thread block size
 
     ! tsize is different than SZ, because here we work on a 3D Cartesian
     ! data structure, and free to specify any suitable thread/block size.
@@ -438,15 +467,22 @@ end subroutine fft_postprocess_000_cuda
 
   subroutine fft_postprocess_010_cuda(self)
+    !! Post-process spectral data for periodic-Neumann-periodic (010) boundaries.
+    !!
+    !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with periodic
+    !! boundaries in X and Z, Neumann in Y. Handles stretched meshes with
+    !! matrix solves in spectral space. 
implicit none class(cuda_poisson_fft_t) :: self type(cudaXtDesc), pointer :: descriptor - complex(dp), device, dimension(:, :, :), pointer :: c_dev - type(dim3) :: blocks, threads - integer :: tsize, off, inc + complex(dp), device, dimension(:, :, :), pointer :: c_dev !! Spectral data + type(dim3) :: blocks, threads !! CUDA kernel configuration + integer :: tsize !! Thread block size + integer :: off !! Array offset for odd/even modes + integer :: inc !! Array increment stride ! tsize is different than SZ, because here we work on a 3D Cartesian ! data structure, and free to specify any suitable thread/block size. @@ -542,14 +578,19 @@ subroutine fft_postprocess_010_cuda(self) end subroutine fft_postprocess_010_cuda subroutine enforce_periodicity_y_cuda(self, f_out, f_in) + !! Enforce periodic extension in Y for Neumann boundaries. + !! + !! Extends field from physical domain size to doubled periodic domain + !! by symmetry (f(y+L) = f(L-y)) for Neumann boundary FFTs. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(field_t), intent(inout) :: f_out !! Extended periodic field + class(field_t), intent(in) :: f_in !! Original physical field - real(dp), device, pointer, dimension(:, :, :) :: f_out_dev, f_in_dev - type(dim3) :: blocks, threads + real(dp), device, pointer, dimension(:, :, :) :: f_out_dev !! Output device data + real(dp), device, pointer, dimension(:, :, :) :: f_in_dev !! Input device data + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f_out) type is (cuda_field_t) @@ -569,14 +610,19 @@ subroutine enforce_periodicity_y_cuda(self, f_out, f_in) end subroutine enforce_periodicity_y_cuda subroutine undo_periodicity_y_cuda(self, f_out, f_in) + !! Extract physical domain from periodic extension in Y. + !! + !! Reverses enforce_periodicity_y by extracting original domain size + !! from doubled periodic field after inverse FFT. 
implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(field_t), intent(inout) :: f_out !! Physical domain field + class(field_t), intent(in) :: f_in !! Extended periodic field - real(dp), device, pointer, dimension(:, :, :) :: f_out_dev, f_in_dev - type(dim3) :: blocks, threads + real(dp), device, pointer, dimension(:, :, :) :: f_out_dev !! Output device data + real(dp), device, pointer, dimension(:, :, :) :: f_in_dev !! Input device data + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f_out) type is (cuda_field_t) diff --git a/src/backend/cuda/sendrecv.f90 b/src/backend/cuda/sendrecv.f90 index 4df5861d3..f8d20575c 100644 --- a/src/backend/cuda/sendrecv.f90 +++ b/src/backend/cuda/sendrecv.f90 @@ -1,4 +1,18 @@ module m_cuda_sendrecv + !! MPI communication for CUDA backend using device pointers. + !! + !! Passes device pointers directly to MPI calls. With GPU-aware MPI + !! implementations (e.g., OpenMPI with CUDA support, MVAPICH2-GDR), + !! data transfers directly between GPU memories without staging through + !! host, reducing latency and increasing bandwidth. + !! + !! Without GPU-aware MPI, the implementation may stage through host + !! memory automatically, still functional but with additional overhead. + !! + !! - sendrecv_fields: Single field halo exchange + !! - sendrecv_3fields: Batch exchange for three fields (velocity components + !! or derivatives). Batching amortises MPI overhead and enables better + !! network utilisation. use cudafor use mpi @@ -10,11 +24,21 @@ module m_cuda_sendrecv subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & n_data, nproc, prev, next) + !! Exchange boundary halos using MPI with device pointers. + !! + !! MPI_Isend/Irecv allows all four communications (send to prev/next, + !! receive from prev/next) to proceed concurrently, enabling network + !! pipelining. 
MPI_Waitall synchronises only when results needed. + !! + !! When nproc=1, data copied directly on device without MPI. implicit none - real(dp), device, dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e - real(dp), device, dimension(:, :, :), intent(in) :: f_send_s, f_send_e - integer, intent(in) :: n_data, nproc, prev, next + real(dp), device, dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e !! Device receive buffers + real(dp), device, dimension(:, :, :), intent(in) :: f_send_s, f_send_e !! Device send buffers + integer, intent(in) :: n_data !! Number of data elements + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: prev !! Previous neighbour rank + integer, intent(in) :: next !! Next neighbour rank integer :: req(4), err(4), ierr, tag = 1234 @@ -41,13 +65,22 @@ subroutine sendrecv_3fields( & f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e, & n_data, nproc, prev, next & ) + !! Exchange three fields simultaneously using batched MPI communication. + !! + !! Used for: (1) velocity component halos (u, v, w) before computing transport + !! equation, (2) derivative field halos (du, dud, d2u) in distributed compact + !! schemes. Batching all three fields amortises MPI setup overhead. Single + !! MPI_Waitall for all 12 operations reduces synchronisation points. implicit none real(dp), device, dimension(:, :, :), intent(out) :: & - f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e + f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e !! Device receive buffers real(dp), device, dimension(:, :, :), intent(in) :: & - f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e - integer, intent(in) :: n_data, nproc, prev, next + f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e !! Device send buffers + integer, intent(in) :: n_data !! Number of data elements per field + integer, intent(in) :: nproc !! Number of processes + integer, intent(in) :: prev !! 
Previous neighbour rank + integer, intent(in) :: next !! Next neighbour rank integer :: req(12), err(12), ierr, tag = 1234 diff --git a/src/backend/cuda/tdsops.f90 b/src/backend/cuda/tdsops.f90 index b14cf5614..d8fda1892 100644 --- a/src/backend/cuda/tdsops.f90 +++ b/src/backend/cuda/tdsops.f90 @@ -1,4 +1,9 @@ module m_cuda_tdsops + !! GPU-resident tridiagonal operator coefficients. + !! + !! Extends base tdsops_t with device memory copies of all coefficient + !! arrays. One-time upload to GPU avoids repeated host-device transfers + !! during kernel execution, critical for performance. use iso_fortran_env, only: stderr => error_unit use m_common, only: dp @@ -7,18 +12,15 @@ module m_cuda_tdsops implicit none type, extends(tdsops_t) :: cuda_tdsops_t - !! CUDA extension of the Tridiagonal Solver Operators class. - !! - !! Regular tdsops_t class is initiated and the coefficient arrays are - !! copied into device arrays so that cuda kernels can use them. + !! Tridiagonal operators with device-resident coefficients. real(dp), device, allocatable :: dist_fw_dev(:), dist_bw_dev(:), & dist_sa_dev(:), dist_sc_dev(:), & - dist_af_dev(:) + dist_af_dev(:) !! Distributed compact scheme coefficients real(dp), device, allocatable :: thom_f_dev(:), thom_s_dev(:), & - thom_w_dev(:), thom_p_dev(:) - real(dp), device, allocatable :: stretch_dev(:), stretch_correct_dev(:) + thom_w_dev(:), thom_p_dev(:) !! Thomas algorithm coefficients + real(dp), device, allocatable :: stretch_dev(:), stretch_correct_dev(:) !! Grid stretching factors real(dp), device, allocatable :: coeffs_dev(:), & - coeffs_s_dev(:, :), coeffs_e_dev(:, :) + coeffs_s_dev(:, :), coeffs_e_dev(:, :) !! Finite difference stencils contains end type cuda_tdsops_t @@ -32,11 +34,13 @@ function cuda_tdsops_init( & n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) result(tdsops) - !! Constructor function for the cuda_tdsops_t class. - !! 
See tdsops_t for details. + !! Initialise tridiagonal operators and upload to GPU. + !! + !! Computes coefficients on CPU via base tdsops_init, then copies + !! to device arrays for kernel access. See tdsops_t for parameters. implicit none - type(cuda_tdsops_t) :: tdsops !! return value of the function + type(cuda_tdsops_t) :: tdsops integer, intent(in) :: n_tds real(dp), intent(in) :: delta From 825cefc1945c5669153ec6a42ef82fabbd96880f Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 14:19:26 +0000 Subject: [PATCH 09/12] manually add parameter that was accidentally removed during rebase --- src/common.f90 | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common.f90 b/src/common.f90 index 0fec74609..73d5c349b 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -24,6 +24,7 @@ module m_common logical, parameter :: is_sp = .false. !! Flag indicating double precision #endif + integer, parameter :: sp = kind(0.0e0) !! Single precision kind parameter integer, parameter :: i8 = selected_int_kind(18) !! Integer kind for 64-bit integers real(dp), parameter :: pi = 4*atan(1.0_dp) !! 
Mathematical constant \(\pi\) From b6a6a476118dbe41155fce23689b8a1dc0f0651a Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 18:34:16 +0000 Subject: [PATCH 10/12] docs: fix FORD formatting and add detailed description to some routines --- src/allocator.f90 | 2 +- src/backend/backend.f90 | 430 +++++++++++++++--- src/backend/cuda/allocator.f90 | 3 +- src/backend/cuda/backend.f90 | 2 +- .../cuda/kernels/spectral_processing.f90 | 1 + src/backend/cuda/kernels/thomas.f90 | 5 +- src/backend/cuda/poisson_fft.f90 | 2 +- src/backend/cuda/sendrecv.f90 | 4 +- src/backend/cuda/tdsops.f90 | 2 +- src/backend/omp/backend.f90 | 12 +- src/backend/omp/exec_dist.f90 | 4 +- src/backend/omp/exec_thom.f90 | 1 + src/backend/omp/kernels/distributed.f90 | 3 + .../omp/kernels/spectral_processing.f90 | 7 +- src/backend/omp/poisson_fft.f90 | 10 +- src/case/base_case.f90 | 13 +- src/case/channel.f90 | 10 +- src/case/generic.f90 | 4 +- src/case/tgv.f90 | 5 + src/common.f90 | 1 + src/config.f90 | 14 +- src/field.f90 | 2 +- src/io/adios2/io.f90 | 3 + src/io/checkpoint_manager.f90 | 23 +- src/io/dummy/io.f90 | 3 + src/io/io_field_utils.f90 | 59 ++- src/io/io_manager.f90 | 2 + src/io/io_session.f90 | 46 +- src/io/snapshot_manager.f90 | 12 +- src/mesh.f90 | 2 +- src/mesh_content.f90 | 7 +- src/module/ibm.f90 | 45 +- src/ordering.f90 | 4 +- src/poisson_fft.f90 | 14 +- src/solver.f90 | 3 +- src/tdsops.f90 | 9 +- src/time_integrator.f90 | 41 +- src/vector_calculus.f90 | 21 +- src/xcompact.f90 | 7 +- 39 files changed, 670 insertions(+), 168 deletions(-) diff --git a/src/allocator.f90 b/src/allocator.f90 index af6c57361..8a204f0ae 100644 --- a/src/allocator.f90 +++ b/src/allocator.f90 @@ -2,7 +2,7 @@ module m_allocator !! Memory allocator module for managing field data blocks. !! !! This module provides an allocator type that manages a pool of memory blocks - !! (field_t objects) organised in a linked list. The allocator supports efficient + !! 
(`field_t` objects) organised in a linked list. The allocator supports efficient !! memory reuse by allowing blocks to be requested and released, minimizing !! allocation/deallocation overhead during simulations. diff --git a/src/backend/backend.f90 b/src/backend/backend.f90 index 4c10d2c74..5c73da7ba 100644 --- a/src/backend/backend.f90 +++ b/src/backend/backend.f90 @@ -1,4 +1,47 @@ module m_base_backend + !! Abstract base backend defining the computational interface for X3D2 solver. + !! + !! This module defines the `base_backend_t` abstract type, which establishes + !! the interface for all backend implementations (CUDA GPU, OpenMP CPU, etc.). + !! The solver operates exclusively through these abstract interfaces, enabling + !! complete architecture independence. + !! + !! **Architecture Pattern:** + !! + !! The backend abstraction follows the Strategy design pattern: + !! + !! - **Abstract interface** (`base_backend_t`): Defines deferred procedures for + !! all computational operations required by the solver + !! - **Concrete implementations**: CUDA backend (`m_cuda_backend`) and OMP + !! backend (`m_omp_backend`) extend this base and provide architecture-specific + !! implementations + !! - **Solver independence**: The solver (`m_solver`) calls backend methods + !! through the abstract interface without knowing the underlying implementation + !! + !! **Key Operations Defined:** + !! + !! - **Transport equation derivatives**: `transeq_x`, `transeq_y`, `transeq_z` + !! compute directional derivatives with halo exchange for distributed compact schemes + !! - **Tridiagonal solves**: `tds_solve` applies compact finite difference operators + !! - **Data reordering**: `reorder` transforms data between pencil decomposition + !! orientations (X, Y, Z directions) + !! - **Field operations**: Vector arithmetic (`veccopy`, `vecadd`, `vecmult`), + !! reductions (`scalar_product`, `field_volume_integral`), and utilities + !! 
(`field_scale`, `field_shift`, `field_set_face`) + !! - **Summation**: `sum_yintox`, `sum_zintox` for integrating fields along + !! specific directions + !! + !! **Backend Implementations:** + !! + !! - **CUDA backend** (`src/backend/cuda/backend.f90`): GPU-accelerated using + !! NVIDIA CUDA with device memory management and kernel launches + !! - **OMP backend** (`src/backend/omp/backend.f90`): CPU parallelism via + !! OpenMP threading and MPI domain decomposition + !! + !! **Usage:** + !! + !! Backends are instantiated at runtime based on compile-time configuration and + !! passed to the solver as a polymorphic pointer (`class(base_backend_t), pointer`). use mpi use m_allocator, only: allocator_t @@ -11,19 +54,37 @@ module m_base_backend implicit none type, abstract :: base_backend_t - !! base_backend class defines all the abstract operations that the - !! solver class requires. + !! Abstract base type defining the computational backend interface. !! - !! For example, transport equation in solver class evaluates the - !! derivatives in x, y, and z directions, and reorders the input - !! fields as required. Then finally, combines all the directional - !! derivatives to obtain the divergence of U*. + !! This type encapsulates all architecture-specific operations required + !! by the solver, enabling transparent execution on different hardware + !! platforms (GPU via CUDA, CPU via OpenMP) without modifying solver code. !! - !! All these high level operations solver class executes are - !! defined here using the abstract interfaces. Every backend - !! implementation extends the present abstact backend class to - !! define the specifics of these operations based on the target - !! architecture. + !! **Design Philosophy:** + !! + !! The solver executes high-level operations (compute transport equation, + !! solve tridiagonal systems, reorder data, etc.) through deferred procedures + !! defined in this abstract interface. Each backend (CUDA, OMP) extends this + !! 
type and implements these procedures using architecture-specific kernels,
+    !! libraries, and memory management strategies.
+    !!
+    !! **Example Workflow:**
+    !!
+    !! When computing the transport equation, the solver calls:
+    !!
+    !! 1. `transeq_x`, `transeq_y`, `transeq_z` to compute directional derivatives
+    !! 2. `reorder` to transform data between pencil orientations
+    !! 3. `vecadd` to combine derivatives into divergence of \(U^*\)
+    !!
+    !! Each call dispatches to the appropriate backend implementation at runtime
+    !! via dynamic polymorphism.
+    !!
+    !! **Components:**
+    !!
+    !! - `n_halo`: Number of halo layers for distributed compact schemes (fixed at 4)
+    !! - `mesh`: Pointer to mesh object (grid dimensions, boundary conditions, decomposition)
+    !! - `allocator`: Memory allocator for field storage (host for OMP, device for CUDA)
+    !! - `poisson_fft`: FFT-based Poisson solver for pressure correction
 
     !> DistD2 implementation is hardcoded for 4 halo layers for all backends
     integer :: n_halo = 4
@@ -59,11 +120,35 @@ module m_base_backend
 
   abstract interface
     subroutine transeq_ders(self, du, dv, dw, u, v, w, nu, dirps)
-      !! transeq equation obtains the derivatives direction by
-      !! direction, and the exact algorithm used to obtain these
-      !! derivatives are decided at runtime. Backend implementations
-      !! are responsible from directing calls to transeq_ders into
-      !! the correct algorithm.
+      !! Compute transport equation derivatives for velocity components.
+      !!
+      !! This is the core computational kernel for the transport equation,
+      !! computing the advection-diffusion terms in one coordinate direction:
+      !!
+      !! \[
+      !! \frac{\partial u_i}{\partial t} =
+      !! - u_j \frac{\partial u_i}{\partial x_j}
+      !! + \nu \frac{\partial^2 u_i}{\partial x_j^2},
+      !! \qquad u_j \in \{u, v, w\} \text{ (no summation)}
+      !! \]
+      !!
+      !! (where the direction \(x_j\) is specified by `dirps`).
+      !!
+      !! **Runtime algorithm selection:**
+      !!
+      !! 
The exact algorithm used to obtain the derivatives is decided at runtime + !! by the backend implementation. Backend implementations are responsible + !! for directing calls to the appropriate algorithm based on: + !! + !! - Operator configuration in `dirps` (distributed vs local compact schemes) + !! - Domain decomposition (number of processes in current direction) + !! - Boundary conditions (periodic vs non-periodic) + !! + !! The implementation routes to either: + !! + !! - **Distributed algorithm** (`exec_dist_transeq_3fused`): For distributed + !! compact schemes with MPI halo exchange + !! - **Thomas algorithm** (`exec_thom_transeq`): For localized/periodic operators import :: base_backend_t import :: field_t import :: dirps_t @@ -71,20 +156,34 @@ subroutine transeq_ders(self, du, dv, dw, u, v, w, nu, dirps) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Derivative outputs (momentum equation RHS) + class(field_t), intent(in) :: u, v, w !! Velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional derivative operators end subroutine transeq_ders end interface abstract interface subroutine transeq_ders_spec(self, dspec, uvw, spec, nu, dirps, sync) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to transeq_ders into - !! the correct algorithm. + !! Compute transport equation derivatives for passive scalar species. + !! + !! Similar to `transeq_ders` but for passive scalar transport: + !! + !! \[ + !! \frac{\partial \phi}{\partial t} = -u \frac{\partial \phi}{\partial x_j} + !! - v \frac{\partial \phi}{\partial x_j} + !! 
- w \frac{\partial \phi}{\partial x_j} + !! + \nu \nabla^2 \phi + !! \] + !! + !! where \(\phi\) is the scalar concentration and \(x_j\) is the direction + !! specified by `dirps`. + !! + !! **Synchronization:** + !! + !! The `sync` flag controls whether to synchronize device-to-host memory + !! transfers (CUDA backend) after computation. Set `.false.` when chaining + !! multiple operations to avoid unnecessary transfers. import :: base_backend_t import :: field_t import :: dirps_t @@ -92,144 +191,275 @@ subroutine transeq_ders_spec(self, dspec, uvw, spec, nu, dirps, sync) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: dspec - class(field_t), intent(in) :: uvw, spec - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - logical, intent(in) :: sync + class(field_t), intent(inout) :: dspec !! Scalar derivative output + class(field_t), intent(in) :: uvw !! Velocity component in current direction + class(field_t), intent(in) :: spec !! Scalar species concentration + real(dp), intent(in) :: nu !! Diffusion coefficient + type(dirps_t), intent(in) :: dirps !! Directional derivative operators + logical, intent(in) :: sync !! Synchronize device transfers (CUDA only) end subroutine transeq_ders_spec end interface abstract interface subroutine tds_solve(self, du, u, tdsops) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to tds_solve to the - !! correct algorithm. + !! Apply a tridiagonal operator to a field (compact finite difference operation). + !! + !! Solves the tridiagonal system arising from compact finite difference + !! schemes: + !! + !! \[ + !! A f' = B f + !! \] + !! + !! where \(A\) is the implicit (tridiagonal) operator, \(B\) is the explicit + !! stencil, and \(f'\) is the derivative (or interpolated value). + !! + !! 
**Backend dispatch:** + !! + !! Routes to the appropriate tridiagonal solver: + !! + !! - **Distributed compact**: Uses `exec_dist_tds_compact` with MPI communication + !! for boundary coupling between processes + !! - **Thomas algorithm**: Uses `exec_thom_tds_compact` for local/periodic systems + !! - **GPU**: Uses batched tridiagonal solvers (cuSPARSE or custom kernels) + !! + !! **Operations supported:** + !! + !! First derivative, second derivative, interpolation, staggered derivatives + !! (configured in `tdsops`). import :: base_backend_t import :: field_t import :: tdsops_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(field_t), intent(inout) :: du !! Output field (derivative or interpolated values) + class(field_t), intent(in) :: u !! Input field + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator (preprocessed) end subroutine tds_solve end interface abstract interface subroutine reorder(self, u_, u, direction) - !! reorder subroutines are straightforward, they rearrange - !! data into our specialist data structure so that regardless - !! of the direction tridiagonal systems are solved efficiently - !! and fast. + !! Reorder field data between pencil decomposition orientations. + !! + !! Transforms field layout from one pencil orientation to another to enable + !! efficient tridiagonal solves in different coordinate directions: + !! + !! - **DIR_X**: X-pencils (data contiguous in X, decomposed in Y-Z) + !! - **DIR_Y**: Y-pencils (data contiguous in Y, decomposed in X-Z) + !! - **DIR_Z**: Z-pencils (data contiguous in Z, decomposed in X-Y) + !! - **DIR_C**: Special compact orientation + !! + !! The `direction` parameter specifies the target orientation using reorder + !! constants (`RDR_X2Y`, `RDR_Y2Z`, etc.). + !! + !! **Backend implementation:** + !! + !! - **CUDA**: GPU transpose kernels with coalesced memory access + !! 
- **OMP**: MPI all-to-all communication with OpenMP threading + !! + !! **Performance note:** This is a bandwidth-intensive operation requiring + !! global data movement (MPI or device memory transfers). import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - integer, intent(in) :: direction + class(field_t), intent(inout) :: u_ !! Output field (reordered) + class(field_t), intent(in) :: u !! Input field + integer, intent(in) :: direction !! Reorder direction (RDR_X2Y, RDR_Y2Z, etc.) end subroutine reorder end interface abstract interface subroutine sum_intox(self, u, u_) - !! sum9into3 subroutine combines all the directional velocity - !! derivatives into the corresponding x directional fields. + !! Sum directional derivatives back into X-oriented fields. + !! + !! Combines derivative contributions computed in different pencil orientations + !! (Y-pencils, Z-pencils) back into the X-pencil orientation: + !! + !! \[ + !! u = u + u' + !! \] + !! + !! This operation accumulates terms when computing composite derivatives + !! like divergence: + !! + !! \[ + !! \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + !! + \frac{\partial v}{\partial y} + !! + \frac{\partial w}{\partial z} + !! \] + !! + !! Each directional derivative is computed in its respective pencil orientation, + !! then summed into X-pencils via `sum_yintox` and `sum_zintox`. + !! + !! **Note:** The input field `u_` must be in a Y or Z pencil orientation; + !! the output `u` is always in X-pencil orientation. import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u - class(field_t), intent(in) :: u_ + class(field_t), intent(inout) :: u !! Accumulated field (X-pencils, updated in-place) + class(field_t), intent(in) :: u_ !! 
Contribution to add (Y or Z pencils) end subroutine sum_intox end interface abstract interface subroutine veccopy(self, dst, src) - !! copy vectors: y = x + !! Copy one field to another: `dst = src`. + !! + !! Performs an element-wise copy of all field data from `src` to `dst`. + !! Both fields must have compatible dimensions and memory layout. + !! + !! **Backend implementation:** + !! + !! - **CUDA**: Device-to-device memory copy (cudaMemcpy) + !! - **OMP**: Host memory copy (array assignment or memcpy) + !! + !! **Note:** This is a deep copy operation; the fields remain independent + !! after the copy. import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: dst - class(field_t), intent(in) :: src + class(field_t), intent(inout) :: dst !! Destination field + class(field_t), intent(in) :: src !! Source field end subroutine veccopy end interface abstract interface subroutine vecadd(self, a, x, b, y) - !! adds two vectors together: y = a*x + b*y + !! Compute linear combination of two fields (AXPBY operation). + !! + !! Performs the vector operation: \(y = a \cdot x + b \cdot y\) + !! + !! This is equivalent to the BLAS AXPBY operation, computing a scaled + !! sum of two vectors. The result is stored in-place in `y`. + !! + !! **Common use cases:** + !! + !! - **Vector addition**: `vecadd(self, 1.0_dp, x, 1.0_dp, y)` \(\rightarrow\) \(y = x + y\) + !! - **Scaled addition**: `vecadd(self, alpha, x, 1.0_dp, y)` \(\rightarrow\) \(y = \alpha x + y\) + !! - **Replacement**: `vecadd(self, 1.0_dp, x, 0.0_dp, y)` \(\rightarrow\) \(y = x\) import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - real(dp), intent(in) :: a - class(field_t), intent(in) :: x - real(dp), intent(in) :: b - class(field_t), intent(inout) :: y + real(dp), intent(in) :: a !! Scaling factor for x + class(field_t), intent(in) :: x !! Input field + real(dp), intent(in) :: b !! 
Scaling factor for y + class(field_t), intent(inout) :: y !! Input/output field (modified in-place) end subroutine vecadd end interface abstract interface subroutine vecmult(self, y, x) - !! pointwise multiplication between two vectors: y(:) = y(:) * x(:) + !! Element-wise (pointwise) multiplication of two fields. + !! + !! Performs the element-wise product: \(y = y \odot x\) + !! + !! Each element of `y` is multiplied by the corresponding element of `x`. + !! The result is stored in-place in `y`. This is also known as the + !! Hadamard product or pointwise multiplication. import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: y - class(field_t), intent(in) :: x + class(field_t), intent(inout) :: y !! Input/output field (modified in-place) + class(field_t), intent(in) :: x !! Multiplier field end subroutine vecmult end interface abstract interface real(dp) function scalar_product(self, x, y) result(s) - !! Calculates the scalar product of two input fields + !! Compute the global scalar (dot) product of two fields. + !! + !! Calculates: \(s = \sum_{i} x_i \cdot y_i\) + !! + !! This computes the inner product (dot product) of two fields across + !! all grid points. For distributed memory systems (MPI), partial sums + !! from each process are accumulated via MPI reduction to produce the + !! global sum. + !! + !! **Note:** The result includes contributions from all MPI ranks. import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: x, y + class(field_t), intent(in) :: x !! First field + class(field_t), intent(in) :: y !! Second field end function scalar_product end interface abstract interface subroutine field_ops(self, f, a) - !! Scales or shifts a field by a + !! Generic interface for in-place field operations with a scalar constant. + !! + !! This abstract interface is implemented by two operations: + !! + !! 
- **field_scale**: Multiply field by constant: \(f = a \cdot f\) + !! - **field_shift**: Add constant to field: \(f = f + a\) + !! + !! Both operations modify the field in-place and are backend-specific + !! (GPU kernels for CUDA, array operations for OMP). import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! Field to operate on (modified in-place) + real(dp), intent(in) :: a !! Scalar constant (scaling factor or shift amount) end subroutine field_ops end interface abstract interface real(dp) function field_reduce(self, f) result(s) - !! Reduces field to a scalar, example: volume integral + !! Reduce a field to a single scalar value via global summation. + !! + !! This abstract interface is currently implemented by: + !! + !! - **field_volume_integral**: Computes the volume integral \(\int f \,dV\) + !! + !! **Algorithm:** + !! + !! 1. **Local summation**: Each MPI process sums its local field values + !! (optionally weighted by cell volumes for volume integration) + !! 2. **Global reduction**: MPI_Allreduce combines partial sums from all + !! processes to produce the global result + !! + !! **Backend implementations:** + !! + !! - **CUDA**: GPU reduction kernel followed by MPI_Allreduce + !! - **OMP**: OpenMP parallel reduction followed by MPI_Allreduce + !! + !! **Requirements:** + !! + !! - Field must have `data_loc` set (cannot be `NULL_LOC`) + !! - Field must be in X-pencil orientation (`dir = DIR_X`) + !! + !! **Use cases:** + !! + !! - Volume integrals for conservation checks + !! - Global norms (L1, L2) for convergence monitoring + !! - Total mass/energy calculations import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! 
Field to reduce end function field_reduce end interface @@ -255,7 +485,7 @@ subroutine field_set_face(self, f, c_start, c_end, face) !! or a global domain boundary based on the location of the subdomain. !! This subroutine allows us to set any of these faces to a value, !! 'c_start' and 'c_end' for faces at opposite sides. - !! 'face' is one of X_FACE, Y_FACE, Z_FACE from common.f90 + !! 'face' is one of `X_FACE`, `Y_FACE`, `Z_FACE` from `common.f90` import :: base_backend_t import :: dp import :: field_t @@ -298,6 +528,39 @@ end subroutine copy_f_to_data abstract interface subroutine alloc_tdsops( & + !! Allocate and initialise a backend-specific tridiagonal operator. + !! + !! This deferred procedure creates a `tdsops_t` object configured for + !! compact finite difference operations (derivatives, interpolation, etc.). + !! The backend implementation allocates the appropriate subtype: + !! + !! - **CUDA backend**: Allocates `cuda_tdsops_t` with device memory pointers + !! for GPU execution + !! - **OMP backend**: Allocates `omp_tdsops_t` with host memory for CPU execution + !! + !! The operator is fully preprocessed and ready for repeated application via + !! `tds_solve`. + !! + !! **Required arguments:** + !! + !! - `n_tds`: System size (number of grid points in the operator direction) + !! - `delta`: Grid spacing + !! - `operation`: Operation type (`'first-deriv'`, `'second-deriv'`, + !! `'interpolate'`, `'stag-deriv'`) + !! - `scheme`: Numerical scheme name (e.g., `'compact6'`, `'compact4'`) + !! - `bc_start`, `bc_end`: Boundary condition flags (`BC_PERIODIC`, + !! `BC_NEUMANN`, `BC_DIRICHLET`) + !! + !! **Optional arguments:** + !! + !! - `stretch`: Stretching coefficients for non-uniform grids + !! - `stretch_correct`: Correction for second derivatives on stretched grids + !! - `n_halo`: Number of halo layers (default from backend) + !! - `from_to`: Staggered grid direction (`'v2p'`, `'p2v'`) + !! 
- `sym`: Field symmetry at Neumann boundaries (`.true.` = symmetric/even, + !! `.false.` = anti-symmetric/odd) + !! - `c_nu`, `nu0_nu`: Hyperviscosity parameters for compact6-hyperviscous + !! second derivatives self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) @@ -322,6 +585,37 @@ end subroutine alloc_tdsops abstract interface subroutine init_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise the backend-specific FFT-based Poisson solver. + !! + !! This deferred procedure creates and configures the Poisson solver object + !! (`self%poisson_fft`) for solving the pressure Poisson equation: + !! \(\nabla^2 \phi = f\) + !! + !! The backend implementation allocates the appropriate solver subtype: + !! + !! - **CUDA backend**: Allocates `cuda_poisson_fft_t` using cuFFT library + !! for GPU-accelerated FFT transforms + !! - **OMP backend**: Allocates `omp_poisson_fft_t` using 2DECOMP&FFT library + !! for CPU FFT transforms with MPI parallelisation + !! + !! The solver requires directional derivative operators (`xdirps`, `ydirps`, + !! `zdirps`) to construct spectral equivalence constants for handling: + !! + !! - Non-uniform grid spacing (stretching) in the Y-direction + !! - Mixed boundary conditions (e.g., periodic in X/Z, Dirichlet in Y) + !! + !! **Arguments:** + !! + !! - `mesh`: Mesh object containing grid dimensions, boundary conditions, + !! and parallel decomposition information + !! - `xdirps`, `ydirps`, `zdirps`: Second-derivative operators in each direction, + !! used to compute spectral equivalence constants for the modified wavenumbers + !! - `lowmem` (optional): Low-memory mode flag. When `.true.`, reduces memory + !! footprint by deallocating temporary arrays after initialisation (CUDA only) + !! + !! **Note:** The Poisson solver is stored in `self%poisson_fft` and accessed + !! 
by the solver during the pressure correction step of the fractional-step + !! method. import :: base_backend_t import :: dirps_t import :: mesh_t diff --git a/src/backend/cuda/allocator.f90 b/src/backend/cuda/allocator.f90 index 16d4014cb..6cab14145 100644 --- a/src/backend/cuda/allocator.f90 +++ b/src/backend/cuda/allocator.f90 @@ -7,7 +7,8 @@ module m_cuda_allocator !! expensive implicit host-device transfers that would kill performance. !! !! **Design rationale:** - !! - cuda_field_t extends field_t with device pointers (p_data_d, data_d) + !! + !! - `cuda_field_t` extends `field_t` with device pointers (`p_data_d`, `data_d`) !! - Maintains both 1D and 3D views of same memory for flexibility !! - Reference counting prevents premature deallocation !! - Block-based allocation reduces allocation overhead diff --git a/src/backend/cuda/backend.f90 b/src/backend/cuda/backend.f90 index 7c67f09b2..292c8a7cd 100644 --- a/src/backend/cuda/backend.f90 +++ b/src/backend/cuda/backend.f90 @@ -1,7 +1,7 @@ module m_cuda_backend !! CUDA backend implementing GPU-accelerated solver operations. !! - !! Extends base_backend_t with GPU kernel launches and device memory + !! Extends `base_backend_t` with GPU kernel launches and device memory !! management. Transport equations, tridiagonal solves, FFT operations, !! and field manipulations execute on GPU. !! diff --git a/src/backend/cuda/kernels/spectral_processing.f90 b/src/backend/cuda/kernels/spectral_processing.f90 index c8bd4256c..0a97f6cee 100644 --- a/src/backend/cuda/kernels/spectral_processing.f90 +++ b/src/backend/cuda/kernels/spectral_processing.f90 @@ -2,6 +2,7 @@ module m_cuda_spectral !! CUDA kernels for spectral space processing and FFT post-processing. !! !! This module contains kernels for: + !! !! - Post-processing spectral transforms (forward/backward) !! - Solving Poisson equations in spectral space !! 
- Enforcing and undoing periodicity in Y-direction diff --git a/src/backend/cuda/kernels/thomas.f90 b/src/backend/cuda/kernels/thomas.f90 index 79ab698b1..de1f023a2 100644 --- a/src/backend/cuda/kernels/thomas.f90 +++ b/src/backend/cuda/kernels/thomas.f90 @@ -6,8 +6,9 @@ module m_cuda_kernels_thom !! handles one pencil line through the domain. !! !! Variants: - !! - der_univ_thom: Non-periodic boundaries with explicit near-boundary stencils - !! - der_univ_thom_per: Periodic boundaries with cyclic reduction + !! + !! - `der_univ_thom`: Non-periodic boundaries with explicit near-boundary stencils + !! - `der_univ_thom_per`: Periodic boundaries with cyclic reduction use cudafor use m_common, only: dp diff --git a/src/backend/cuda/poisson_fft.f90 b/src/backend/cuda/poisson_fft.f90 index 43b062997..ca1772095 100644 --- a/src/backend/cuda/poisson_fft.f90 +++ b/src/backend/cuda/poisson_fft.f90 @@ -1,7 +1,7 @@ module m_cuda_poisson_fft !! FFT-based Poisson solver on GPU using cuFFT. !! - !! Extends poisson_fft_t with device-resident spectral data and cuFFT plans. + !! Extends `poisson_fft_t` with device-resident spectral data and cuFFT plans. !! Handles forward/backward transforms, spectral post-processing for different !! boundary conditions, and periodic extensions. use iso_c_binding, only: c_loc, c_ptr, c_f_pointer, c_int, c_float, & diff --git a/src/backend/cuda/sendrecv.f90 b/src/backend/cuda/sendrecv.f90 index f8d20575c..37ed2184e 100644 --- a/src/backend/cuda/sendrecv.f90 +++ b/src/backend/cuda/sendrecv.f90 @@ -9,8 +9,8 @@ module m_cuda_sendrecv !! Without GPU-aware MPI, the implementation may stage through host !! memory automatically, still functional but with additional overhead. !! - !! - sendrecv_fields: Single field halo exchange - !! - sendrecv_3fields: Batch exchange for three fields (velocity components + !! - `sendrecv_fields`: Single field halo exchange + !! - `sendrecv_3fields`: Batch exchange for three fields (velocity components !! or derivatives). 
Batching amortises MPI overhead and enables better !! network utilisation. use cudafor diff --git a/src/backend/cuda/tdsops.f90 b/src/backend/cuda/tdsops.f90 index d8fda1892..2ce9e47e8 100644 --- a/src/backend/cuda/tdsops.f90 +++ b/src/backend/cuda/tdsops.f90 @@ -1,7 +1,7 @@ module m_cuda_tdsops !! GPU-resident tridiagonal operator coefficients. !! - !! Extends base tdsops_t with device memory copies of all coefficient + !! Extends base `tdsops_t` with device memory copies of all coefficient !! arrays. One-time upload to GPU avoids repeated host-device transfers !! during kernel execution, critical for performance. use iso_fortran_env, only: stderr => error_unit diff --git a/src/backend/omp/backend.f90 b/src/backend/omp/backend.f90 index 9afbeec41..5a812b16f 100644 --- a/src/backend/omp/backend.f90 +++ b/src/backend/omp/backend.f90 @@ -6,11 +6,13 @@ module m_omp_backend !! abstract backend operations defined in `base_backend_t`. !! !! **Parallelisation Strategy:** + !! !! - **MPI**: Domain decomposition across nodes/processes !! - **OpenMP**: Thread parallelism within each MPI rank !! - **Hybrid MPI+OpenMP**: Enables efficient use of multi-core clusters !! !! **Key Features:** + !! !! - Compact finite difference operators (tridiagonal solves) !! - Halo exchange for distributed derivatives !! - FFT-based Poisson solver integration @@ -18,14 +20,16 @@ module m_omp_backend !! - Optimised data reordering between decomposition directions !! !! **Memory Management:** - !! - Send/receive buffers for MPI halo exchange (u, v, w, du, dud, d2u) + !! + !! - Send/receive buffers for MPI halo exchange (`u`, `v`, `w`, `du`, `dud`, `d2u`) !! - Buffers sized based on largest decomposition direction !! - Persistent buffers to avoid repeated allocation !! !! **Solver Operations:** - !! - transeq: Transport equation terms with halo exchange - !! - tds_solve: Tridiagonal system solves (Thomas algorithm) - !! - reorder: Data layout transformations (DIR_X, DIR_Y, DIR_Z) + !! + !! 
- `transeq`: Transport equation terms with halo exchange + !! - `tds_solve`: Tridiagonal system solves (Thomas algorithm) + !! - `reorder`: Data layout transformations (`DIR_X`, `DIR_Y`, `DIR_Z`) !! - Field operations: copy, add, multiply, integrate, etc. !! !! **Note:** This backend requires 2DECOMP&FFT library for FFT operations diff --git a/src/backend/omp/exec_dist.f90 b/src/backend/omp/exec_dist.f90 index 9c4018def..e56c3a61f 100644 --- a/src/backend/omp/exec_dist.f90 +++ b/src/backend/omp/exec_dist.f90 @@ -6,13 +6,15 @@ module m_omp_exec_dist !! sweeps, and boundary system solves for multi-process compact operators. !! !! **Key features:** + !! !! - Forward/backward elimination with boundary coupling !! - Non-blocking MPI communication for 2x2 boundary systems !! - OpenMP parallelisation over pencil groups !! - Fused kernels for transport equation efficiency !! !! **Distributed algorithm:** - !! 1. Forward/backward sweep on local domain $\rightarrow$ generate boundary systems + !! + !! 1. Forward/backward sweep on local domain \(\rightarrow\) generate boundary systems !! 2. MPI exchange boundary data between neighbours !! 3. Solve coupled 2x2 systems at process interfaces !! 4. Substitution sweep to complete solution diff --git a/src/backend/omp/exec_thom.f90 b/src/backend/omp/exec_thom.f90 index fb61485c1..575da1f8b 100644 --- a/src/backend/omp/exec_thom.f90 +++ b/src/backend/omp/exec_thom.f90 @@ -6,6 +6,7 @@ module m_exec_thom !! decomposed in the derivative direction (all data local to process). !! !! **Two variants:** + !! !! - **Non-periodic:** Standard Thomas with arbitrary boundary conditions !! - **Periodic:** Modified Thomas for cyclic tridiagonal systems !! diff --git a/src/backend/omp/kernels/distributed.f90 b/src/backend/omp/kernels/distributed.f90 index acb1e8024..8d0b85866 100644 --- a/src/backend/omp/kernels/distributed.f90 +++ b/src/backend/omp/kernels/distributed.f90 @@ -6,18 +6,21 @@ module m_omp_kernels_dist !! 
MPI ranks to compute derivatives near subdomain boundaries. !! !! **Key Features:** + !! !! - 9-point stencil compact schemes (4th-6th order accuracy) !! - Explicit vectorisation with OpenMP SIMD directives !! - Near and far boundary treatments for non-periodic domains !! - Forward and backward elimination phases for distributed solves !! !! **Kernels:** + !! !! - `der_univ_dist`: Universal derivative (1st/2nd) with halo exchange !! - `interpl_dist`: Interpolation from cell to vertices or vice versa !! !! **Distributed Algorithm:** !! Compact schemes couple neighbouring points via implicit systems. !! In distributed memory: + !! !! 1. Near-boundary points use special coefficients incorporating halo data !! 2. Interior points use standard bulk coefficients !! 3. Modified Thomas algorithm handles cross-process dependencies diff --git a/src/backend/omp/kernels/spectral_processing.f90 b/src/backend/omp/kernels/spectral_processing.f90 index 411024e88..8b00031b1 100644 --- a/src/backend/omp/kernels/spectral_processing.f90 +++ b/src/backend/omp/kernels/spectral_processing.f90 @@ -11,9 +11,10 @@ module m_omp_spectral !! **Reference:** JCP 228 (2009), 5989-6015, Section 4 !! !! **Processing steps:** - !! 1. Forward spectral equivalence transform (physical $\rightarrow$ modified wavenumbers) - !! 2. Solve: $\hat{\phi}_k = -\hat{f}_k / k^2$ - !! 3. Backward spectral equivalence transform (modified wavenumbers $\rightarrow$ physical) + !! + !! 1. Forward spectral equivalence transform (physical \(\rightarrow\) modified wavenumbers) + !! 2. Solve: \(\hat{\phi}_k = -\hat{f}_k / k^2\) + !! 3. Backward spectral equivalence transform (modified wavenumbers \(\rightarrow\) physical) use m_common, only: dp implicit none diff --git a/src/backend/omp/poisson_fft.f90 b/src/backend/omp/poisson_fft.f90 index 4266c5f5f..0434d7b98 100644 --- a/src/backend/omp/poisson_fft.f90 +++ b/src/backend/omp/poisson_fft.f90 @@ -1,16 +1,18 @@ module m_omp_poisson_fft !! 
FFT-based Poisson solver for OMP backend.
   !!
-  !! Solves $\nabla^2 \phi = f$ using spectral methods with 2DECOMP&FFT library.
+  !! Solves \(\nabla^2 \phi = f\) using spectral methods with 2DECOMP&FFT library.
   !! Transforms to Fourier space, solves diagonal system in spectral space,
   !! then transforms back to physical space.
   !!
   !! **Algorithm:**
-  !! 1. Forward FFT: physical $\rightarrow$ spectral space
-  !! 2. Spectral solve: $\phi_k = f_k / k^2$ (with modifications for boundary conditions)
-  !! 3. Backward FFT: spectral $\rightarrow$ physical space
+  !!
+  !! 1. Forward FFT: physical \(\rightarrow\) spectral space
+  !! 2. Spectral solve: \(\hat{\phi}_k = -\hat{f}_k / k^2\) (with modifications for boundary conditions)
+  !! 3. Backward FFT: spectral \(\rightarrow\) physical space
   !!
   !! **Boundary conditions:**
+  !!
   !! - (0,0,0): Periodic in all directions
   !! - (0,1,0): Dirichlet in Y, periodic in X/Z (uses symmetry transform)
   !!
diff --git a/src/case/base_case.f90 b/src/case/base_case.f90
index 3445a5345..8efadcaf8 100644
--- a/src/case/base_case.f90
+++ b/src/case/base_case.f90
@@ -4,6 +4,7 @@ module m_base_case
 !! This abstract base class provides the framework for implementing specific
 !! flow cases (channel, TGV, generic, etc.). New cases extend this class and
 !! override deferred procedures to specify:
+!!
 !! - **Initial conditions**: Set velocity and other field initial states
 !! - **Boundary conditions**: Apply physical boundary conditions each timestep
 !! - **Forcing terms**: Add body forces or model-specific source terms
@@ -11,7 +12,9 @@ module m_base_case
 !! - **Postprocessing**: Compute statistics, output diagnostics, etc.
 !!
 !! **Simulation Workflow:**
+!!
 !! The `run()` method orchestrates the time integration loop:
+!!
 !! 1. Apply boundary conditions
 !! 2. Advance solution one timestep via solver%step()
 !! 3. Write checkpoints/snapshots (via checkpoint_mgr)
@@ -19,13 +22,16 @@ module m_base_case
 !! 5. Repeat until final time reached
 !!
 !! 
**Time Integration:** + !! !! Each timestep involves multiple stages (for RK) or steps (for AB): + !! !! - Transport equation (transeq) computes velocity derivatives !! - Forcing terms applied after transeq !! - Pre-correction modifies velocity (e.g., for immersed boundaries) !! - Pressure correction enforces incompressibility !! !! **Restart Capability:** + !! !! The checkpoint manager handles restart from saved states automatically !! if a restart file is detected. use m_allocator, only: allocator_t @@ -77,7 +83,7 @@ subroutine initial_conditions(self) !! Abstract interface for setting initial conditions. !! !! Called once during initialisation to set the initial state of velocity - !! and scalar fields. Implementations should populate u, v, w (and species + !! and scalar fields. Implementations should populate \(u, v, w\) (and species !! if present) with case-appropriate initial values. import :: base_case_t implicit none @@ -275,10 +281,12 @@ subroutine print_div_max_mean(self, u, v, w) !! \[ \nabla \cdot \mathbf{u} = 0 \] !! !! This diagnostic reports: + !! !! - **Maximum divergence**: Largest local violation of incompressibility !! - **Mean divergence**: Volume-averaged divergence (should be near machine zero) !! !! **Purpose:** + !! !! - Monitor quality of pressure correction (divergence should be ~ 1e-10 or smaller) !! - Detect numerical issues (large divergence indicates solver problems) !! - Verify proper boundary condition implementation @@ -315,6 +323,7 @@ subroutine run(self) !! orchestrating all aspects of the simulation: !! !! **Each Timestep:** + !! !! 1. Apply boundary conditions !! 2. Compute derivatives and advance via time_integrator%step() !! 3. Handle checkpointing and snapshot output (via checkpoint_mgr) @@ -322,12 +331,14 @@ subroutine run(self) !! 5. Print diagnostics (divergence, enstrophy) !! !! **Time Integration Stages:** + !! !! For multi-stage methods (RK), each timestep involves multiple stages. !! 
The solver%step() method handles the stage-by-stage advancement, !! calling transeq, forcings, pre_correction, and pressure_correction !! at appropriate points. !! !! **Restart Support:** + !! !! If a restart file is detected, continues from the saved iteration !! and time rather than starting from t=0. implicit none diff --git a/src/case/channel.f90 b/src/case/channel.f90 index 7010e94e4..b20222234 100644 --- a/src/case/channel.f90 +++ b/src/case/channel.f90 @@ -6,21 +6,25 @@ module m_case_channel !! to maintain a target bulk velocity. !! !! **Flow Configuration:** - !! - Domain: Periodic in X and Z, wall-bounded in Y - !! - Walls at y = 0 and y = L_y with no-slip boundary conditions + !! + !! - Domain: Periodic in \(X\) and \(Z\), wall-bounded in \(Y\) + !! - Walls at \(y = 0\) and \(y = L_y\) with no-slip boundary conditions !! - Mean pressure gradient maintains constant bulk velocity !! - Optional rotation forcing (Coriolis-like terms) for rotating channel !! !! **Initial Conditions:** + !! !! - Parabolic base profile: \( u = 1 - y^2 \) !! - Random perturbations with configurable amplitude (noise parameter) !! - Perturbations concentrated near centreline for faster transition !! !! **Boundary Conditions:** - !! - No-slip walls: u = v = w = 0 at y = 0 and y = L_y + !! + !! - No-slip walls: \( u = v = w = 0 \) at \( y = 0 \) and \( y = L_y \) !! - Enforces mean bulk velocity via volume shift (simulates pressure gradient) !! !! **Forcing:** + !! !! - Mean pressure gradient (constant in time, via bulk velocity constraint) !! - Optional Coriolis forcing for rotating channel flows use iso_fortran_env, only: stderr => error_unit diff --git a/src/case/generic.f90 b/src/case/generic.f90 index a83fa3d47..a1659361a 100644 --- a/src/case/generic.f90 +++ b/src/case/generic.f90 @@ -2,7 +2,7 @@ module m_case_generic !! Generic freestream flow case for general-purpose simulations. !! !! This module provides a minimal template for setting up custom flow - !! cases. 
It implements a simple uniform freestream flow (u=1, v=0, w=0) + !! cases. It implements a simple uniform freestream flow (\(u=1, v=0, w=0\)) !! with no forcing or boundary corrections. !! !! **Use Cases:** @@ -12,7 +12,7 @@ module m_case_generic !! - Custom flow setups requiring minimal default behaviour !! !! **Default Configuration:** - !! - Initial condition: Uniform flow u=1, v=w=0 + !! - Initial condition: Uniform flow \(u=1, v=0, w=0\) !! - No boundary condition corrections !! - No forcing terms !! - No pre-correction diff --git a/src/case/tgv.f90 b/src/case/tgv.f90 index a9462ef08..bd094717a 100644 --- a/src/case/tgv.f90 +++ b/src/case/tgv.f90 @@ -5,12 +5,14 @@ module m_case_tgv !! Navier-Stokes solvers. It features an analytically-defined initial !! condition that transitions from laminar to turbulent flow, providing !! a rigorous test of: + !! !! - Spatial discretisation accuracy !! - Time integration stability !! - Energy conservation properties !! - Transition to turbulence physics !! !! **Initial Conditions:** + !! !! \[ u = \sin(x) \cos(y) \cos(z) \] !! \[ v = -\cos(x) \sin(y) \cos(z) \] !! \[ w = 0 \] @@ -19,15 +21,18 @@ module m_case_tgv !! in all three directions. !! !! **Domain:** + !! !! Typically \( [0, 2\pi]^3 \) with periodic boundary conditions in all directions. !! !! **Validation Metrics:** + !! !! - Kinetic energy decay rate !! - Enstrophy evolution !! - Dissipation rate !! - Vorticity dynamics !! !! **Reference:** + !! !! Taylor, G. I., & Green, A. E. (1937). Mechanism of the production of !! small eddies from large ones. Proc. R. Soc. Lond. A, 158(895), 499-521. use iso_fortran_env, only: stderr => error_unit diff --git a/src/common.f90 b/src/common.f90 index 73d5c349b..3c5c98ff0 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -2,6 +2,7 @@ module m_common !! Common module containing global constants, parameters, and utility functions. !! !! This module provides: + !! !! 
- Precision definitions (single or double precision based on compilation flags) !! - Mathematical constants (e.g., \(\pi\)) !! - Direction and reordering constants for domain decomposition diff --git a/src/config.f90 b/src/config.f90 index b164b3b90..ce28d6e98 100644 --- a/src/config.f90 +++ b/src/config.f90 @@ -96,9 +96,9 @@ module m_config subroutine read(self, nml_file, nml_string) !& !! Assigns the member variables either from a file or text source. !! - !! nml_file can be an absolute or relative path - !! nml_string is a character string that contains the namelist. - !! For example, nml_string="&foobar_nml foo=0, bar='this'/" + !! `nml_file` can be an absolute or relative path + !! `nml_string` is a character string that contains the namelist. + !! For example, `nml_string="&foobar_nml foo=0, bar='this'/"` import :: base_config_t class(base_config_t) :: self @@ -133,6 +133,8 @@ subroutine read_domain_nml(self, nml_file, nml_string) namelist /domain_settings/ flow_case_name, L_global, dims_global, & nproc_dir, BC_x, BC_y, BC_z, stretching, beta + !! Specifies the computational domain geometry, mesh resolution, boundary conditions, + !! and MPI decomposition for the simulation. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading domain config failed! & @@ -189,6 +191,8 @@ subroutine read_solver_nml(self, nml_file, nml_string) n_species, pr_species, lowmem_transeq, lowmem_fft, & time_intg, der1st_scheme, der2nd_scheme, interpl_scheme, & stagder_scheme, ibm_on + !! Specifies numerical solver settings including Reynolds number, time integration, + !! discretization schemes, and solver options for the Navier-Stokes equations. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading solver config failed! & @@ -241,6 +245,8 @@ subroutine read_channel_nml(self, nml_file, nml_string) integer :: n_rotate namelist /channel_nml/ noise, rotation, omega_rot, n_rotate + !! 
Specifies parameters specific to turbulent channel flow simulations, + !! including initial perturbations and optional rotation effects. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading channel config failed! & @@ -290,6 +296,8 @@ subroutine read_checkpoint_nml(self, nml_file, nml_string) namelist /checkpoint_params/ checkpoint_freq, snapshot_freq, & keep_checkpoint, checkpoint_prefix, snapshot_prefix, & restart_from_checkpoint, restart_file, output_stride, snapshot_sp + !! Specifies checkpoint and snapshot settings for simulation output and restart, + !! including file naming, frequency, and spatial output stride. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading checkpoint config failed! & &Provide only a file name or source, not both.' diff --git a/src/field.f90 b/src/field.f90 index af8b6fc89..7d2e69c4a 100644 --- a/src/field.f90 +++ b/src/field.f90 @@ -1,7 +1,7 @@ module m_field !! Field data structure module for managing computational grid data. !! - !! This module provides the field_t type for storing 3D scalar fields + !! This module provides the `field_t` type for storing 3D scalar fields !! on the computational grid. Fields can be organised in linked lists !! for memory management and support different data orientations !! (x-pencil, y-pencil, z-pencil). diff --git a/src/io/adios2/io.f90 b/src/io/adios2/io.f90 index d8b95c86a..613db079f 100644 --- a/src/io/adios2/io.f90 +++ b/src/io/adios2/io.f90 @@ -7,12 +7,14 @@ module m_io_backend !! from the session interface into specific ADIOS2 API calls. !! !! **Architecture:** + !! !! - Extends abstract base types from `m_io_base` !! - Implements all required I/O procedures (init, open, read, write, etc.) !! - Manages ADIOS2-specific objects (adios, io, engine) !! - Handles step-based I/O for time-series data !! !! **ADIOS2 Features Leveraged:** + !! !! - **Engine Abstraction**: Same API for different formats (BP4, BP5, HDF5) !! 
- **Asynchronous I/O**: Deferred transport mode overlaps computation and I/O !! - **MPI Integration**: Designed for large-scale parallel I/O @@ -20,6 +22,7 @@ module m_io_backend !! - **Hyperslab Selection**: Parallel distributed array I/O !! !! **Type Hierarchy:** + !! !! ``` !! io_base (abstract) !! |-- io_reader_t (abstract) diff --git a/src/io/checkpoint_manager.f90 b/src/io/checkpoint_manager.f90 index 0d4831be3..205f08d0e 100644 --- a/src/io/checkpoint_manager.f90 +++ b/src/io/checkpoint_manager.f90 @@ -6,26 +6,31 @@ module m_checkpoint_manager !! allows simulations to be stopped and resumed from the exact state. !! !! **Key Features:** + !! !! - Configuration via namelist (checkpoint frequency, prefix, etc.) !! - Periodic writing of full-resolution simulation state !! - Complete restart logic from specified checkpoint file !! - Safe-write strategy: temporary file then atomic rename !! - Optional cleanup of old checkpoints to conserve disk space - !! - Stores velocity fields (u, v, w), timestep, and simulation time + !! - Stores velocity fields (\(u, v, w\)), timestep, and simulation time !! !! **Safe-Write Strategy:** + !! !! To prevent corrupted checkpoints from crashes during write: - !! 1. Write to temporary file (e.g., checkpoint_0001000.tmp.bp) - !! 2. Atomic rename to final name (checkpoint_0001000.bp) - !! 3. Optionally delete previous checkpoint if keep_checkpoint=false + !! + !! 1. Write to temporary file (e.g., `checkpoint_0001000.tmp.bp`) + !! 2. Atomic rename to final name (`checkpoint_0001000.bp`) + !! 3. Optionally delete previous checkpoint if `keep_checkpoint=false` !! !! **Configuration:** + !! !! Controlled via `checkpoint_config_t` read from input namelist: - !! - checkpoint_freq: write interval (iterations) - !! - keep_checkpoint: retain all checkpoints vs overwrite old ones - !! - checkpoint_prefix: filename prefix - !! - restart_from_checkpoint: enable restart - !! - restart_file: checkpoint file to restart from + !! + !! 
- `checkpoint_freq`: write interval (iterations) + !! - `keep_checkpoint`: retain all checkpoints vs overwrite old ones + !! - `checkpoint_prefix`: filename prefix + !! - `restart_from_checkpoint`: enable restart + !! - `restart_file`: checkpoint file to restart from use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank, MPI_Abort use m_common, only: dp, i8, DIR_X, get_argument use m_field, only: field_t diff --git a/src/io/dummy/io.f90 b/src/io/dummy/io.f90 index b75bbc325..0e282fbbe 100644 --- a/src/io/dummy/io.f90 +++ b/src/io/dummy/io.f90 @@ -7,16 +7,19 @@ module m_io_backend !! I/O library. !! !! **Purpose:** + !! !! - Enables compilation without external I/O library dependencies !! - Provides informative error messages when I/O operations are attempted !! - Allows code structure to remain consistent regardless of I/O backend !! !! **Behaviour:** + !! !! - Write operations are silently ignored (no-op) !! - Read operations terminate with error message directing user to recompile !! - File open/close operations are tracked but perform no actual I/O !! !! **Use Cases:** + !! !! - Testing/debugging without I/O overhead !! - Systems where ADIOS2 is unavailable !! - Dry runs to validate simulation setup diff --git a/src/io/io_field_utils.f90 b/src/io/io_field_utils.f90 index be8911a7b..8ebb3dc17 100644 --- a/src/io/io_field_utils.f90 +++ b/src/io/io_field_utils.f90 @@ -1,10 +1,11 @@ module m_io_field_utils -!! @brief Provides common utilities and helper routines for field I/O -!! operations +!! Common utilities and helper routines for field I/O operations. +!! +!! This module contains a collection of procedures and derived types that +!! handle the low-level tasks required for writing field data. +!! +!! **Primary functionalities:** !! -!! @details This module contains a collection of procedures and derived -!! types that handle the low-level tasks required for writing field data -!! Its primary functionalities include: !! 
- Data sub-sampling (striding) - applying a stride to data to reduce the !! size of the output files !! - Parallel I/O calculations - determining correct global shapes, @@ -27,14 +28,56 @@ module m_io_field_utils cleanup_field_buffers type :: field_buffer_map_t - ! Race-free field buffer mapping for async I/O operations. - ! Each field gets its own dedicated buffer to prevent data races - ! when multiple async write operations are in flight. + !! Named buffer for thread-safe asynchronous I/O operations. + !! + !! This type maps a field name to its dedicated memory buffer, preventing + !! data races when multiple asynchronous write operations are in flight + !! simultaneously. + !! + !! **Purpose:** + !! + !! During asynchronous I/O, fields are copied into persistent buffers that + !! remain valid while I/O operations execute in the background. Each field + !! gets its own buffer identified by name, ensuring: + !! + !! - **Thread safety**: No conflicts between concurrent writes + !! - **Data integrity**: Field data remains stable during async operations + !! - **Flexibility**: Supports strided/downsampled data for visualization + !! + !! **Workflow:** + !! + !! 1. `prepare_field_buffers`: Allocate buffers for all fields + !! 2. `write_single_field_to_buffer`: Copy field data into named buffer + !! 3. ADIOS2 writes from buffer (async, non-blocking) + !! 4. `cleanup_field_buffers`: Deallocate buffers when done + !! + !! **Components:** + !! + !! - `field_name`: Identifier for buffer lookup (e.g., "u", "v", "w", "p") + !! - `buffer`: 3D array holding field data (possibly strided) character(len=32) :: field_name real(dp), dimension(:, :, :), allocatable :: buffer end type field_buffer_map_t type :: field_ptr_t + !! Wrapper type for storing polymorphic field pointers in arrays. + !! + !! Fortran does not allow allocatable arrays of polymorphic pointers directly + !! (e.g., `class(field_t), pointer :: fields(:)`), so this wrapper type + !! 
enables creating arrays of field pointers: + !! + !! ```fortran + !! type(field_ptr_t), allocatable :: field_array(:) + !! ``` + !! + !! **Use cases:** + !! + !! - Managing multiple fields for I/O operations + !! - Storing references to velocity components (u, v, w) + !! - Building lists of fields to write/read simultaneously + !! + !! **Note:** Each `field_ptr_t` holds a pointer to a `field_t` object; + !! the pointer can be null if not yet associated. class(field_t), pointer :: ptr => null() end type field_ptr_t diff --git a/src/io/io_manager.f90 b/src/io/io_manager.f90 index 1093482a7..18dab197c 100644 --- a/src/io/io_manager.f90 +++ b/src/io/io_manager.f90 @@ -7,12 +7,14 @@ module m_io_manager !! delegates tasks to specialised checkpoint and snapshot managers. !! !! **Responsibilities:** + !! !! - Initialise checkpoint and snapshot managers !! - Coordinate restart from checkpoints !! - Orchestrate periodic checkpoint and snapshot writes !! - Finalise I/O operations and clean up resources !! !! **Usage Pattern:** + !! !! ```fortran !! type(io_manager_t) :: io_mgr !! call io_mgr%init(comm) diff --git a/src/io/io_session.f90 b/src/io/io_session.f90 index 9eae14f7b..21c0de7e7 100644 --- a/src/io/io_session.f90 +++ b/src/io/io_session.f90 @@ -1,12 +1,12 @@ module m_io_session -!! @brief Provides high-level, session-based user interface for all I/O -!! operations +!! High-level, session-based user interface for all I/O operations. !! -!! @details This module is the sole entry point for file reading and writing. +!! This module is the sole entry point for file reading and writing. !! It abstracts away all backend details and provides a type-safe interface !! for all I/O tasks. !! -!! Key features: +!! **Key features:** +!! !! - Type-safe sessions: specialised `reader_session_t` and `writer_session_t` !! types for reading and writing operations, respectively. !! - Automatic backend selection: based on compile-time options @@ -16,10 +16,11 @@ module m_io_session !! 
`open -> read/write -> close` workflow, with no need for manual file handle !! management or explicit cleanup calls. !! -!! @example -!! A typical usage pattern for reading data and writing data: +!! **Usage Example:** +!! +!! A typical usage pattern for reading and writing data: !! -!! @code{.f90} +!! ```fortran !! use m_io_session, only: writer_session_t, reader_session_t !! !! implicit none @@ -39,9 +40,9 @@ module m_io_session !! call reader%read_data("temperature", temp_field) !! call reader%close() !! ! Note: reader is automatically cleaned up when it goes out of scope -!! @endcode +!! ``` !! -!! @note Users should only use the types provided by this module. The lower-level +!! **Note:** Users should only use the types provided by this module. The lower-level !! modules like `m_io_base` and `m_io_backend` are internal components and should !! never be used directly in user code. use m_common, only: dp, i8 @@ -68,16 +69,19 @@ module m_io_session procedure :: close => session_base_close end type io_session_base_t - !> **PRIMARY TYPE FOR READING DATA** - Use this for all file reading operations + !> PRIMARY TYPE FOR READING DATA - Use this for all file reading operations !! This is the only interface users should use for reading data. !! Provides type-safe reading operations with automatic backend selection. !! - !! Usage example: + !! **Usage example:** + !! + !! ```fortran !! type(reader_session_t) :: reader_session !! call reader_session%open("checkpoint.bp", MPI_COMM_WORLD) !! call reader_session%read_data("timestep", timestep) !! call reader_session%read_data("velocity_u", u_field, start_dims, count_dims) !! call reader_session%close() + !! 
``` type, extends(io_session_base_t) :: reader_session_t private class(io_reader_t), allocatable :: reader @@ -94,18 +98,20 @@ module m_io_session final :: reader_session_finaliser end type reader_session_t - !> **PRIMARY TYPE FOR WRITING DATA** - Use this for all file writing operations + !> PRIMARY TYPE FOR WRITING DATA - Use this for all file writing operations !! This is the only interface users should use for writing data. !! Provides type-safe writing operations with automatic backend selection. !! - !! Usage example: - !! type(writer_session_t) :: writer_session - !! call writer_session%open("output.bp", MPI_COMM_WORLD) - !! call writer_session%write_data("timestep", current_step) - !! call writer_session%write_data("pressure", p_field, start_dims, count_dims) - !! call writer_session%close() - !! call writer_session%write_attribute("ParaView", "vtk_xml_content") - !! call writer_session%close() + !! **Usage example:** + !! + !! ```fortran + !! type(writer_session_t) :: writer_session + !! call writer_session%open("output.bp", MPI_COMM_WORLD) + !! call writer_session%write_data("timestep", current_step) + !! call writer_session%write_data("pressure", p_field, start_dims, count_dims) + !! call writer_session%close() + !! call writer_session%write_attribute("ParaView", "vtk_xml_content") + !! ``` type, extends(io_session_base_t) :: writer_session_t private class(io_writer_t), allocatable :: writer diff --git a/src/io/snapshot_manager.f90 b/src/io/snapshot_manager.f90 index ee2c2f8ae..1d03052a0 100644 --- a/src/io/snapshot_manager.f90 +++ b/src/io/snapshot_manager.f90 @@ -7,6 +7,7 @@ module m_snapshot_manager !! sufficient resolution for visualisation. !! !! **Key Differences from Checkpoints:** + !! !! - **Purpose**: Visualisation/analysis vs exact restart !! - **Resolution**: Can be strided (e.g., every 2nd point) vs full resolution !! - **Frequency**: Typically more frequent than checkpoints @@ -14,16 +15,19 @@ module m_snapshot_manager !! 
separate files per checkpoint !! !! **Features:** + !! !! - Configurable spatial striding to reduce output size !! - Persistent file handle (stays open across multiple writes) !! - Generates VTK-compatible XML for ParaView visualisation !! - Writes velocity fields at each snapshot interval !! !! **Configuration:** + !! !! Controlled via `checkpoint_config_t` read from input namelist: - !! - snapshot_freq: write interval (iterations) - !! - snapshot_prefix: filename prefix - !! - output_stride: spatial stride factors [sx, sy, sz] + !! + !! - `snapshot_freq`: write interval (iterations) + !! - `snapshot_prefix`: filename prefix + !! - `output_stride`: spatial stride factors [`sx`, `sy`, `sz`] use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank use m_common, only: dp, i8, DIR_C, VERT, get_argument use m_field, only: field_t @@ -222,6 +226,7 @@ subroutine generate_vtk_xml(self, dims, fields, origin, spacing) !! ADIOS2 files in ParaView without conversion. !! !! **VTK ImageData Format:** + !! !! - Defines structured rectilinear grid with uniform spacing !! - Extent: grid dimensions from 0 to N-1 in (z,y,x) order !! - Origin: physical coordinates of first grid point @@ -367,6 +372,7 @@ subroutine open_snapshot_file(self, filename, comm) !! the same file, enabling efficient time-series visualization. !! !! **Persistent File Strategy:** + !! !! - File opened once at first snapshot !! - Remains open for subsequent snapshots (append mode) !! - Each write adds a new timestep to the file diff --git a/src/mesh.f90 b/src/mesh.f90 index f83694b34..a9b625654 100644 --- a/src/mesh.f90 +++ b/src/mesh.f90 @@ -1,7 +1,7 @@ module m_mesh !! Mesh module providing high-level mesh management and query functions. !! - !! This module defines the mesh_t type which aggregates geometry, grid, and + !! This module defines the `mesh_t` type which aggregates geometry, grid, and !! parallel decomposition information. It provides methods to query mesh !! 
dimensions, coordinates, and other mesh properties for both global and !! local (per MPI rank) domains. diff --git a/src/mesh_content.f90 b/src/mesh_content.f90 index 8207edeaa..43012f705 100644 --- a/src/mesh_content.f90 +++ b/src/mesh_content.f90 @@ -2,9 +2,10 @@ module m_mesh_content !! Module containing mesh content types for geometry, grid, and parallel decomposition. !! !! This module defines three main types: - !! - geo_t: Geometry information including coordinates and mesh stretching - !! - grid_t: Grid dimensions and boundary conditions - !! - par_t: Parallel domain decomposition information + !! + !! - `geo_t`: Geometry information including coordinates and mesh stretching + !! - `grid_t`: Grid dimensions and boundary conditions + !! - `par_t`: Parallel domain decomposition information use m_common, only: dp, pi implicit none diff --git a/src/module/ibm.f90 b/src/module/ibm.f90 index c24445625..8f6d019b7 100644 --- a/src/module/ibm.f90 +++ b/src/module/ibm.f90 @@ -1,10 +1,10 @@ module m_ibm !! This module implements the IBM capabilities. !! -!! When iibm = 0, the IBM object is never used. +!! When `iibm = 0`, the IBM object is never used. !! -!! When iibm = 1, the basic IBM capability is used. -!! It only requires ep1, a 3D field, as input. +!! When `iibm = 1`, the basic IBM capability is used. +!! It only requires `ep1`, a 3D field, as input. !! This field should be one (zero) in the fluid (solid) !! domain. use iso_fortran_env, only: stderr => error_unit @@ -25,6 +25,45 @@ module m_ibm integer, parameter :: iibm_basic = 1 type :: ibm_t + !! Immersed Boundary Method (IBM) for simulating flow around solid bodies. + !! + !! The IBM approach enables simulation of flows with complex solid geometries + !! without requiring body-fitted meshes. Instead, the solid geometry is + !! represented by a masking field (`ep1`) on a Cartesian grid. + !! + !! **Current Implementation (iibm = 1):** + !! + !! 
The basic IBM enforces zero velocity inside solid regions by multiplying + !! velocity components with the mask field `ep1`: + !! + !! - `ep1 = 1` in fluid regions → velocity unchanged + !! - `ep1 = 0` in solid regions → velocity set to zero + !! + !! This is applied before the pressure solve to ensure the divergence-free + !! constraint is satisfied only in the fluid domain. + !! + !! **Mask Field (ep1):** + !! + !! The `ep1` field defines the fluid/solid interface: + !! + !! - Values of 1.0 indicate fluid cells (no modification) + !! - Values of 0.0 indicate solid cells (velocity zeroed) + !! - Intermediate values (0 < ep1 < 1) represent interface cells + !! + !! **Future Extensions:** + !! + !! The current implementation sets velocity to zero in solid regions. + !! A more accurate IBM would set velocity to \(\Delta t \nabla p^n\) + !! before the pressure solve, then subtract \(\Delta t \nabla p^{n+1}\) + !! after reconstruction to properly enforce boundary conditions. + !! + !! **Components:** + !! + !! - `backend`: Computational backend for field operations + !! - `mesh`: Grid information + !! - `host_allocator`: Memory allocator for field storage + !! - `iibm`: IBM mode (0 = disabled, 1 = basic IBM) + !! - `ep1`: Mask field (1 in fluid, 0 in solid) class(base_backend_t), pointer :: backend => null() class(mesh_t), pointer :: mesh => null() type(allocator_t), pointer :: host_allocator => null() diff --git a/src/ordering.f90 b/src/ordering.f90 index 0d9a7d466..7c1d45c87 100644 --- a/src/ordering.f90 +++ b/src/ordering.f90 @@ -2,8 +2,8 @@ module m_ordering !! Module for index conversion between application storage and Cartesian layouts. !! !! This module provides functions to convert between directional "application storage" - !! indices (optimised for cache locality) and Cartesian (i,j,k) indices. The application - !! storage layout arranges data in blocks oriented along a specific direction (X, Y, or Z) + !! 
indices (optimised for cache locality) and Cartesian \( (i,j,k) \) indices. The application + !! storage layout arranges data in blocks oriented along a specific direction ( \( X, Y \), or \( Z \) ) !! to improve memory access patterns during computations. use m_common, only: dp, get_dirs_from_rdr, DIR_X, DIR_Y, DIR_Z, DIR_C diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index 97fd11a32..937f42ff6 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -6,27 +6,31 @@ module m_poisson_fft !! \[ \nabla^2 \phi = f \] !! !! **Solution Strategy:** + !! !! 1. **Forward FFT**: Transform RHS from physical to spectral space !! 2. **Spectral division**: Solve algebraically using wave numbers: !! \( \hat{\phi} = \hat{f} / k^2 \) !! 3. **Backward FFT**: Transform solution back to physical space !! !! **Boundary Condition Support:** + !! !! - **Periodic (000)**: Fully periodic in all directions (standard FFT) - !! - **Mixed (010)**: Periodic in X/Z, non-periodic in Y (requires special handling) + !! - **Mixed (010)**: Periodic in \( X/Z \), non-periodic in \( Y \) (requires special handling) !! !! **Grid Stretching:** - !! - Uniform grids in X and Z (required for FFT) - !! - Y-direction stretching supported for 010 BCs via transformation matrices + !! + !! - Uniform grids in \( X \) and \( Z \) (required for FFT) + !! - \( Y \)-direction stretching supported for `010` BCs via transformation matrices !! - Stretching handled through spectral equivalence constants !! !! **Parallel Implementation:** - !! - Pencil decomposition in Y and Z directions (X must be undivided) + !! + !! - Pencil decomposition in \( Y \) and \( Z \) directions (\( X \) must be undivided) !! - Spectral space operations on permuted/transposed data layouts !! - Backend-specific FFT implementations (CPU/GPU) !! !! The module is abstract; concrete implementations provide FFT routines - !! via deferred procedures (fft_forward, fft_backward, fft_postprocess). + !! 
via deferred procedures (`fft_forward`, `fft_backward`, `fft_postprocess`). use m_common, only: dp, pi, CELL use m_field, only: field_t use m_mesh, only: mesh_t, geo_t diff --git a/src/solver.f90 b/src/solver.f90 index 61342bab8..dd5b5b8f2 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -3,11 +3,12 @@ module m_solver !! !! This module provides the high-level solver infrastructure for solving !! incompressible Navier-Stokes equations using compact finite differences. - !! The solver orchestrates the transport equation (transeq), divergence, + !! The solver orchestrates the transport equation (`transeq`), divergence, !! Poisson solver, and gradient operations required for the fractional-step !! projection method. !! !! The implementation supports: + !! !! - Multiple backend executors (CPU/GPU) !! - Distributed and Thomas algorithm for derivatives !! - Immersed boundary method (IBM) diff --git a/src/tdsops.f90 b/src/tdsops.f90 index cd492979f..09e68aa66 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -4,21 +4,24 @@ module m_tdsops !! This module provides preprocessed tridiagonal operator arrays for !! solving compact finite difference schemes. It supports both distributed !! and Thomas algorithm implementations for computing: + !! !! - First and second derivatives !! - Interpolation between vertex and cell-centre grids !! - Staggered derivatives !! !! The operators are preprocessed based on: + !! !! - Grid spacing and optional stretching !! - Boundary conditions (periodic, Neumann, Dirichlet) !! - Numerical scheme (compact schemes of various orders) !! - Symmetry properties for free-slip boundaries !! !! The distributed algorithm is designed for parallel execution and consists of: - !! 1. Forward/backward elimination phase (dist_fw, dist_bw) - !! 2. Back-substitution phase (dist_sa, dist_sc) !! - !! The Thomas algorithm (thom_*) is used for serial execution or + !! 1. Forward/backward elimination phase (`dist_fw`, `dist_bw`) + !! 2. 
Back-substitution phase (`dist_sa`, `dist_sc`) + !! + !! The Thomas algorithm (`thom_*`) is used for serial execution or !! when the distributed approach is not suitable. use iso_fortran_env, only: stderr => error_unit diff --git a/src/time_integrator.f90 b/src/time_integrator.f90 index 97cc61ae9..1bcf2bdf5 100644 --- a/src/time_integrator.f90 +++ b/src/time_integrator.f90 @@ -15,7 +15,7 @@ module m_time_integrator !! AB3, AB4. These methods are more memory-efficient than RK schemes !! for the same order of accuracy. !! - !! The time_intg_t type encapsulates all integration state and provides + !! The `time_intg_t` type encapsulates all integration state and provides !! a unified interface through the step procedure pointer, which routes !! to either runge_kutta() or adams_bashforth() based on the selected method. !! @@ -31,6 +31,45 @@ module m_time_integrator private adams_bashforth, runge_kutta type :: time_intg_t + !! Time integrator for explicit multi-step and multi-stage methods. + !! + !! This type encapsulates all data and methods needed for time integration + !! of ordinary differential equations (ODEs) arising from spatial discretization + !! of the Navier-Stokes equations: + !! + !! \[ + !! \frac{d\mathbf{u}}{dt} = \mathbf{F}(\mathbf{u}, t) + !! \] + !! + !! where \(\mathbf{F}\) represents the spatial operators (advection, diffusion, + !! pressure gradient, etc.). + !! + !! **Supported Methods:** + !! + !! - **Adams-Bashforth (AB1-AB4)**: Explicit multi-step methods using + !! previous timestep derivatives. Efficient (single evaluation per step) + !! but requires startup procedure for higher orders. + !! - **Runge-Kutta (RK1-RK4)**: Explicit multi-stage methods using + !! intermediate stages within a timestep. Self-starting but requires + !! multiple evaluations per step. + !! + !! **Method Selection:** + !! + !! The `step` procedure pointer is bound at initialization to either + !! `runge_kutta()` or `adams_bashforth()` based on the method name + !! 
(e.g., "AB3" or "RK4"), enabling polymorphic time stepping. + !! + !! **Data Management:** + !! + !! - **AB methods**: Store previous timestep derivatives in `olds` array, + !! rotated each timestep to maintain history + !! - **RK methods**: Store intermediate stage solutions in `olds` array, + !! overwritten within each timestep + !! + !! **Startup Procedure (AB only):** + !! + !! Higher-order AB methods (AB2-AB4) ramp up from first-order during initial + !! timesteps until sufficient derivative history is available. integer :: method !! Integration method identifier (unused, kept for compatibility) integer :: istep !! Current timestep number (for AB startup ramping) integer :: istage !! Current stage number within timestep (RK only) diff --git a/src/vector_calculus.f90 b/src/vector_calculus.f90 index ce1110f1e..625c22ec4 100644 --- a/src/vector_calculus.f90 +++ b/src/vector_calculus.f90 @@ -1,5 +1,5 @@ module m_vector_calculus - !! Vector calculus operators for finite-difference CFD. + !! Vector calculus operators for finite-difference. !! !! This module provides implementations of fundamental differential operators !! (divergence, gradient, curl, Laplacian) on staggered and collocated grids. @@ -7,22 +7,25 @@ module m_vector_calculus !! from the tdsops module. !! !! **Key Features:** + !! !! - **Staggered grid support**: Operators handle transitions between cell centres - !! (CELL) and vertices (VERT) through staged derivatives and interpolation + !! (`CELL`) and vertices (`VERT`) through staged derivatives and interpolation !! - **Data reordering**: Automatically manages pencil decomposition, reordering - !! fields between X, Y, Z orientations as needed for derivatives + !! fields between \( X, Y, Z \) orientations as needed for derivatives !! - **Memory efficiency**: Uses allocator blocks for temporary fields with !! careful release management to minimise memory footprint !! !! **Grid Conventions:** - !! 
- CELL (data_loc=CELL): Variables stored at cell centres (e.g., pressure) - !! - VERT (data_loc=VERT): Variables stored at cell vertices (e.g., velocity) - !! - Staggered operators (v2c, c2v) transition between these locations + !! + !! - `CELL` (`data_loc=CELL`): Variables stored at cell centres (e.g., pressure) + !! - `VERT` (`data_loc=VERT`): Variables stored at cell vertices (e.g., velocity) + !! - Staggered operators (`v2c`, `c2v`) transition between these locations !! !! **Data Layouts:** - !! - DIR_X: Pencil decomposed in X direction (default for most operations) - !! - DIR_Y: Pencil decomposed in Y direction (for Y derivatives) - !! - DIR_Z: Pencil decomposed in Z direction (for Z derivatives) + !! + !! - `DIR_X`: Pencil decomposed in \( X \) direction (default for most operations) + !! - `DIR_Y`: Pencil decomposed in \( Y \) direction (for Y derivatives) + !! - `DIR_Z`: Pencil decomposed in \( Z \) direction (for Z derivatives) use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t diff --git a/src/xcompact.f90 b/src/xcompact.f90 index e1d091c9b..663d75f14 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -4,12 +4,14 @@ program xcompact !! X3D2 is a high-order finite-difference incompressible Navier-Stokes !! solver based on Xcompact3D/Incompact3D. It solves the incompressible !! Navier-Stokes equations using: + !! !! - **Compact finite differences** for spatial derivatives (4th-6th order) !! - **Fractional-step method** for pressure-velocity coupling !! - **FFT-based or iterative Poisson solvers** for pressure !! - **Explicit time integration** (Runge-Kutta or Adams-Bashforth) !! !! **Program Flow:** + !! !! 1. Initialise MPI and determine rank/size !! 2. Select computational backend (CUDA GPU or OpenMP CPU) !! 3. Read configuration from input file (domain and solver parameters) @@ -20,16 +22,19 @@ program xcompact !! 8. Report timing and finalise MPI !! !! **Backend Options:** + !! !! 
- **CUDA**: GPU acceleration via NVIDIA CUDA (compile with -DCUDA) !! - **OMP**: CPU parallelism via OpenMP threading !! !! **Input:** Namelist file specified as command-line argument (e.g., input.x3d) !! !! **Domain Decomposition:** + !! !! X3D2 supports two decomposition strategies: + !! !! - **2DECOMP&FFT**: External library used when FFT Poisson solver + OMP backend. !! Provides optimised pencil decomposition and FFT transforms. Cannot decompose - !! in X-direction (nproc_dir(1) must be 1). + !! in X-direction (`nproc_dir(1)` must be 1). !! - **Generic**: Built-in X3D2 decomposition used for CUDA backend or when !! 2DECOMP&FFT is unavailable. Can decompose in any direction (X, Y, Z). !! From 57a0a3324a30eb2a62f91b652501ab747d3e1fe0 Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 29 Jan 2026 19:10:04 +0000 Subject: [PATCH 11/12] fix fprettify issues --- src/backend/cuda/backend.f90 | 3 ++- src/backend/cuda/kernels/thomas.f90 | 3 ++- src/vector_calculus.f90 | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/backend/cuda/backend.f90 b/src/backend/cuda/backend.f90 index 292c8a7cd..94e09a59f 100644 --- a/src/backend/cuda/backend.f90 +++ b/src/backend/cuda/backend.f90 @@ -881,7 +881,8 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) !! Copy boundary data into MPI send buffers using CUDA kernel. implicit none - real(dp), device, dimension(:, :, :), intent(out) :: u_send_s_dev, u_send_e_dev !! Send buffers + real(dp), device, dimension(:, :, :), intent(out) :: u_send_s_dev, & + u_send_e_dev !! Send buffers real(dp), device, dimension(:, :, :), intent(in) :: u_dev !! Source field integer, intent(in) :: n !! 
Grid dimension diff --git a/src/backend/cuda/kernels/thomas.f90 b/src/backend/cuda/kernels/thomas.f90 index de1f023a2..af9522f7a 100644 --- a/src/backend/cuda/kernels/thomas.f90 +++ b/src/backend/cuda/kernels/thomas.f90 @@ -33,7 +33,8 @@ attributes(global) subroutine der_univ_thom( & integer, value, intent(in) :: n_tds, n_rhs !! Number of unknowns and RHS points real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Start/end explicit stencil coefficients real(dp), device, intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients (9-point) - real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, strch !! Thomas algorithm coefficients and stretching + real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, & + thom_w, strch !! Thomas algorithm coefficients and stretching integer :: i, j, b !! Thread, loop, and block indices diff --git a/src/vector_calculus.f90 b/src/vector_calculus.f90 index 625c22ec4..411332b7f 100644 --- a/src/vector_calculus.f90 +++ b/src/vector_calculus.f90 @@ -180,7 +180,7 @@ subroutine divergence_v2c(self, div_u, u, v, w, & !! Compute divergence of a vector field from vertices to cell centres. !! !! Computes: - !! \[ \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + + !! \[ \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + !! \frac{\partial v}{\partial y} + \frac{\partial w}{\partial z} \] !! !! Input velocity components (u, v, w) are at vertices (VERT), and @@ -298,7 +298,7 @@ subroutine gradient_c2v(self, dpdx, dpdy, dpdz, p, & !! Compute gradient of a scalar field from cell centres to vertices. !! !! Computes: - !! \[ \nabla p = \left( \frac{\partial p}{\partial x}, + !! \[ \nabla p = \left( \frac{\partial p}{\partial x}, !! \frac{\partial p}{\partial y}, \frac{\partial p}{\partial z} \right) \] !! !! Input pressure p is at cell centres (CELL), and gradient components @@ -394,7 +394,7 @@ subroutine laplacian(self, lapl_u, u, x_der2nd, y_der2nd, z_der2nd) !! 
Compute Laplacian of a scalar field. !! !! Computes: - !! \[ \nabla^2 u = \frac{\partial^2 u}{\partial x^2} + + !! \[ \nabla^2 u = \frac{\partial^2 u}{\partial x^2} + !! \frac{\partial^2 u}{\partial y^2} + \frac{\partial^2 u}{\partial z^2} \] !! !! The Laplacian is evaluated at the same grid location (CELL or VERT) From 625dc597e8002c55df69c25a9cd10b8eda145ac0 Mon Sep 17 00:00:00 2001 From: Irufan Ahmed Date: Thu, 12 Feb 2026 16:16:48 +0000 Subject: [PATCH 12/12] doc: fix erroneous comments on bcs --- src/backend/cuda/poisson_fft.f90 | 14 +++++++------- src/backend/omp/kernels/spectral_processing.f90 | 6 +++--- src/backend/omp/poisson_fft.f90 | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/backend/cuda/poisson_fft.f90 b/src/backend/cuda/poisson_fft.f90 index ca1772095..5b421a3ef 100644 --- a/src/backend/cuda/poisson_fft.f90 +++ b/src/backend/cuda/poisson_fft.f90 @@ -424,10 +424,10 @@ subroutine fft_backward_cuda(self, f) end subroutine fft_backward_cuda subroutine fft_postprocess_000_cuda(self) - !! Post-process spectral data for Dirichlet-Dirichlet-Dirichlet boundaries. + !! Post-process spectral data for Periodic-Periodic-Periodic boundaries. !! !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with homogeneous - !! Dirichlet boundaries in all directions. + !! Periodic boundaries in all directions. implicit none class(cuda_poisson_fft_t) :: self @@ -467,10 +467,10 @@ subroutine fft_postprocess_000_cuda(self) end subroutine fft_postprocess_000_cuda subroutine fft_postprocess_010_cuda(self) - !! Post-process spectral data for Dirichlet-Neumann-Dirichlet boundaries. + !! Post-process spectral data for Periodic-Non-periodic-Periodic boundaries. !! - !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with Dirichlet - !! boundaries in X and Z, Neumann in Y. Handles stretched meshes with + !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with Periodic + !! 
boundaries in X and Z, non-periodic in Y. Handles stretched meshes with !! matrix solves in spectral space. implicit none @@ -578,10 +578,10 @@ subroutine fft_postprocess_010_cuda(self) end subroutine fft_postprocess_010_cuda subroutine enforce_periodicity_y_cuda(self, f_out, f_in) - !! Enforce periodic extension in Y for Neumann boundaries. + !! Enforce periodic extension in Y for non-periodic boundaries. !! !! Extends field from physical domain size to doubled periodic domain - !! by symmetry (f(y+L) = f(L-y)) for Neumann boundary FFTs. + !! by symmetry (f(y+L) = f(L-y)) for non-periodic boundary FFTs. implicit none class(cuda_poisson_fft_t) :: self diff --git a/src/backend/omp/kernels/spectral_processing.f90 b/src/backend/omp/kernels/spectral_processing.f90 index 8b00031b1..fbc188928 100644 --- a/src/backend/omp/kernels/spectral_processing.f90 +++ b/src/backend/omp/kernels/spectral_processing.f90 @@ -3,7 +3,7 @@ module m_omp_spectral !! !! Provides kernels for solving Poisson equation in Fourier space with !! spectral equivalence transformations. Handles different boundary - !! condition combinations: fully periodic (000) and Dirichlet in Y (010). + !! condition combinations: fully periodic (000) and non-periodic in Y (010). !! !! **Spectral equivalence:** Modified wavenumbers for finite-difference !! grid (Lele 1992). Ensures spectral solver matches compact FD schemes. @@ -139,7 +139,7 @@ subroutine process_spectral_010( & ) !! Solve Poisson in spectral space for (0,1,0) boundary conditions. !! - !! Processes Dirichlet in Y, periodic in X and Z. Uses sine series + !! Processes non-periodic in Y, periodic in X and Z. Uses sine series !! in Y-direction (symmetry/antisymmetry transform) combined with !! Fourier in X and Z. !! @@ -152,7 +152,7 @@ subroutine process_spectral_010( & !! 6. Backward spectral equivalence in X and Z !! !! **Y-direction:** Sine series requires special symmetric processing - !! to maintain real-valued solution with Dirichlet BCs. + !! 
to maintain real-valued solution with non-periodic BCs. !! !! **Ref.** JCP 228 (2009), 5989–6015, Sec 4 implicit none diff --git a/src/backend/omp/poisson_fft.f90 b/src/backend/omp/poisson_fft.f90 index 0434d7b98..db5241803 100644 --- a/src/backend/omp/poisson_fft.f90 +++ b/src/backend/omp/poisson_fft.f90 @@ -14,7 +14,7 @@ module m_omp_poisson_fft !! **Boundary conditions:** !! !! - (0,0,0): Periodic in all directions - !! - (0,1,0): Dirichlet in Y, periodic in X/Z (uses symmetry transform) + !! - (0,1,0): Non-periodic in Y, periodic in X/Z (uses symmetry transform) !! !! **Parallelisation:** MPI via 2DECOMP&FFT pencil decomposition !! @@ -42,7 +42,7 @@ module m_omp_poisson_fft procedure :: fft_backward => fft_backward_omp !! Transform to physical space procedure :: fft_postprocess_000 => fft_postprocess_000_omp !! Spectral solve for (0,0,0) BCs procedure :: fft_postprocess_010 => fft_postprocess_010_omp !! Spectral solve for (0,1,0) BCs - procedure :: enforce_periodicity_y => enforce_periodicity_y_omp !! Symmetry transform for Y Dirichlet + procedure :: enforce_periodicity_y => enforce_periodicity_y_omp !! Symmetry transform for Y non-periodic procedure :: undo_periodicity_y => undo_periodicity_y_omp !! Inverse symmetry transform end type omp_poisson_fft_t @@ -152,7 +152,7 @@ end subroutine fft_postprocess_000_omp subroutine fft_postprocess_010_omp(self) !! Spectral solve for (0,1,0) boundary conditions. !! - !! Solves Poisson equation with Dirichlet BCs in Y-direction, + !! Solves Poisson equation with non-periodic BCs in Y-direction, !! periodic in X and Z. Uses modified wavenumbers accounting for !! symmetry transformation (sine series in Y). !! @@ -171,11 +171,11 @@ subroutine fft_postprocess_010_omp(self) end subroutine fft_postprocess_010_omp subroutine enforce_periodicity_y_omp(self, f_out, f_in) - !! Apply symmetry transform for Y Dirichlet boundary conditions. + !! Apply symmetry transform for Y non-periodic boundary conditions. !! !! 
Converts physical field to symmetric/antisymmetric representation !! suitable for sine series FFT. Used before forward FFT when Y-direction - !! has Dirichlet (non-periodic) BCs. + !! has non-periodic BCs. !! !! **Transformation:** Maps domain to symmetric extension for sine basis. implicit none @@ -204,10 +204,10 @@ subroutine enforce_periodicity_y_omp(self, f_out, f_in) end subroutine enforce_periodicity_y_omp subroutine undo_periodicity_y_omp(self, f_out, f_in) - !! Inverse symmetry transform for Y Dirichlet boundary conditions. + !! Inverse symmetry transform for Y non-periodic boundary conditions. !! !! Converts symmetric/antisymmetric representation back to physical - !! field. Used after backward FFT when Y-direction has Dirichlet BCs. + !! field. Used after backward FFT when Y-direction has non-periodic BCs. !! !! **Transformation:** Extracts physical domain from symmetric extension. implicit none