diff --git a/src/allocator.f90 b/src/allocator.f90 index 698beda79..8a204f0ae 100644 --- a/src/allocator.f90 +++ b/src/allocator.f90 @@ -1,4 +1,11 @@ module m_allocator + !! Memory allocator module for managing field data blocks. + !! + !! This module provides an allocator type that manages a pool of memory blocks + !! (`field_t` objects) organised in a linked list. The allocator supports efficient + !! memory reuse by allowing blocks to be requested and released, minimizing + !! allocation/deallocation overhead during simulations. + use iso_fortran_env, only: stderr => error_unit use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C, NULL_LOC @@ -34,16 +41,18 @@ module m_allocator !! [[m_allocator(module):release_block(subroutine)]]. The !! released block is then pushed in front of the block list. - integer :: ngrid, sz - !> The id for the next allocated block. This counter is - !> incremented each time a new block is allocated. + integer :: ngrid !! Total number of grid points per block + integer :: sz !! Block size for data reordering + !> The ID for the next allocated block. This counter is + !! incremented each time a new block is allocated. integer :: next_id = 0 - !> padded dimensions and n_groups in all 'dir's + !> Padded dimensions in all directions [3 dims x 4 directions]. + !! Dimensions are padded based on block size for efficient reordering. integer, private :: dims_padded_dir(3, 4) + !> Number of groups for reordering in each direction [x, y, z]. integer, private :: n_groups_dir(3) - !> The pointer to the first block on the list. Non associated if - !> the list is empty - ! TODO: Rename first to head + !> Pointer to the first block on the linked list. Non-associated if + !! the list is empty. 
(TODO: Rename first to head) class(field_t), pointer :: first => null() contains procedure :: get_block @@ -62,8 +71,14 @@ module m_allocator contains function allocator_init(dims, sz) result(allocator) - integer, intent(in) :: dims(3), sz - type(allocator_t) :: allocator + !! Initialise an allocator for the given grid dimensions and block size. + !! + !! Creates a new allocator configured for the specified grid dimensions + !! with the given block size. Computes padded dimensions and number of + !! groups for efficient data reordering operations. + integer, intent(in) :: dims(3) !! Grid dimensions [nx, ny, nz] + integer, intent(in) :: sz !! Block size for reordering + type(allocator_t) :: allocator !! Initialised allocator integer :: nx, ny, nz, nx_padded, ny_padded, nz_padded @@ -205,21 +220,31 @@ function get_block_ids(self) end function get_block_ids function get_padded_dims(self, dir) result(dims) + !! Get padded dimensions for a specific direction. + !! + !! Returns the padded dimensions used for memory allocation in the + !! specified direction. Padding is applied to ensure efficient memory + !! access patterns and alignment. implicit none - class(allocator_t), intent(inout) :: self - integer, intent(in) :: dir - integer :: dims(3) + class(allocator_t), intent(inout) :: self !! Allocator object + integer, intent(in) :: dir !! Direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer :: dims(3) !! Padded dimensions [nx_pad, ny_pad, nz_pad] dims = self%dims_padded_dir(1:3, dir) end function get_padded_dims function get_n_groups(self, dir) result(n_groups) + !! Get number of groups for data reordering in a direction. + !! + !! Returns the number of groups used for data reordering operations + !! in the specified direction. Groups are determined by the block size + !! and grid dimensions. implicit none - class(allocator_t), intent(inout) :: self - integer, intent(in) :: dir - integer :: n_groups + class(allocator_t), intent(inout) :: self !! 
Allocator object + integer, intent(in) :: dir !! Direction (DIR_X, DIR_Y, or DIR_Z) + integer :: n_groups !! Number of groups n_groups = self%n_groups_dir(dir) end function get_n_groups diff --git a/src/backend/backend.f90 b/src/backend/backend.f90 index 4c10d2c74..5c73da7ba 100644 --- a/src/backend/backend.f90 +++ b/src/backend/backend.f90 @@ -1,4 +1,47 @@ module m_base_backend + !! Abstract base backend defining the computational interface for X3D2 solver. + !! + !! This module defines the `base_backend_t` abstract type, which establishes + !! the interface for all backend implementations (CUDA GPU, OpenMP CPU, etc.). + !! The solver operates exclusively through these abstract interfaces, enabling + !! complete architecture independence. + !! + !! **Architecture Pattern:** + !! + !! The backend abstraction follows the Strategy design pattern: + !! + !! - **Abstract interface** (`base_backend_t`): Defines deferred procedures for + !! all computational operations required by the solver + !! - **Concrete implementations**: CUDA backend (`m_cuda_backend`) and OMP + !! backend (`m_omp_backend`) extend this base and provide architecture-specific + !! implementations + !! - **Solver independence**: The solver (`m_solver`) calls backend methods + !! through the abstract interface without knowing the underlying implementation + !! + !! **Key Operations Defined:** + !! + !! - **Transport equation derivatives**: `transeq_x`, `transeq_y`, `transeq_z` + !! compute directional derivatives with halo exchange for distributed compact schemes + !! - **Tridiagonal solves**: `tds_solve` applies compact finite difference operators + !! - **Data reordering**: `reorder` transforms data between pencil decomposition + !! orientations (X, Y, Z directions) + !! - **Field operations**: Vector arithmetic (`veccopy`, `vecadd`, `vecmult`), + !! reductions (`scalar_product`, `field_volume_integral`), and utilities + !! (`field_scale`, `field_shift`, `field_set_face`) + !! 
- **Summation**: `sum_yintox`, `sum_zintox` for integrating fields along + !! specific directions + !! + !! **Backend Implementations:** + !! + !! - **CUDA backend** (`src/backend/cuda/backend.f90`): GPU-accelerated using + !! NVIDIA CUDA with device memory management and kernel launches + !! - **OMP backend** (`src/backend/omp/backend.f90`): CPU parallelism via + !! OpenMP threading and MPI domain decomposition + !! + !! **Usage:** + !! + !! Backends are instantiated at runtime based on compile-time configuration and + !! passed to the solver as a polymorphic pointer (`class(base_backend_t), pointer`). use mpi use m_allocator, only: allocator_t @@ -11,19 +54,37 @@ module m_base_backend implicit none type, abstract :: base_backend_t - !! base_backend class defines all the abstract operations that the - !! solver class requires. + !! Abstract base type defining the computational backend interface. !! - !! For example, transport equation in solver class evaluates the - !! derivatives in x, y, and z directions, and reorders the input - !! fields as required. Then finally, combines all the directional - !! derivatives to obtain the divergence of U*. + !! This type encapsulates all architecture-specific operations required + !! by the solver, enabling transparent execution on different hardware + !! platforms (GPU via CUDA, CPU via OpenMP) without modifying solver code. !! - !! All these high level operations solver class executes are - !! defined here using the abstract interfaces. Every backend - !! implementation extends the present abstact backend class to - !! define the specifics of these operations based on the target - !! architecture. + !! **Design Philosophy:** + !! + !! The solver executes high-level operations (compute transport equation, + !! solve tridiagonal systems, reorder data, etc.) through deferred procedures + !! defined in this abstract interface. Each backend (CUDA, OMP) extends this + !! 
type and implements these procedures using architecture-specific kernels, + !! libraries, and memory management strategies. + !! + !! **Example Workflow:** + !! + !! When computing the transport equation, the solver calls: + !! + !! 1. `transeq_x`, `transeq_y`, `transeq_z` to compute directional derivatives + !! 2. `reorder` to transform data between pencil orientations + !! 3. `vecadd` to combine derivatives into divergence of \(U^*\) + !! + !! Each call dispatches to the appropriate backend implementation at runtime + !! via dynamic polymorphism. + !! + !! **Components:** + !! + !! - `n_halo`: Number of halo layers for distributed compact schemes (fixed at 4) + !! - `mesh`: Pointer to mesh object (grid dimensions, boundary conditions, decomposition) + !! - `allocator`: Memory allocator for field storage (host for OMP, device for CUDA) + !! - `poisson_fft`: FFT-based Poisson solver for pressure correction !> DistD2 implementation is hardcoded for 4 halo layers for all backends integer :: n_halo = 4 @@ -59,11 +120,35 @@ module m_base_backend abstract interface subroutine transeq_ders(self, du, dv, dw, u, v, w, nu, dirps) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to transeq_ders into - !! the correct algorithm. + !! Compute transport equation derivatives for velocity components. + !! + !! This is the core computational kernel for the transport equation, + !! computing the advection-diffusion terms in one coordinate direction: + !! + !! \[ + !! \frac{\partial u_i}{\partial t} = -u \frac{\partial u_i}{\partial x} + !! - v \frac{\partial u_i}{\partial y} + !! - w \frac{\partial u_i}{\partial z} + !! + \nu \nabla^2 u_i + !! \] + !! + !! (this routine evaluates the advection and diffusion terms for the single + !! direction \(x_j\) selected by `dirps`). + !! + !! **Runtime algorithm selection:** + !! + !! 
The exact algorithm used to obtain the derivatives is decided at runtime + !! by the backend implementation. Backend implementations are responsible + !! for directing calls to the appropriate algorithm based on: + !! + !! - Operator configuration in `dirps` (distributed vs local compact schemes) + !! - Domain decomposition (number of processes in current direction) + !! - Boundary conditions (periodic vs non-periodic) + !! + !! The implementation routes to either: + !! + !! - **Distributed algorithm** (`exec_dist_transeq_3fused`): For distributed + !! compact schemes with MPI halo exchange + !! - **Thomas algorithm** (`exec_thom_transeq`): For localized/periodic operators import :: base_backend_t import :: field_t import :: dirps_t @@ -71,20 +156,34 @@ subroutine transeq_ders(self, du, dv, dw, u, v, w, nu, dirps) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Derivative outputs (momentum equation RHS) + class(field_t), intent(in) :: u, v, w !! Velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional derivative operators end subroutine transeq_ders end interface abstract interface subroutine transeq_ders_spec(self, dspec, uvw, spec, nu, dirps, sync) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to transeq_ders into - !! the correct algorithm. + !! Compute transport equation derivatives for passive scalar species. + !! + !! Similar to `transeq_ders` but for passive scalar transport: + !! + !! \[ + !! \frac{\partial \phi}{\partial t} = -u \frac{\partial \phi}{\partial x} + !! - v \frac{\partial \phi}{\partial y} + !! 
- w \frac{\partial \phi}{\partial z} + !! + \nu \nabla^2 \phi + !! \] + !! + !! where \(\phi\) is the scalar concentration; this routine evaluates the + !! terms for the single direction \(x_j\) specified by `dirps`. + !! + !! **Synchronization:** + !! + !! The `sync` flag controls whether to synchronize device-to-host memory + !! transfers (CUDA backend) after computation. Set `.false.` when chaining + !! multiple operations to avoid unnecessary transfers. import :: base_backend_t import :: field_t import :: dirps_t @@ -92,144 +191,275 @@ subroutine transeq_ders_spec(self, dspec, uvw, spec, nu, dirps, sync) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: dspec - class(field_t), intent(in) :: uvw, spec - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - logical, intent(in) :: sync + class(field_t), intent(inout) :: dspec !! Scalar derivative output + class(field_t), intent(in) :: uvw !! Velocity component in current direction + class(field_t), intent(in) :: spec !! Scalar species concentration + real(dp), intent(in) :: nu !! Diffusion coefficient + type(dirps_t), intent(in) :: dirps !! Directional derivative operators + logical, intent(in) :: sync !! Synchronize device transfers (CUDA only) end subroutine transeq_ders_spec end interface abstract interface subroutine tds_solve(self, du, u, tdsops) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to tds_solve to the - !! correct algorithm. + !! Apply a tridiagonal operator to a field (compact finite difference operation). + !! + !! Solves the tridiagonal system arising from compact finite difference + !! schemes: + !! + !! \[ + !! A f' = B f + !! \] + !! + !! where \(A\) is the implicit (tridiagonal) operator, \(B\) is the explicit + !! stencil, and \(f'\) is the derivative (or interpolated value). + !! + !! 
**Backend dispatch:** + !! + !! Routes to the appropriate tridiagonal solver: + !! + !! - **Distributed compact**: Uses `exec_dist_tds_compact` with MPI communication + !! for boundary coupling between processes + !! - **Thomas algorithm**: Uses `exec_thom_tds_compact` for local/periodic systems + !! - **GPU**: Uses batched tridiagonal solvers (cuSPARSE or custom kernels) + !! + !! **Operations supported:** + !! + !! First derivative, second derivative, interpolation, staggered derivatives + !! (configured in `tdsops`). import :: base_backend_t import :: field_t import :: tdsops_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(field_t), intent(inout) :: du !! Output field (derivative or interpolated values) + class(field_t), intent(in) :: u !! Input field + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator (preprocessed) end subroutine tds_solve end interface abstract interface subroutine reorder(self, u_, u, direction) - !! reorder subroutines are straightforward, they rearrange - !! data into our specialist data structure so that regardless - !! of the direction tridiagonal systems are solved efficiently - !! and fast. + !! Reorder field data between pencil decomposition orientations. + !! + !! Transforms field layout from one pencil orientation to another to enable + !! efficient tridiagonal solves in different coordinate directions: + !! + !! - **DIR_X**: X-pencils (data contiguous in X, decomposed in Y-Z) + !! - **DIR_Y**: Y-pencils (data contiguous in Y, decomposed in X-Z) + !! - **DIR_Z**: Z-pencils (data contiguous in Z, decomposed in X-Y) + !! - **DIR_C**: Special compact orientation + !! + !! The `direction` parameter specifies the target orientation using reorder + !! constants (`RDR_X2Y`, `RDR_Y2Z`, etc.). + !! + !! **Backend implementation:** + !! + !! - **CUDA**: GPU transpose kernels with coalesced memory access + !! 
- **OMP**: MPI all-to-all communication with OpenMP threading + !! + !! **Performance note:** This is a bandwidth-intensive operation requiring + !! global data movement (MPI or device memory transfers). import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - integer, intent(in) :: direction + class(field_t), intent(inout) :: u_ !! Output field (reordered) + class(field_t), intent(in) :: u !! Input field + integer, intent(in) :: direction !! Reorder direction (RDR_X2Y, RDR_Y2Z, etc.) end subroutine reorder end interface abstract interface subroutine sum_intox(self, u, u_) - !! sum9into3 subroutine combines all the directional velocity - !! derivatives into the corresponding x directional fields. + !! Sum directional derivatives back into X-oriented fields. + !! + !! Combines derivative contributions computed in different pencil orientations + !! (Y-pencils, Z-pencils) back into the X-pencil orientation: + !! + !! \[ + !! u = u + u' + !! \] + !! + !! This operation accumulates terms when computing composite derivatives + !! like divergence: + !! + !! \[ + !! \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + !! + \frac{\partial v}{\partial y} + !! + \frac{\partial w}{\partial z} + !! \] + !! + !! Each directional derivative is computed in its respective pencil orientation, + !! then summed into X-pencils via `sum_yintox` and `sum_zintox`. + !! + !! **Note:** The input field `u_` must be in a Y or Z pencil orientation; + !! the output `u` is always in X-pencil orientation. import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u - class(field_t), intent(in) :: u_ + class(field_t), intent(inout) :: u !! Accumulated field (X-pencils, updated in-place) + class(field_t), intent(in) :: u_ !! 
Contribution to add (Y or Z pencils) end subroutine sum_intox end interface abstract interface subroutine veccopy(self, dst, src) - !! copy vectors: y = x + !! Copy one field to another: `dst = src`. + !! + !! Performs an element-wise copy of all field data from `src` to `dst`. + !! Both fields must have compatible dimensions and memory layout. + !! + !! **Backend implementation:** + !! + !! - **CUDA**: Device-to-device memory copy (cudaMemcpy) + !! - **OMP**: Host memory copy (array assignment or memcpy) + !! + !! **Note:** This is a deep copy operation; the fields remain independent + !! after the copy. import :: base_backend_t import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: dst - class(field_t), intent(in) :: src + class(field_t), intent(inout) :: dst !! Destination field + class(field_t), intent(in) :: src !! Source field end subroutine veccopy end interface abstract interface subroutine vecadd(self, a, x, b, y) - !! adds two vectors together: y = a*x + b*y + !! Compute linear combination of two fields (AXPBY operation). + !! + !! Performs the vector operation: \(y = a \cdot x + b \cdot y\) + !! + !! This is equivalent to the BLAS AXPBY operation, computing a scaled + !! sum of two vectors. The result is stored in-place in `y`. + !! + !! **Common use cases:** + !! + !! - **Vector addition**: `vecadd(self, 1.0_dp, x, 1.0_dp, y)` \(\rightarrow\) \(y = x + y\) + !! - **Scaled addition**: `vecadd(self, alpha, x, 1.0_dp, y)` \(\rightarrow\) \(y = \alpha x + y\) + !! - **Replacement**: `vecadd(self, 1.0_dp, x, 0.0_dp, y)` \(\rightarrow\) \(y = x\) import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - real(dp), intent(in) :: a - class(field_t), intent(in) :: x - real(dp), intent(in) :: b - class(field_t), intent(inout) :: y + real(dp), intent(in) :: a !! Scaling factor for x + class(field_t), intent(in) :: x !! Input field + real(dp), intent(in) :: b !! 
Scaling factor for y + class(field_t), intent(inout) :: y !! Input/output field (modified in-place) end subroutine vecadd end interface abstract interface subroutine vecmult(self, y, x) - !! pointwise multiplication between two vectors: y(:) = y(:) * x(:) + !! Element-wise (pointwise) multiplication of two fields. + !! + !! Performs the element-wise product: \(y = y \odot x\) + !! + !! Each element of `y` is multiplied by the corresponding element of `x`. + !! The result is stored in-place in `y`. This is also known as the + !! Hadamard product or pointwise multiplication. import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: y - class(field_t), intent(in) :: x + class(field_t), intent(inout) :: y !! Input/output field (modified in-place) + class(field_t), intent(in) :: x !! Multiplier field end subroutine vecmult end interface abstract interface real(dp) function scalar_product(self, x, y) result(s) - !! Calculates the scalar product of two input fields + !! Compute the global scalar (dot) product of two fields. + !! + !! Calculates: \(s = \sum_{i} x_i \cdot y_i\) + !! + !! This computes the inner product (dot product) of two fields across + !! all grid points. For distributed memory systems (MPI), partial sums + !! from each process are accumulated via MPI reduction to produce the + !! global sum. + !! + !! **Note:** The result includes contributions from all MPI ranks. import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: x, y + class(field_t), intent(in) :: x !! First field + class(field_t), intent(in) :: y !! Second field end function scalar_product end interface abstract interface subroutine field_ops(self, f, a) - !! Scales or shifts a field by a + !! Generic interface for in-place field operations with a scalar constant. + !! + !! This abstract interface is implemented by two operations: + !! + !! 
- **field_scale**: Multiply field by constant: \(f = a \cdot f\) + !! - **field_shift**: Add constant to field: \(f = f + a\) + !! + !! Both operations modify the field in-place and are backend-specific + !! (GPU kernels for CUDA, array operations for OMP). import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! Field to operate on (modified in-place) + real(dp), intent(in) :: a !! Scalar constant (scaling factor or shift amount) end subroutine field_ops end interface abstract interface real(dp) function field_reduce(self, f) result(s) - !! Reduces field to a scalar, example: volume integral + !! Reduce a field to a single scalar value via global summation. + !! + !! This abstract interface is currently implemented by: + !! + !! - **field_volume_integral**: Computes the volume integral \(\int f \,dV\) + !! + !! **Algorithm:** + !! + !! 1. **Local summation**: Each MPI process sums its local field values + !! (optionally weighted by cell volumes for volume integration) + !! 2. **Global reduction**: MPI_Allreduce combines partial sums from all + !! processes to produce the global result + !! + !! **Backend implementations:** + !! + !! - **CUDA**: GPU reduction kernel followed by MPI_Allreduce + !! - **OMP**: OpenMP parallel reduction followed by MPI_Allreduce + !! + !! **Requirements:** + !! + !! - Field must have `data_loc` set (cannot be `NULL_LOC`) + !! - Field must be in X-pencil orientation (`dir = DIR_X`) + !! + !! **Use cases:** + !! + !! - Volume integrals for conservation checks + !! - Global norms (L1, L2) for convergence monitoring + !! - Total mass/energy calculations import :: base_backend_t import :: dp import :: field_t implicit none class(base_backend_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! 
Field to reduce end function field_reduce end interface @@ -255,7 +485,7 @@ subroutine field_set_face(self, f, c_start, c_end, face) !! or a global domain boundary based on the location of the subdomain. !! This subroutine allows us to set any of these faces to a value, !! 'c_start' and 'c_end' for faces at opposite sides. - !! 'face' is one of X_FACE, Y_FACE, Z_FACE from common.f90 + !! 'face' is one of `X_FACE`, `Y_FACE`, `Z_FACE` from `common.f90` import :: base_backend_t import :: dp import :: field_t @@ -298,6 +528,39 @@ end subroutine copy_f_to_data abstract interface subroutine alloc_tdsops( & + !! Allocate and initialise a backend-specific tridiagonal operator. + !! + !! This deferred procedure creates a `tdsops_t` object configured for + !! compact finite difference operations (derivatives, interpolation, etc.). + !! The backend implementation allocates the appropriate subtype: + !! + !! - **CUDA backend**: Allocates `cuda_tdsops_t` with device memory pointers + !! for GPU execution + !! - **OMP backend**: Allocates `omp_tdsops_t` with host memory for CPU execution + !! + !! The operator is fully preprocessed and ready for repeated application via + !! `tds_solve`. + !! + !! **Required arguments:** + !! + !! - `n_tds`: System size (number of grid points in the operator direction) + !! - `delta`: Grid spacing + !! - `operation`: Operation type (`'first-deriv'`, `'second-deriv'`, + !! `'interpolate'`, `'stag-deriv'`) + !! - `scheme`: Numerical scheme name (e.g., `'compact6'`, `'compact4'`) + !! - `bc_start`, `bc_end`: Boundary condition flags (`BC_PERIODIC`, + !! `BC_NEUMANN`, `BC_DIRICHLET`) + !! + !! **Optional arguments:** + !! + !! - `stretch`: Stretching coefficients for non-uniform grids + !! - `stretch_correct`: Correction for second derivatives on stretched grids + !! - `n_halo`: Number of halo layers (default from backend) + !! - `from_to`: Staggered grid direction (`'v2p'`, `'p2v'`) + !! 
- `sym`: Field symmetry at Neumann boundaries (`.true.` = symmetric/even, + !! `.false.` = anti-symmetric/odd) + !! - `c_nu`, `nu0_nu`: Hyperviscosity parameters for compact6-hyperviscous + !! second derivatives self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) @@ -322,6 +585,37 @@ end subroutine alloc_tdsops abstract interface subroutine init_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise the backend-specific FFT-based Poisson solver. + !! + !! This deferred procedure creates and configures the Poisson solver object + !! (`self%poisson_fft`) for solving the pressure Poisson equation: + !! \(\nabla^2 \phi = f\) + !! + !! The backend implementation allocates the appropriate solver subtype: + !! + !! - **CUDA backend**: Allocates `cuda_poisson_fft_t` using cuFFT library + !! for GPU-accelerated FFT transforms + !! - **OMP backend**: Allocates `omp_poisson_fft_t` using 2DECOMP&FFT library + !! for CPU FFT transforms with MPI parallelisation + !! + !! The solver requires directional derivative operators (`xdirps`, `ydirps`, + !! `zdirps`) to construct spectral equivalence constants for handling: + !! + !! - Non-uniform grid spacing (stretching) in the Y-direction + !! - Mixed boundary conditions (e.g., periodic in X/Z, Dirichlet in Y) + !! + !! **Arguments:** + !! + !! - `mesh`: Mesh object containing grid dimensions, boundary conditions, + !! and parallel decomposition information + !! - `xdirps`, `ydirps`, `zdirps`: Second-derivative operators in each direction, + !! used to compute spectral equivalence constants for the modified wavenumbers + !! - `lowmem` (optional): Low-memory mode flag. When `.true.`, reduces memory + !! footprint by deallocating temporary arrays after initialisation (CUDA only) + !! + !! **Note:** The Poisson solver is stored in `self%poisson_fft` and accessed + !! 
by the solver during the pressure correction step of the fractional-step + !! method. import :: base_backend_t import :: dirps_t import :: mesh_t diff --git a/src/backend/cuda/allocator.f90 b/src/backend/cuda/allocator.f90 index 1d21de6e3..6cab14145 100644 --- a/src/backend/cuda/allocator.f90 +++ b/src/backend/cuda/allocator.f90 @@ -1,4 +1,18 @@ module m_cuda_allocator + !! GPU memory allocator for CUDA backend. + !! + !! GPU memory (device memory) is physically separate from CPU memory (host). + !! This allocator manages device-side storage, ensuring field data resides + !! in GPU memory for kernel execution. Explicit device allocation avoids + !! expensive implicit host-device transfers that would kill performance. + !! + !! **Design rationale:** + !! + !! - `cuda_field_t` extends `field_t` with device pointers (`p_data_d`, `data_d`) + !! - Maintains both 1D and 3D views of same memory for flexibility + !! - Reference counting prevents premature deallocation + !! - Block-based allocation reduces allocation overhead + !! use m_allocator, only: allocator_t use m_common, only: dp use m_field, only: field_t @@ -7,8 +21,9 @@ module m_cuda_allocator implicit none type, extends(allocator_t) :: cuda_allocator_t + !! GPU memory allocator extending base allocator contains - procedure :: create_block => create_cuda_block + procedure :: create_block => create_cuda_block !! Allocate GPU field block end type cuda_allocator_t interface cuda_allocator_t @@ -16,12 +31,13 @@ module m_cuda_allocator end interface cuda_allocator_t type, extends(field_t) :: cuda_field_t - real(dp), device, pointer, private :: p_data_d(:) - real(dp), device, pointer, contiguous :: data_d(:, :, :) + !! Field residing in GPU device memory + real(dp), device, pointer, private :: p_data_d(:) !! 1D device memory pointer (raw allocation) + real(dp), device, pointer, contiguous :: data_d(:, :, :) !! 
3D device view (for kernel access) contains - procedure :: fill => fill_cuda - procedure :: get_shape => get_shape_cuda - procedure :: set_shape => set_shape_cuda + procedure :: fill => fill_cuda !! Fill with constant value + procedure :: get_shape => get_shape_cuda !! Query 3D dimensions + procedure :: set_shape => set_shape_cuda !! Reshape 3D view end type cuda_field_t interface cuda_field_t @@ -31,9 +47,15 @@ module m_cuda_allocator contains function cuda_field_init(ngrid, next, id) result(f) - integer, intent(in) :: ngrid, id - type(cuda_field_t), pointer, intent(in) :: next - type(cuda_field_t) :: f + !! Initialise GPU field with device memory allocation. + !! + !! Device memory must be explicitly allocated before use. This constructor + !! allocates the 1D device array and sets up metadata for later reshaping + !! to 3D when dimensions are known. + integer, intent(in) :: ngrid !! Total number of grid points + integer, intent(in) :: id !! Unique field identifier + type(cuda_field_t), pointer, intent(in) :: next !! Next field in linked list + type(cuda_field_t) :: f !! Initialised field allocate (f%p_data_d(ngrid)) f%refcount = 0 @@ -42,47 +64,74 @@ function cuda_field_init(ngrid, next, id) result(f) end function cuda_field_init subroutine fill_cuda(self, c) + !! Fill entire field with constant value on GPU. + !! + !! Initialising fields directly on GPU avoids transferring initialisation + !! data from host. Single assignment to device array leverages GPU's + !! memory controllers for efficient broadcast to all elements. implicit none - class(cuda_field_t) :: self - real(dp), intent(in) :: c + class(cuda_field_t) :: self !! Field to fill + real(dp), intent(in) :: c !! Constant value self%p_data_d = c end subroutine fill_cuda function get_shape_cuda(self) result(dims) + !! Query current 3D dimensions of field. + !! + !! Fields are allocated with total size but reshaped dynamically based + !! on decomposition. This query enables algorithms to adapt to actual + !! 
current dimensions without hard-coding sizes. implicit none - class(cuda_field_t) :: self - integer :: dims(3) + class(cuda_field_t) :: self !! Field to query + integer :: dims(3) !! Current dimensions dims = shape(self%data_d) end function get_shape_cuda subroutine set_shape_cuda(self, dims) + !! Reshape 3D view of device memory. + !! + !! Same 1D device allocation is reused for different pencil orientations + !! (X-pencils, Y-pencils, Z-pencils). Reshaping avoids reallocating GPU + !! memory, which is expensive. Fortran pointer remapping is essentially + !! free, just changing metadata not data. implicit none - class(cuda_field_t) :: self - integer, intent(in) :: dims(3) + class(cuda_field_t) :: self !! Field to reshape + integer, intent(in) :: dims(3) !! New dimensions self%data_d(1:dims(1), 1:dims(2), 1:dims(3)) => self%p_data_d end subroutine set_shape_cuda function cuda_allocator_init(dims, sz) result(allocator) - integer, intent(in) :: dims(3), sz - type(cuda_allocator_t) :: allocator + !! Initialise CUDA allocator with grid dimensions. + !! + !! Base allocator handles dimension calculations and block management + !! logic. CUDA allocator only needs to override block creation to use + !! device memory, avoiding code duplication. + integer, intent(in) :: dims(3) !! Grid dimensions + integer, intent(in) :: sz !! Pencil size (SZ) + type(cuda_allocator_t) :: allocator !! Initialised allocator allocator%allocator_t = allocator_t(dims, sz) end function cuda_allocator_init function create_cuda_block(self, next) result(ptr) - class(cuda_allocator_t), intent(inout) :: self - type(cuda_field_t), pointer, intent(in) :: next - type(cuda_field_t), pointer :: newblock - class(field_t), pointer :: ptr + !! Create new field block in GPU memory. + !! + !! Central allocation point ensures consistent initialisation and enables + !! tracking (via IDs) for debugging memory issues. Returning base class + !! pointer maintains polymorphism for generic algorithm code. 
+ class(cuda_allocator_t), intent(inout) :: self !! Allocator instance + type(cuda_field_t), pointer, intent(in) :: next !! Next in linked list + type(cuda_field_t), pointer :: newblock !! Newly allocated block + class(field_t), pointer :: ptr !! Polymorphic return pointer allocate (newblock) self%next_id = self%next_id + 1 newblock = cuda_field_t(self%ngrid, next, id=self%next_id) diff --git a/src/backend/cuda/backend.f90 b/src/backend/cuda/backend.f90 index 8efa5c041..94e09a59f 100644 --- a/src/backend/cuda/backend.f90 +++ b/src/backend/cuda/backend.f90 @@ -1,4 +1,14 @@ module m_cuda_backend + !! CUDA backend implementing GPU-accelerated solver operations. + !! + !! Extends `base_backend_t` with GPU kernel launches and device memory + !! management. Transport equations, tridiagonal solves, FFT operations, + !! and field manipulations execute on GPU. + !! + !! **MPI Communication:** Halo exchange passes device pointers directly to + !! MPI calls. With GPU-aware MPI implementations (OpenMPI with CUDA support, + !! MVAPICH2-GDR), data transfers directly between GPU memories. Without + !! GPU-aware MPI, the implementation stages through host memory automatically. use iso_fortran_env, only: stderr => error_unit use cudafor use mpi @@ -35,6 +45,10 @@ module m_cuda_backend private :: transeq_halo_exchange, transeq_dist_component type, extends(base_backend_t) :: cuda_backend_t + !! GPU backend with device communication buffers and kernel configurations. + !! + !! Extends [[m_base_backend(module):base_backend_t(type)]] with CUDA-specific + !! implementations and device memory buffers for halo exchange. !character(len=*), parameter :: name = 'cuda' real(dp), device, allocatable, dimension(:, :, :) :: & u_recv_s_dev, u_recv_e_dev, u_send_s_dev, u_send_e_dev, & @@ -78,11 +92,16 @@ module m_cuda_backend contains function init(mesh, allocator) result(backend) + !! Initialise CUDA backend with kernel configurations and communication buffers. + !! + !! 
Sets up CUDA thread blocks ([[m_cuda_common(module):SZ(variable)]] threads per + !! warp-aligned block) and allocates device buffers for halo exchange. Buffer size + !! accommodates largest pencil direction to support all three orientations. implicit none - type(mesh_t), target, intent(inout) :: mesh - class(allocator_t), target, intent(inout) :: allocator - type(cuda_backend_t) :: backend + type(mesh_t), target, intent(inout) :: mesh !! Computational mesh + class(allocator_t), target, intent(inout) :: allocator !! GPU memory allocator + type(cuda_backend_t) :: backend !! Initialised CUDA backend type(cuda_poisson_fft_t) :: cuda_poisson_fft integer :: n_groups @@ -140,19 +159,25 @@ subroutine alloc_cuda_tdsops( & self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) + !! Allocate and initialise CUDA tridiagonal operators. + !! + !! Implements [[m_base_backend(module):alloc_tdsops(interface)]] for GPU. + !! Allocates [[m_cuda_tdsops(module):cuda_tdsops_t(type)]] with device-resident + !! coefficient arrays. implicit none class(cuda_backend_t) :: self - class(tdsops_t), allocatable, intent(inout) :: tdsops - integer, intent(in) :: n_tds - real(dp), intent(in) :: delta - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end - real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) - integer, optional, intent(in) :: n_halo - character(*), optional, intent(in) :: from_to - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(tdsops_t), allocatable, intent(inout) :: tdsops !! Output: allocated CUDA operators + integer, intent(in) :: n_tds !! Number of tridiagonal systems + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation !! Operation type (derivative/interpolation) + character(*), intent(in) :: scheme !! Scheme name + integer, intent(in) :: bc_start, bc_end !! 
Boundary condition flags + real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) !! Grid stretching factors + integer, optional, intent(in) :: n_halo !! Halo width for distributed schemes + character(*), optional, intent(in) :: from_to !! Interpolation direction + logical, optional, intent(in) :: sym !! Symmetry flag + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Viscosity parameters allocate (cuda_tdsops_t :: tdsops) @@ -166,13 +191,18 @@ subroutine alloc_cuda_tdsops( & end subroutine alloc_cuda_tdsops subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation in x-direction using CUDA. + !! + !! Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Routes to distributed or Thomas algorithm based on + !! [[m_tdsops(module):dirps_t(type)]] configuration. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators call self%transeq_cuda_dist(du, dv, dw, u, v, w, nu, dirps, & self%xblocks, self%xthreads) @@ -180,13 +210,17 @@ subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_x_cuda subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation in y-direction using CUDA. + !! + !! Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Arguments reordered (v, u, w) to match y-pencil orientation. 
implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators ! u, v, w is reordered so that we pass v, u, w call self%transeq_cuda_dist(dv, du, dw, v, u, w, nu, dirps, & @@ -195,13 +229,17 @@ subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_y_cuda subroutine transeq_z_cuda(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation in z-direction using CUDA. + !! + !! Implements [[m_base_backend(module):transeq_ders(interface)]]. + !! Arguments reordered (w, u, v) to match z-pencil orientation. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators ! u, v, w is reordered so that we pass w, u, v call self%transeq_cuda_dist(dw, du, dv, w, u, v, nu, dirps, & @@ -212,16 +250,19 @@ end subroutine transeq_z_cuda subroutine transeq_species_cuda(self, dspec, uvw, spec, nu, dirps, sync) !! Compute the convection and diffusion for the given field !! in the given direction. - !! Halo exchange for the given field is necessary - !! When sync is true, halo exchange of momentum is necessary + !! + !! Implements [[m_base_backend(module):transeq_ders_spec(interface)]]. + !! Halo exchange for the given field is necessary. + !! 
When sync is true, halo exchange of momentum is necessary. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: dspec - class(field_t), intent(in) :: uvw, spec - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - logical, intent(in) :: sync + class(field_t), intent(inout) :: dspec !! Output: RHS contribution for species + class(field_t), intent(in) :: uvw !! Input: velocity component in transport direction + class(field_t), intent(in) :: spec !! Input: species concentration field + real(dp), intent(in) :: nu !! Diffusivity (kinematic viscosity) + type(dirps_t), intent(in) :: dirps !! Directional operators + logical, intent(in) :: sync !! If true, also exchange momentum halos integer :: n_groups type(cuda_tdsops_t), pointer :: der1st, der1st_sym, der2nd, der2nd_sym @@ -282,14 +323,19 @@ end subroutine transeq_species_cuda subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, nu, dirps, & blocks, threads) + !! Compute transport equation using distributed compact scheme on GPU. + !! + !! Handles halo exchange with [[m_cuda_sendrecv(module):sendrecv_3fields(interface)]], + !! launches [[m_cuda_exec_dist(module):exec_dist_transeq_3fused(interface)]] kernel, + !! and gathers derivatives. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps - type(dim3), intent(in) :: blocks, threads + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Directional operators + type(dim3), intent(in) :: blocks, threads !! 
CUDA kernel configuration real(dp), device, pointer, dimension(:, :, :) :: u_dev, v_dev, w_dev, & du_dev, dv_dev, dw_dev @@ -342,9 +388,13 @@ subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, nu, dirps, & end subroutine transeq_cuda_dist subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dir) + !! Exchange velocity field halos using MPI with device pointers. + !! + !! Packs boundary data into communication buffers and exchanges with + !! neighbouring ranks. Uses sendrecv_3fields for batched communication. class(cuda_backend_t) :: self - real(dp), device, dimension(:, :, :), intent(in) :: u_dev, v_dev, w_dev - integer, intent(in) :: dir + real(dp), device, dimension(:, :, :), intent(in) :: u_dev, v_dev, w_dev !! Velocity components on device + integer, intent(in) :: dir !! Direction for halo exchange integer :: n, nproc_dir, pprev, pnext integer :: n_groups @@ -376,20 +426,21 @@ subroutine transeq_dist_component(self, rhs_du_dev, u_dev, conv_dev, nu, & conv_recv_s_dev, conv_recv_e_dev, & tdsops_du, tdsops_dud, tdsops_d2u, & dir, blocks, threads) - !! Computes RHS_x^u following: + !! Compute transport equation RHS component using distributed compact schemes. !! - !! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2 + !! Computes: $\text{rhs} = -\frac{1}{2}(\text{conv} \frac{\partial u}{\partial x} + \frac{\partial (u \cdot \text{conv})}{\partial x}) + \nu \frac{\partial^2 u}{\partial x^2}$ class(cuda_backend_t) :: self - !> The result field, it is also used as temporary storage - real(dp), device, dimension(:, :, :), intent(out) :: rhs_du_dev - real(dp), device, dimension(:, :, :), intent(in) :: u_dev, conv_dev - real(dp), intent(in) :: nu + real(dp), device, dimension(:, :, :), intent(out) :: rhs_du_dev !! Output: transport equation RHS + real(dp), device, dimension(:, :, :), intent(in) :: u_dev !! Input: velocity component field + real(dp), device, dimension(:, :, :), intent(in) :: conv_dev !! 
Input: convecting velocity field + real(dp), intent(in) :: nu !! Kinematic viscosity real(dp), device, dimension(:, :, :), intent(in) :: & - u_recv_s_dev, u_recv_e_dev, & - conv_recv_s_dev, conv_recv_e_dev - class(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - integer, intent(in) :: dir - type(dim3), intent(in) :: blocks, threads + u_recv_s_dev, u_recv_e_dev !! Halo data for u from neighbours + real(dp), device, dimension(:, :, :), intent(in) :: & + conv_recv_s_dev, conv_recv_e_dev !! Halo data for conv from neighbours + class(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for derivatives + integer, intent(in) :: dir !! Direction index + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration class(field_t), pointer :: dud, d2u @@ -425,25 +476,31 @@ subroutine transeq_dist_component(self, rhs_du_dev, u_dev, conv_dev, nu, & end subroutine transeq_dist_component subroutine transeq_cuda_thom(self, du, dv, dw, u, v, w, dirps) - !! Thomas algorithm implementation. So much more easier than the - !! distributed algorithm. It is intended to work only on a single rank - !! so there is no MPI communication. + !! Compute transport equation using Thomas algorithm. + !! + !! Simpler than distributed scheme - no MPI communication, uses + !! [[m_cuda_exec_thom(module):exec_thom_tds_compact(interface)]] kernel. + !! Intended for single-rank execution only. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - type(dirps_t), intent(in) :: dirps + class(field_t), intent(inout) :: du, dv, dw !! Output: RHS contributions + class(field_t), intent(in) :: u, v, w !! Input: velocity components + type(dirps_t), intent(in) :: dirps !! Directional operators end subroutine transeq_cuda_thom subroutine tds_solve_cuda(self, du, u, tdsops) + !! Solve tridiagonal systems using CUDA kernels. + !! + !! 
Implements [[m_base_backend(module):tds_solve(interface)]]. + !! Dispatches to appropriate CUDA kernel based on pencil direction. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(field_t), intent(inout) :: du !! Output: solution + class(field_t), intent(in) :: u !! Input: RHS + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operators type(dim3) :: blocks, threads @@ -464,13 +521,17 @@ subroutine tds_solve_cuda(self, du, u, tdsops) end subroutine tds_solve_cuda subroutine tds_solve_dist(self, du, u, tdsops, blocks, threads) + !! Solve distributed tridiagonal systems using CUDA kernels and MPI. + !! + !! Performs forward sweep, exchanges boundary data via MPI (using device + !! pointers for potential GPU-aware MPI benefit), then backward substitution. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops - type(dim3), intent(in) :: blocks, threads + class(field_t), intent(inout) :: du !! Output: solution + class(field_t), intent(in) :: u !! Input: RHS + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operators + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration real(dp), device, pointer, dimension(:, :, :) :: du_dev, u_dev @@ -512,12 +573,16 @@ subroutine tds_solve_dist(self, du, u, tdsops, blocks, threads) end subroutine tds_solve_dist subroutine reorder_cuda(self, u_o, u_i, direction) + !! Reorder field data between pencil orientations using CUDA kernels. + !! + !! Implements [[m_base_backend(module):reorder(interface)]]. + !! Calls appropriate [[m_cuda_kernels_reorder(module)]] kernel based on direction. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_o - class(field_t), intent(in) :: u_i - integer, intent(in) :: direction + class(field_t), intent(inout) :: u_o !! 
Output: reordered field + class(field_t), intent(in) :: u_i !! Input: source field + integer, intent(in) :: direction !! Reordering direction (RDR_X2Y, RDR_Y2Z, etc) real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d, u_temp_d class(field_t), pointer :: u_temp @@ -632,9 +697,10 @@ end subroutine reorder_cuda subroutine sum_yintox_cuda(self, u, u_y) + !! Sum y-pencil field into x-pencil using CUDA kernel. implicit none class(cuda_backend_t) :: self + class(field_t), intent(inout) :: u !! Output: x-pencil result + class(field_t), intent(in) :: u_y !! Input: y-pencil field to sum - class(field_t), intent(inout) :: u - class(field_t), intent(in) :: u_y @@ -654,9 +722,10 @@ end subroutine sum_yintox_cuda subroutine sum_zintox_cuda(self, u, u_z) + !! Sum z-pencil field into x-pencil using CUDA kernel. implicit none class(cuda_backend_t) :: self + class(field_t), intent(inout) :: u !! Output: x-pencil result + class(field_t), intent(in) :: u_z !! Input: z-pencil field to sum - class(field_t), intent(inout) :: u - class(field_t), intent(in) :: u_z @@ -676,11 +747,15 @@ end subroutine sum_zintox_cuda subroutine veccopy_cuda(self, dst, src) + !! Copy field data using CUDA kernel. + !! + !! Implements [[m_base_backend(module):veccopy(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):buffer_copy(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: dst - class(field_t), intent(in) :: src + class(field_t), intent(inout) :: dst !! Output: destination field + class(field_t), intent(in) :: src !! Input: source field real(dp), device, pointer, dimension(:, :, :) :: dst_d, src_d type(dim3) :: blocks, threads @@ -697,10 +772,14 @@ end subroutine veccopy_cuda subroutine vecadd_cuda(self, a, x, b, y) + !!
Compute linear combination $y = ax + by$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):vecadd(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):axpby(interface)]] kernel. implicit none class(cuda_backend_t) :: self - real(dp), intent(in) :: a + real(dp), intent(in) :: a !! Scalar coefficient for x class(field_t), intent(in) :: x real(dp), intent(in) :: b class(field_t), intent(inout) :: y @@ -720,10 +799,13 @@ end subroutine vecadd_cuda subroutine vecmult_cuda(self, y, x) - !! [[m_base_backend(module):vecmult(interface)]] + !! Compute element-wise product $y = x \cdot y$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):vecmult(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):pwmul(interface)]] kernel. implicit none class(cuda_backend_t) :: self + class(field_t), intent(inout) :: y !! Input/Output: multiplied in-place + class(field_t), intent(in) :: x !! Input: multiplier - class(field_t), intent(inout) :: y - class(field_t), intent(in) :: x real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d @@ -741,11 +825,14 @@ end subroutine vecmult_cuda real(dp) function scalar_product_cuda(self, x, y) result(s) - !! [[m_base_backend(module):scalar_product(interface)]] + !! Compute global scalar product $\langle x, y \rangle$ using CUDA kernel and MPI reduction. + !! + !! Implements [[m_base_backend(module):scalar_product(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):scalar_product(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: x, y + class(field_t), intent(in) :: x, y !! Input fields real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d real(dp), device, allocatable :: sum_d @@ -791,12 +878,13 @@ end function scalar_product_cuda subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) + !!
Copy boundary data into MPI send buffers using CUDA kernel. implicit none real(dp), device, dimension(:, :, :), intent(out) :: u_send_s_dev, & - u_send_e_dev - real(dp), device, dimension(:, :, :), intent(in) :: u_dev - integer, intent(in) :: n + u_send_e_dev !! Send buffers + real(dp), device, dimension(:, :, :), intent(in) :: u_dev !! Source field + integer, intent(in) :: n !! Grid dimension type(dim3) :: blocks, threads integer :: n_halo = 4 @@ -809,13 +897,16 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) end subroutine copy_into_buffers subroutine field_max_mean_cuda(self, max_val, mean_val, f, enforced_data_loc) - !! [[m_base_backend(module):field_max_mean(interface)]] + !! Compute field maximum and mean using CUDA kernel and MPI reductions. + !! + !! Implements [[m_base_backend(module):field_max_mean(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):field_max_sum(interface)]] kernel. implicit none class(cuda_backend_t) :: self - real(dp), intent(out) :: max_val, mean_val - class(field_t), intent(in) :: f - integer, optional, intent(in) :: enforced_data_loc + real(dp), intent(out) :: max_val, mean_val !! Output: global maximum and mean + class(field_t), intent(in) :: f !! Input field + integer, optional, intent(in) :: enforced_data_loc !! Override field data location real(dp), device, pointer, dimension(:, :, :) :: f_d real(dp), device, allocatable :: max_d, sum_d @@ -871,11 +962,15 @@ subroutine field_max_mean_cuda(self, max_val, mean_val, f, enforced_data_loc) end subroutine field_max_mean_cuda subroutine field_scale_cuda(self, f, a) + !! Scale field by constant $f = a \cdot f$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):field_ops(interface)]] (field_scale binding). + !! Uses [[m_cuda_kernels_fieldops(module):field_scale(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! 
Field to scale in-place + real(dp), intent(in) :: a !! Scaling factor real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -891,11 +986,15 @@ subroutine field_scale_cuda(self, f, a) end subroutine field_scale_cuda subroutine field_shift_cuda(self, f, a) + !! Shift field by constant $f = f + a$ using CUDA kernel. + !! + !! Implements [[m_base_backend(module):field_ops(interface)]] (field_shift binding). + !! Uses [[m_cuda_kernels_fieldops(module):field_shift(interface)]] kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(field_t), intent(in) :: f !! Field to shift in-place + real(dp), intent(in) :: a !! Shift amount real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -911,13 +1010,13 @@ subroutine field_shift_cuda(self, f, a) end subroutine field_shift_cuda subroutine field_set_face_cuda(self, f, c_start, c_end, face) - !! [[m_base_backend(module):field_set_face(subroutine)]] + !! Set boundary face values using CUDA kernel. implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: f - real(dp), intent(in) :: c_start, c_end - integer, intent(in) :: face + class(field_t), intent(inout) :: f !! Field to modify + real(dp), intent(in) :: c_start, c_end !! Values for start and end faces + integer, intent(in) :: face !! Face identifier (X_FACE, Y_FACE, Z_FACE) real(dp), device, pointer, dimension(:, :, :) :: f_d type(dim3) :: blocks, threads @@ -952,11 +1051,14 @@ subroutine field_set_face_cuda(self, f, c_start, c_end, face) end subroutine field_set_face_cuda real(dp) function field_volume_integral_cuda(self, f) result(s) - !! volume integral of a field + !! Compute volume integral using CUDA kernel and MPI reduction. + !! + !! Implements [[m_base_backend(module):field_reduce(interface)]]. + !! Uses [[m_cuda_kernels_fieldops(module):volume_integral(interface)]] kernel. 
implicit none class(cuda_backend_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! Input field real(dp), device, pointer, dimension(:, :, :) :: f_d real(dp), device, allocatable :: integral_d @@ -991,28 +1093,34 @@ real(dp) function field_volume_integral_cuda(self, f) result(s) end function field_volume_integral_cuda subroutine copy_data_to_f_cuda(self, f, data) + !! Copy host array to device field. class(cuda_backend_t), intent(inout) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(inout) :: data + class(field_t), intent(inout) :: f !! Target device field + real(dp), dimension(:, :, :), intent(inout) :: data !! Source host array select type (f); type is (cuda_field_t); f%data_d = data; end select end subroutine copy_data_to_f_cuda subroutine copy_f_to_data_cuda(self, data, f) + !! Copy device field to host array. class(cuda_backend_t), intent(inout) :: self - real(dp), dimension(:, :, :), intent(out) :: data - class(field_t), intent(in) :: f + real(dp), dimension(:, :, :), intent(out) :: data !! Target host array + class(field_t), intent(in) :: f !! Source device field select type (f); type is (cuda_field_t); data = f%data_d; end select end subroutine copy_f_to_data_cuda subroutine init_cuda_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise CUDA FFT Poisson solver. + !! + !! Implements [[m_base_backend(module):init_poisson_fft(interface)]]. + !! Allocates [[m_cuda_poisson_fft(module):cuda_poisson_fft_t(type)]] instance. implicit none class(cuda_backend_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + type(mesh_t), intent(in) :: mesh !! Computational mesh + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + logical, optional, intent(in) :: lowmem !! 
Low memory mode flag allocate (cuda_poisson_fft_t :: self%poisson_fft) @@ -1024,8 +1132,9 @@ subroutine init_cuda_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) end subroutine init_cuda_poisson_fft subroutine resolve_field_t(u_dev, u) - real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_dev - class(field_t), intent(in) :: u + !! Helper to extract device pointer from cuda_field_t. + real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_dev !! Device pointer + class(field_t), intent(in) :: u !! Field object select type (u) type is (cuda_field_t) diff --git a/src/backend/cuda/common.f90 b/src/backend/cuda/common.f90 index 6165c38a7..d67dc477a 100644 --- a/src/backend/cuda/common.f90 +++ b/src/backend/cuda/common.f90 @@ -1,6 +1,16 @@ module m_cuda_common + !! Common constants for CUDA backend. + !! + !! CUDA GPUs execute threads in groups of 32 called warps. Setting the + !! pencil size to 32 ensures coalesced memory access patterns, where all + !! threads in a warp access consecutive memory locations simultaneously. + !! This is critical for GPU memory bandwidth efficiency. + !! + !! **Performance impact:** Matching the hardware warp size eliminates + !! divergence and maximises memory throughput, typically improving + !! performance by 2-3x compared to non-coalesced access. implicit none - integer, parameter :: SZ = 32 + integer, parameter :: SZ = 32 !! Pencil size matching GPU warp width end module m_cuda_common diff --git a/src/backend/cuda/exec_dist.f90 b/src/backend/cuda/exec_dist.f90 index 5a71bcc76..08048481c 100644 --- a/src/backend/cuda/exec_dist.f90 +++ b/src/backend/cuda/exec_dist.f90 @@ -1,4 +1,9 @@ module m_cuda_exec_dist + !! Distributed compact scheme execution on GPU. + !! + !! Orchestrates CUDA kernel launches and MPI halo exchange for distributed + !! compact finite difference schemes. Handles both generic derivative operations + !! and fused transport equation computation. 
use cudafor use mpi @@ -17,21 +22,25 @@ subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & tdsops, nproc, pprev, pnext, blocks, threads & ) + !! Execute distributed compact scheme derivative $du = d(u)$ on GPU. + !! + !! Calls distributed kernel, exchanges halo data for $2 \times 2$ boundary + !! systems, then applies substitution kernel. implicit none ! du = d(u) - real(dp), device, dimension(:, :, :), intent(out) :: du - real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - ! The ones below are intent(out) just so that we can write data in them, - ! not because we actually need the data they store later where this - ! subroutine is called. We absolutely don't care the data they pass back + real(dp), device, dimension(:, :, :), intent(out) :: du !! Output: derivative + real(dp), device, dimension(:, :, :), intent(in) :: u !! Input: field with local data + real(dp), device, dimension(:, :, :), intent(in) :: u_recv_s, u_recv_e !! Halo data from neighbours + + ! Temporary buffers for halo exchange (overwritten during computation) real(dp), device, dimension(:, :, :), intent(out) :: & du_send_s, du_send_e, du_recv_s, du_recv_e - type(cuda_tdsops_t), intent(in) :: tdsops - integer, intent(in) :: nproc, pprev, pnext - type(dim3), intent(in) :: blocks, threads + type(cuda_tdsops_t), intent(in) :: tdsops !! Tridiagonal operators + integer, intent(in) :: nproc, pprev, pnext !! MPI ranks (total, previous, next) + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration integer :: n_data @@ -64,27 +76,28 @@ subroutine exec_dist_transeq_3fused( & tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, & blocks, threads & ) + !! Execute fused transport equation computation on GPU with distributed compact scheme. + !! + !! Computes $r\_du = -\frac{1}{2}(v \frac{\partial u}{\partial x} + \frac{\partial (uv)}{\partial x}) + \nu \frac{\partial^2 u}{\partial x^2}$ + !!
Launches distributed kernel for three operators (du, dud, d2u), exchanges halo data for all + !! boundary systems in one batch, then applies substitution kernel. implicit none - ! r_du = -1/2*(v*d1(u) + d1(u*v)) + nu*d2(u) - !> The result array, it is also used as temporary storage - real(dp), device, dimension(:, :, :), intent(out) :: r_du - real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - real(dp), device, dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e + real(dp), device, dimension(:, :, :), intent(out) :: r_du !! Output: transport equation RHS + real(dp), device, dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Field u with halos + real(dp), device, dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e !! Field v with halos - ! The ones below are intent(out) just so that we can write data in them, - ! not because we actually need the data they store later where this - ! subroutine is called. We absolutely don't care the data they pass back + ! Temporary storage for derivatives and halo exchange buffers real(dp), device, dimension(:, :, :), intent(out) :: dud, d2u real(dp), device, dimension(:, :, :), intent(out) :: & du_send_s, du_send_e, du_recv_s, du_recv_e, & dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e - type(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - real(dp), intent(in) :: nu - integer, intent(in) :: nproc, pprev, pnext - type(dim3), intent(in) :: blocks, threads + type(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for each derivative + real(dp), intent(in) :: nu !! Kinematic viscosity + integer, intent(in) :: nproc, pprev, pnext !! MPI ranks + type(dim3), intent(in) :: blocks, threads !! 
CUDA kernel configuration integer :: n_data diff --git a/src/backend/cuda/exec_thom.f90 b/src/backend/cuda/exec_thom.f90 index 50e757579..3834a20ac 100644 --- a/src/backend/cuda/exec_thom.f90 +++ b/src/backend/cuda/exec_thom.f90 @@ -1,4 +1,8 @@ module m_cuda_exec_thom + !! Thomas algorithm execution on GPU for local tridiagonal systems. + !! + !! Dispatches to periodic or non-periodic Thomas kernels based on + !! boundary conditions. No MPI communication required. use cudafor use m_common, only: dp @@ -10,12 +14,15 @@ module m_cuda_exec_thom contains subroutine exec_thom_tds_compact(du, u, tdsops, blocks, threads) + !! Execute Thomas algorithm for compact scheme derivative $du = d(u)$ on GPU. + !! + !! Selects periodic or non-periodic kernel variant based on operator configuration. implicit none - real(dp), device, dimension(:, :, :), intent(out) :: du - real(dp), device, dimension(:, :, :), intent(in) :: u - type(cuda_tdsops_t), intent(in) :: tdsops - type(dim3), intent(in) :: blocks, threads + real(dp), device, dimension(:, :, :), intent(out) :: du !! Output: derivative + real(dp), device, dimension(:, :, :), intent(in) :: u !! Input: field + type(cuda_tdsops_t), intent(in) :: tdsops !! Tridiagonal operators + type(dim3), intent(in) :: blocks, threads !! CUDA kernel configuration if (tdsops%periodic) then call der_univ_thom_per<<>>( & !& diff --git a/src/backend/cuda/kernels/distributed.f90 b/src/backend/cuda/kernels/distributed.f90 index 8e7a1ba94..4cf97fc19 100644 --- a/src/backend/cuda/kernels/distributed.f90 +++ b/src/backend/cuda/kernels/distributed.f90 @@ -1,4 +1,9 @@ module m_cuda_kernels_dist + !! CUDA kernels for distributed compact finite difference schemes. + !! + !! GPU kernels implementing forward and backward sweeps for compact schemes + !! across MPI domain boundaries. Handles stencil application using halo data, + !! forward elimination, and backward substitution for distributed tridiagonal systems. 
use cudafor use m_common, only: dp @@ -11,16 +16,20 @@ attributes(global) subroutine der_univ_dist( & du, send_u_s, send_u_e, u, u_s, u_e, & n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, ffr, fbc, faf & ) + !! CUDA kernel for distributed compact scheme forward sweep and boundary setup. + !! + !! Applies compact stencils using local data (u) and halo data (u_s, u_e) from + !! neighbours. Performs forward elimination and prepares boundary data for MPI exchange. implicit none - ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_s, & - send_u_e - real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e - integer, value, intent(in) :: n_tds, n_rhs - real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: derivatives with forward elimination + real(dp), device, intent(out), dimension(:, :, :) :: send_u_s, send_u_e !! Boundary data for MPI exchange + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: local field data + real(dp), device, intent(in), dimension(:, :, :) :: u_s, u_e !! Halo data from start/end neighbours + integer, value, intent(in) :: n_tds, n_rhs !! Grid and RHS dimensions + real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Boundary stencil coefficients + real(dp), device, intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients + real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf !! Forward elimination factors ! Local variables integer :: i, j, b, k, lj @@ -148,17 +157,22 @@ end subroutine der_univ_dist attributes(global) subroutine der_univ_subs(du, recv_u_s, recv_u_e, & n, dist_sa, dist_sc, strch) + !! Backward substitution for distributed compact scheme. + !! + !! Completes the tridiagonal solve using boundary solutions received from + !! neighbouring MPI ranks. 
Applies Sherman-Morrison-like correction for + !! distributed system using Toeplitz matrix symmetry properties. implicit none ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e - real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc, strch - integer, value, intent(in) :: n + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Final derivative solution + real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e !! Boundary solutions from neighbours + real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc, strch !! Distributed coefficients and stretching + integer, value, intent(in) :: n !! Number of local grid points ! Local variables - integer :: i, j, b - real(dp) :: ur, bl, recp, du_s, du_e + integer :: i, j, b !! Thread, loop, and block indices + real(dp) :: ur, bl, recp, du_s, du_e !! Upper-right, bottom-left, reciprocal, boundary solutions i = threadIdx%x b = blockIdx%x @@ -201,39 +215,44 @@ attributes(global) subroutine transeq_3fused_dist( & dud_coeffs_s, dud_coeffs_e, dud_coeffs, dud_fw, dud_bw, dud_af, & d2u_coeffs_s, d2u_coeffs_e, d2u_coeffs, d2u_fw, d2u_bw, d2u_af & ) + !! Distributed forward sweep for 3 fused transport equation derivatives. + !! + !! Computes du, dud (convective), and d2u simultaneously using independent + !! compact stencils. Performs forward elimination and prepares boundary data + !! for MPI exchange. Optimised for transport equation with convective terms. implicit none ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, dud, d2u + real(dp), device, intent(out), dimension(:, :, :) :: du, dud, d2u !! Output: Three derivative fields real(dp), device, intent(out), dimension(:, :, :) :: & - send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e + send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e !! 
Boundary data for MPI exchange real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e, & - v, v_s, v_e - integer, value, intent(in) :: n_tds, n_rhs + v, v_s, v_e !! Input fields and halos + integer, value, intent(in) :: n_tds, n_rhs !! Grid dimensions real(dp), device, intent(in) :: du_coeffs_s(:, :), du_coeffs_e(:, :), & - du_coeffs(:) - real(dp), device, intent(in) :: du_fw(:), du_bw(:), du_af(:) + du_coeffs(:) !! du stencil coefficients + real(dp), device, intent(in) :: du_fw(:), du_bw(:), du_af(:) !! du forward/backward/alpha factors real(dp), device, intent(in) :: dud_coeffs_s(:, :), dud_coeffs_e(:, :), & - dud_coeffs(:) - real(dp), device, intent(in) :: dud_fw(:), dud_bw(:), dud_af(:) + dud_coeffs(:) !! dud stencil coefficients + real(dp), device, intent(in) :: dud_fw(:), dud_bw(:), dud_af(:) !! dud forward/backward/alpha factors real(dp), device, intent(in) :: d2u_coeffs_s(:, :), d2u_coeffs_e(:, :), & - d2u_coeffs(:) - real(dp), device, intent(in) :: d2u_fw(:), d2u_bw(:), d2u_af(:) + d2u_coeffs(:) !! d2u stencil coefficients + real(dp), device, intent(in) :: d2u_fw(:), d2u_bw(:), d2u_af(:) !! d2u forward/backward/alpha factors ! Local variables - integer :: i, j, b + integer :: i, j, b !! Thread, loop, and block indices real(dp) :: du_c_m4, du_c_m3, du_c_m2, du_c_m1, du_c_j, & du_c_p1, du_c_p2, du_c_p3, du_c_p4, & - du_alpha, du_last_r + du_alpha, du_last_r !! du stencil coefficients and factors real(dp) :: dud_c_m4, dud_c_m3, dud_c_m2, dud_c_m1, dud_c_j, & dud_c_p1, dud_c_p2, dud_c_p3, dud_c_p4, & - dud_alpha, dud_last_r + dud_alpha, dud_last_r !! dud stencil coefficients and factors real(dp) :: d2u_c_m4, d2u_c_m3, d2u_c_m2, d2u_c_m1, d2u_c_j, & d2u_c_p1, d2u_c_p2, d2u_c_p3, d2u_c_p4, & - d2u_alpha, d2u_last_r - real(dp) :: temp_du, temp_dud, temp_d2u - real(dp) :: u_m4, u_m3, u_m2, u_m1, u_j, u_p1, u_p2, u_p3, u_p4 + d2u_alpha, d2u_last_r !! d2u stencil coefficients and factors + real(dp) :: temp_du, temp_dud, temp_d2u !! 
Temporary derivative values + real(dp) :: u_m4, u_m3, u_m2, u_m1, u_j, u_p1, u_p2, u_p3, u_p4 !! Reused field values real(dp) :: v_m4, v_m3, v_m2, v_m1, v_j, v_p1, v_p2, v_p3, v_p4 real(dp) :: old_du, old_dud, old_d2u @@ -593,26 +612,31 @@ attributes(global) subroutine transeq_3fused_subs( & n, nu, du_sa, du_sc, du_strch, dud_sa, dud_sc, dud_strch, & d2u_sa, d2u_sc, d2u_strch, d2u_strch_cor & ) + !! Backward substitution for 3 fused transport equation derivatives. + !! + !! Completes distributed tridiagonal solves for du, dud, d2u using boundary + !! solutions from neighbours. Combines results to form RHS of transport equation: + !! r_du = -conv*dud + nu*d2u. Applies Sherman-Morrison corrections for all three fields. implicit none ! Arguments !> The result array, it stores 'du' first then its overwritten - real(dp), device, intent(inout), dimension(:, :, :) :: r_du - real(dp), device, intent(in), dimension(:, :, :) :: conv, dud, d2u + real(dp), device, intent(inout), dimension(:, :, :) :: r_du !! In/out: Stores du then overwritten with RHS + real(dp), device, intent(in), dimension(:, :, :) :: conv, dud, d2u !! Input: Convection velocity and derivatives real(dp), device, intent(in), dimension(:, :, :) :: & - recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e - integer, value, intent(in) :: n - real(dp), value, intent(in) :: nu + recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e !! Boundary solutions from neighbours + integer, value, intent(in) :: n !! Number of local grid points + real(dp), value, intent(in) :: nu !! Kinematic viscosity real(dp), device, intent(in), dimension(:) :: du_sa, du_sc, du_strch, & dud_sa, dud_sc, dud_strch, & d2u_sa, d2u_sc, d2u_strch, & - d2u_strch_cor + d2u_strch_cor !! Distributed coefficients for all three fields ! Local variables - integer :: i, j, b - real(dp) :: ur, bl, recp - real(dp) :: du_temp, dud_temp, d2u_temp - real(dp) :: du_s, du_e, dud_s, dud_e, d2u_s, d2u_e + integer :: i, j, b !! 
Thread, loop, and block indices + real(dp) :: ur, bl, recp !! Upper-right, bottom-left, reciprocal for Sherman-Morrison + real(dp) :: du_temp, dud_temp, d2u_temp !! Temporary derivative values + real(dp) :: du_s, du_e, dud_s, dud_e, d2u_s, d2u_e !! Boundary solutions for all three fields i = threadIdx%x b = blockIdx%x diff --git a/src/backend/cuda/kernels/fieldops.f90 b/src/backend/cuda/kernels/fieldops.f90 index 949bc6ab6..d3147fa5a 100644 --- a/src/backend/cuda/kernels/fieldops.f90 +++ b/src/backend/cuda/kernels/fieldops.f90 @@ -1,4 +1,10 @@ module m_cuda_kernels_fieldops + !! CUDA kernels for field operations (copy, scale, vector arithmetic, reductions). + !! + !! Provides GPU kernels for basic field manipulation: copying, scaling, shifting, + !! linear combinations (AXPBY), pointwise multiplication, scalar products, and + !! reductions (max, sum, volume integral). All kernels use thread-per-pencil-point + !! parallelisation with [[m_cuda_common(module):SZ(variable)]] threads per block. use cudafor use m_common, only: dp @@ -7,13 +13,16 @@ module m_cuda_kernels_fieldops contains attributes(global) subroutine copy(n, dst, src) + !! Copy field data: dst = src. implicit none - integer, value, intent(in) :: n - real(dp), device, intent(out), dimension(:, :, :) :: dst - real(dp), device, intent(in), dimension(:, :, :) :: src + integer, value, intent(in) :: n !! Pencil length + real(dp), device, intent(out), dimension(:, :, :) :: dst !! Destination array + real(dp), device, intent(in), dimension(:, :, :) :: src !! Source array - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -25,14 +34,17 @@ attributes(global) subroutine copy(n, dst, src) end subroutine copy attributes(global) subroutine axpby(n, alpha, x, beta, y) + !! Compute linear combination: y = alpha*x + beta*y. 
implicit none - integer, value, intent(in) :: n - real(dp), value, intent(in) :: alpha, beta - real(dp), device, intent(in), dimension(:, :, :) :: x - real(dp), device, intent(inout), dimension(:, :, :) :: y + integer, value, intent(in) :: n !! Pencil length + real(dp), value, intent(in) :: alpha, beta !! Scalar coefficients + real(dp), device, intent(in), dimension(:, :, :) :: x !! Input array + real(dp), device, intent(inout), dimension(:, :, :) :: y !! Input/Output array - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -44,13 +56,16 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y) end subroutine axpby attributes(global) subroutine pwmul(y, x, n) + !! Pointwise multiplication: y = y * x. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: y - real(dp), device, intent(in), dimension(:, :, :) :: x - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: y !! Input/Output array + real(dp), device, intent(in), dimension(:, :, :) :: x !! Multiplier array + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -62,13 +77,20 @@ attributes(global) subroutine pwmul(y, x, n) end subroutine pwmul attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) + !! Copy halo regions into send buffers. + !! + !! Extracts first and last n_halo planes into separate buffers for MPI communication. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s, u_send_e - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n, n_halo + real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s !! 
Start buffer + real(dp), device, intent(inout), dimension(:, :, :) :: u_send_e !! End buffer + real(dp), device, intent(in), dimension(:, :, :) :: u !! Source field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_halo !! Halo width - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Halo plane index + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -81,13 +103,16 @@ attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) end subroutine buffer_copy attributes(global) subroutine field_scale(f, alpha, n) + !! Scale field by constant: f = alpha * f. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: alpha - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to scale + real(dp), value, intent(in) :: alpha !! Scaling factor + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -99,13 +124,16 @@ attributes(global) subroutine field_scale(f, alpha, n) end subroutine field_scale attributes(global) subroutine field_shift(f, const, n) + !! Shift field by constant: f = f + const. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: const - integer, value, intent(in) :: n + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to shift + real(dp), value, intent(in) :: const !! Shift constant + integer, value, intent(in) :: n !! Pencil length - integer :: i, j, b + integer :: i !! Thread index (pencil point) + integer :: j !! Pencil coordinate + integer :: b !! 
Block index (pencil number) i = threadIdx%x b = blockIdx%x @@ -117,14 +145,24 @@ attributes(global) subroutine field_shift(f, const, n) end subroutine field_shift attributes(global) subroutine scalar_product(s, x, y, n, n_i_pad, n_j) + !! Compute scalar product with atomic reduction: s += sum(x * y). + !! + !! Uses atomic addition to accumulate partial sums from each pencil. implicit none - real(dp), device, intent(inout) :: s - real(dp), device, intent(in), dimension(:, :, :) :: x, y - integer, value, intent(in) :: n, n_i_pad, n_j + real(dp), device, intent(inout) :: s !! Accumulated scalar product + real(dp), device, intent(in), dimension(:, :, :) :: x !! First field + real(dp), device, intent(in), dimension(:, :, :) :: y !! Second field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count - real(dp) :: s_pncl !! pencil sum - integer :: i, j, b, b_i, b_j, ierr + real(dp) :: s_pncl !! Pencil sum + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x @@ -142,14 +180,26 @@ attributes(global) subroutine scalar_product(s, x, y, n, n_i_pad, n_j) end subroutine scalar_product attributes(global) subroutine field_max_sum(max_f, sum_f, f, n, n_i_pad, n_j) + !! Compute field maximum and sum with atomic reductions. + !! + !! Uses atomic max and add operations to accumulate pencil-wise results. implicit none - real(dp), device, intent(inout) :: max_f, sum_f - real(dp), device, intent(in), dimension(:, :, :) :: f - integer, value, intent(in) :: n, n_i_pad, n_j - - real(dp) :: max_pncl, sum_pncl, val - integer :: i, j, b, b_i, b_j, ierr + real(dp), device, intent(inout) :: max_f !! Accumulated maximum + real(dp), device, intent(inout) :: sum_f !! 
Accumulated sum + real(dp), device, intent(in), dimension(:, :, :) :: f !! Input field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count + + real(dp) :: max_pncl !! Pencil maximum + real(dp) :: sum_pncl !! Pencil sum + real(dp) :: val !! Absolute value + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x @@ -171,15 +221,21 @@ attributes(global) subroutine field_max_sum(max_f, sum_f, f, n, n_i_pad, n_j) end subroutine field_max_sum attributes(global) subroutine field_set_y_face(f, c_start, c_end, nx, ny, nz) - !! Set domain Y_FACE to a constant - !! c_start at the bottom and c_end at the top + !! Set Y-face boundary values to constants. + !! + !! Sets bottom face (y=0) to c_start and top face (y=L) to c_end. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: f - real(dp), value, intent(in) :: c_start, c_end - integer, value, intent(in) :: nx, ny, nz + real(dp), device, intent(inout), dimension(:, :, :) :: f !! Field to modify + real(dp), value, intent(in) :: c_start !! Bottom boundary value + real(dp), value, intent(in) :: c_end !! Top boundary value + integer, value, intent(in) :: nx, ny, nz !! Grid dimensions - integer :: i, j, b, n_mod, b_end + integer :: i !! Thread index + integer :: j !! X-coordinate + integer :: b !! Z-coordinate block + integer :: n_mod !! Modulo for top boundary indexing + integer :: b_end !! Top boundary block index j = threadIdx%x + (blockIdx%x - 1)*blockDim%x ! from 1 to nx b = blockIdx%y ! from 1 to nz @@ -195,14 +251,23 @@ attributes(global) subroutine field_set_y_face(f, c_start, c_end, nx, ny, nz) end subroutine field_set_y_face attributes(global) subroutine volume_integral(s, f, n, n_i_pad, n_j) + !! 
Compute volume integral with atomic reduction: s += sum(f). + !! + !! Uses atomic addition to accumulate partial sums from each pencil. implicit none - real(dp), device, intent(inout) :: s - real(dp), device, intent(in), dimension(:, :, :) :: f - integer, value, intent(in) :: n, n_i_pad, n_j - - real(dp) :: s_pncl !! pencil sum - integer :: i, j, b, b_i, b_j, ierr + real(dp), device, intent(inout) :: s !! Accumulated integral + real(dp), device, intent(in), dimension(:, :, :) :: f !! Input field + integer, value, intent(in) :: n !! Pencil length + integer, value, intent(in) :: n_i_pad !! Padded dimension for indexing + integer, value, intent(in) :: n_j !! Active pencil count + + real(dp) :: s_pncl !! Pencil sum + integer :: i !! Thread index + integer :: j !! Pencil coordinate + integer :: b !! Block index (pencil number) + integer :: b_i, b_j !! 2D block indices + integer :: ierr !! Atomic operation status i = threadIdx%x b_i = blockIdx%x diff --git a/src/backend/cuda/kernels/reorder.f90 b/src/backend/cuda/kernels/reorder.f90 index 4065a2595..cd96a029b 100644 --- a/src/backend/cuda/kernels/reorder.f90 +++ b/src/backend/cuda/kernels/reorder.f90 @@ -1,4 +1,10 @@ module m_cuda_kernels_reorder + !! CUDA kernels for pencil reordering and accumulation between X/Y/Z orientations. + !! + !! Provides GPU kernels for rearranging field data between different pencil decompositions + !! (X-pencils, Y-pencils, Z-pencils, and Cartesian). Most kernels use shared memory tiles + !! for coalesced memory access. Thread blocks use [[m_cuda_common(module):SZ(variable)]] + !! configuration (32x1 or 32x32 depending on operation). use cudafor use m_common, only: dp @@ -7,14 +13,18 @@ module m_cuda_kernels_reorder contains attributes(global) subroutine reorder_c2x(u_x, u_c, nz) + !! Reorder from Cartesian to X-pencil orientation. + !! + !! Uses shared memory transpose for efficient reordering. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_c - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_c !! Input: Cartesian data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -42,14 +52,18 @@ attributes(global) subroutine reorder_c2x(u_x, u_c, nz) end subroutine reorder_c2x attributes(global) subroutine reorder_x2c(u_c, u_x, nz) + !! Reorder from X-pencil to Cartesian orientation. + !! + !! Inverse of reorder_c2x. Uses shared memory transpose. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_c - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_c !! Output: Cartesian data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -77,14 +91,18 @@ attributes(global) subroutine reorder_x2c(u_c, u_x, nz) end subroutine reorder_x2c attributes(global) subroutine reorder_x2y(u_y, u_x, nz) + !! Reorder from X-pencil to Y-pencil orientation. + !! + !! Uses shared memory transpose for efficient reordering. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_y - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_y !! Output: Y-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j !! Thread indices + integer :: b_i, b_j, b_k !! Block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -112,13 +130,19 @@ attributes(global) subroutine reorder_x2y(u_y, u_x, nz) end subroutine reorder_x2y attributes(global) subroutine reorder_x2z(u_z, u_x, nz) + !! Reorder from X-pencil to Z-pencil orientation. + !! + !! No shared memory needed - memory access pattern is already favourable. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_z - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_z !! Output: Z-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_x !! Input: X-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i !! Thread index + integer :: j !! Loop index + integer :: b_i, b_j !! Block indices + integer :: nx !! Grid X-dimension i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x @@ -132,14 +156,15 @@ attributes(global) subroutine reorder_x2z(u_z, u_x, nz) end subroutine reorder_x2z attributes(global) subroutine reorder_y2x(u_x, u_y, nz) + !! Reorder from Y-pencil to X-pencil orientation. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j, b_i, b_j, b_k !! Thread and block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -167,14 +192,15 @@ attributes(global) subroutine reorder_y2x(u_x, u_y, nz) end subroutine reorder_y2x attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) + !! Reorder from Y-pencil to Z-pencil orientation. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_z - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nx, nz + real(dp), device, intent(out), dimension(:, :, :) :: u_z !! Output: Z-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data + integer, value, intent(in) :: nx, nz !! Grid dimensions - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory for transpose + integer :: i, j, b_i, b_j, b_k !! Thread and block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -202,13 +228,16 @@ attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) end subroutine reorder_y2z attributes(global) subroutine reorder_z2x(u_x, u_z, nz) + !! Reorder from Z-pencil to X-pencil orientation. + !! + !! No shared memory needed - favourable memory access pattern. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x !! Output: X-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i, j, b_i, b_j, nx !! Thread, loop, block indices and grid size i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x @@ -220,14 +249,17 @@ attributes(global) subroutine reorder_z2x(u_x, u_z, nz) end subroutine reorder_z2x attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) + !! Reorder from Z-pencil to Y-pencil orientation. + !! + !! Uses shared memory tile for coalesced access pattern. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_y - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nx, nz + real(dp), device, intent(out), dimension(:, :, :) :: u_y !! Output: Y-pencil data + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data + integer, value, intent(in) :: nx, nz !! X and Z dimension sizes - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory tile for transpose + integer :: i, j, b_i, b_j, b_k !! Thread, block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -255,14 +287,18 @@ attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) end subroutine reorder_z2y attributes(global) subroutine sum_yintox(u_x, u_y, nz) + !! Accumulate Y-pencil contributions into X-pencil data. + !! + !! Performs u_x += u_y with reordering. Uses shared memory tile + !! for efficient transpose and coalesced memory access. 
implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz + real(dp), device, intent(inout), dimension(:, :, :) :: u_x !! In/out: X-pencil data to accumulate into + real(dp), device, intent(in), dimension(:, :, :) :: u_y !! Input: Y-pencil data to add + integer, value, intent(in) :: nz !! Z-dimension size - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) !! Shared memory tile for transpose + integer :: i, j, b_i, b_j, b_k !! Thread, block indices i = threadIdx%x; j = threadIdx%y; b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z @@ -294,14 +330,18 @@ attributes(global) subroutine sum_yintox(u_x, u_y, nz) end subroutine sum_yintox attributes(global) subroutine sum_zintox(u_x, u_z, nz) + !! Accumulate Z-pencil contributions into X-pencil data. + !! + !! Performs u_x += u_z with reordering. No shared memory needed + !! due to favourable memory access pattern. implicit none ! Arguments - real(dp), device, intent(inout), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nz + real(dp), device, intent(inout), dimension(:, :, :) :: u_x !! In/out: X-pencil data to accumulate into + real(dp), device, intent(in), dimension(:, :, :) :: u_z !! Input: Z-pencil data to add + integer, value, intent(in) :: nz !! Z-dimension size - integer :: i, j, b_i, b_j, nx + integer :: i, j, b_i, b_j, nx !! Thread, loop, block indices and grid size i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x diff --git a/src/backend/cuda/kernels/spectral_processing.f90 b/src/backend/cuda/kernels/spectral_processing.f90 index 54b27b364..0a97f6cee 100644 --- a/src/backend/cuda/kernels/spectral_processing.f90 +++ b/src/backend/cuda/kernels/spectral_processing.f90 @@ -1,4 +1,14 @@ module m_cuda_spectral + !! 
CUDA kernels for spectral space processing and FFT post-processing. + !! + !! This module contains kernels for: + !! + !! - Post-processing spectral transforms (forward/backward) + !! - Solving Poisson equations in spectral space + !! - Enforcing and undoing periodicity in Y-direction + !! + !! Implements spectral equivalence method from JCP 228 (2009), 5989-6015, Sec 4. + !! Handles both periodic (000) and non-periodic (010) boundary conditions. use cudafor use m_common, only: dp @@ -8,14 +18,16 @@ module m_cuda_spectral contains attributes(global) subroutine memcpy3D(dst, src, nx, ny, nz) - !! Copy data between x3d2 padded arrays and cuFFTMp descriptors + !! Copy data between x3d2 padded arrays and cuFFTMp descriptors. + !! + !! Each thread handles one Y-Z plane position, looping over X. implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: dst - real(dp), device, intent(in), dimension(:, :, :) :: src - integer, value, intent(in) :: nx, ny, nz + real(dp), device, intent(inout), dimension(:, :, :) :: dst !! Output: Destination array + real(dp), device, intent(in), dimension(:, :, :) :: src !! Input: Source array + integer, value, intent(in) :: nx, ny, nz !! Grid dimensions - integer :: i, j, k + integer :: i, j, k !! Loop and thread indices j = threadIdx%x + (blockIdx%x - 1)*blockDim%x !ny k = blockIdx%y !nz @@ -34,23 +46,19 @@ attributes(global) subroutine process_spectral_000( & !! Post-processes the divergence of velocity in spectral space, including !! scaling w.r.t. grid size. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs forward post-processing, Poisson solve, and backward post-processing + !! using spectral equivalence method. Ref: JCP 228 (2009), 5989-6015, Sec 4. 
implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - complex(dp), device, intent(in), dimension(:, :, :) :: waves - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec - !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st - !> Grid size - integer, value, intent(in) :: nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence of velocity in spectral space + complex(dp), device, intent(in), dimension(:, :, :) :: waves !! Input: Spectral wavenumbers for Poisson solve + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size + integer, value, intent(in) :: y_sp_st !! Y-direction offset in the permuted slabs in spectral space + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz - real(dp) :: tmp_r, tmp_c, div_r, div_c + integer :: i, j, k, ix, iy, iz !! Loop and spectral mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c !! Temporary real/imaginary components j = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -130,26 +138,22 @@ attributes(global) subroutine process_spectral_010( & div_u, waves, nx_spec, ny_spec, y_sp_st, nx, ny, nz, & ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Post-process divergence field and solve Poisson equation in spectral space + !! for non-periodic boundary conditions in Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs forward post-processing with odd/even mode handling, Poisson solve, + !! and backward post-processing. 
Ref: JCP 228 (2009), 5989-6015, Sec 4. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - complex(dp), device, intent(in), dimension(:, :, :) :: waves - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec - !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st - !> Grid size - integer, value, intent(in) :: nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence field / pressure solution + complex(dp), device, intent(in), dimension(:, :, :) :: waves !! Input: Spectral wavenumbers for Poisson solve + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary components for left/right modes i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -288,25 +292,23 @@ end subroutine process_spectral_010 attributes(global) subroutine process_spectral_010_fw( & div_u, nx_spec, ny_spec, y_sp_st, nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Forward post-processing only for non-periodic Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! 
Performs normalisation, post-processing in X and Z, and odd/even mode handling + !! in Y. Used when Poisson solve and backward processing are separate steps. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: Divergence field to post-process + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs !> Grid size - integer, value, intent(in) :: nx, ny, nz + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary real/imaginary components i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -368,22 +370,19 @@ end subroutine process_spectral_010_fw attributes(global) subroutine process_spectral_010_poisson( & div_u, a_re, a_im, off, inc, nx_spec, n, nx, ny, nz & ) - !! Solve the Poisson equation at cell centres with non-perioic BC along y + !! Solve Poisson equation for non-periodic Y-direction using pentadiagonal solver. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Handles odd/even mode separation using offset and increment parameters. + !! Modifies pentadiagonal coefficients in-place during forward/backward passes. 
implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(inout), dimension(:, :, :, :) :: a_re, a_im - !> offset and increment. increment is 2 when considering only odd or even - integer, value, intent(in) :: off, inc - !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, n, nx, ny, nz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! In/out: RHS / Solution + real(dp), device, intent(inout), dimension(:, :, :, :) :: a_re, a_im !! In/out: Pentadiagonal coefficients (real/imag) + integer, value, intent(in) :: off, inc !! Offset and increment for odd/even modes + integer, value, intent(in) :: nx_spec, n, nx, ny, nz !! Grid dimensions - integer :: i, j, k, jm, nm - real(dp) :: tmp_r, tmp_c, div_r, div_c, epsilon + integer :: i, j, k, jm, nm !! Loop indices and mapped indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, epsilon !! Temporary variables and tolerance i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -527,25 +526,23 @@ end subroutine process_spectral_010_poisson attributes(global) subroutine process_spectral_010_bw( & div_u, nx_spec, ny_spec, y_sp_st, nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-processes the divergence of velocity in spectral space, including - !! scaling w.r.t. grid size. + !! Backward post-processing only for non-periodic Y-direction (010). !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Performs odd/even mode recombination and post-processing in X and Z directions. + !! Completes the spectral-to-physical transformation after Poisson solve. implicit none - !> Divergence of velocity in spectral space - complex(dp), device, intent(inout), dimension(:, :, :) :: div_u - !> Spectral equivalence constants - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u !! 
In/out: Solution field to post-process + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Input: Spectral equivalence constants !> Grid size in spectral space - integer, value, intent(in) :: nx_spec, ny_spec + integer, value, intent(in) :: nx_spec, ny_spec !! Spectral space grid size !> Offset in y direction in the permuted slabs in spectral space - integer, value, intent(in) :: y_sp_st + integer, value, intent(in) :: y_sp_st !! Y-direction offset in spectral slabs !> Grid size - integer, value, intent(in) :: nx, ny, nz + integer, value, intent(in) :: nx, ny, nz !! Physical space grid size - integer :: i, j, k, ix, iy, iz, iy_rev - real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c + integer :: i, j, k, ix, iy, iz, iy_rev !! Loop, spectral, and reversed mode indices + real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c !! Temporary real/imaginary components i = threadIdx%x + (blockIdx%x - 1)*blockDim%x k = blockIdx%y ! nz_spec @@ -605,13 +602,17 @@ attributes(global) subroutine process_spectral_010_bw( & end subroutine process_spectral_010_bw attributes(global) subroutine enforce_periodicity_y(f_out, f_in, ny) + !! Enforce Y-direction periodicity by reordering data for non-periodic transforms. + !! + !! Maps full domain [1:ny] to symmetric layout required by non-periodic FFT. + !! First half: odd points, second half: even points in reverse order. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: f_out - real(dp), device, intent(in), dimension(:, :, :) :: f_in - integer, value, intent(in) :: ny + real(dp), device, intent(out), dimension(:, :, :) :: f_out !! Output: Reordered field + real(dp), device, intent(in), dimension(:, :, :) :: f_in !! Input: Original field + integer, value, intent(in) :: ny !! Y-dimension size - integer :: i, j, k + integer :: i, j, k !! 
Thread and loop indices i = threadIdx%x k = blockIdx%x @@ -626,13 +627,17 @@ attributes(global) subroutine enforce_periodicity_y(f_out, f_in, ny) end subroutine enforce_periodicity_y attributes(global) subroutine undo_periodicity_y(f_out, f_in, ny) + !! Undo Y-direction periodicity reordering after non-periodic transforms. + !! + !! Inverse of enforce_periodicity_y: reconstructs original domain layout + !! from symmetric FFT ordering. Restores odd/even point positions. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: f_out - real(dp), device, intent(in), dimension(:, :, :) :: f_in - integer, value, intent(in) :: ny + real(dp), device, intent(out), dimension(:, :, :) :: f_out !! Output: Restored field + real(dp), device, intent(in), dimension(:, :, :) :: f_in !! Input: Reordered field + integer, value, intent(in) :: ny !! Y-dimension size - integer :: i, j, k + integer :: i, j, k !! Thread and loop indices i = threadIdx%x k = blockIdx%x diff --git a/src/backend/cuda/kernels/thomas.f90 b/src/backend/cuda/kernels/thomas.f90 index b5bf81169..af9522f7a 100644 --- a/src/backend/cuda/kernels/thomas.f90 +++ b/src/backend/cuda/kernels/thomas.f90 @@ -1,4 +1,14 @@ module m_cuda_kernels_thom + !! CUDA kernels for Thomas algorithm-based tridiagonal solvers. + !! + !! Implements compact finite difference schemes using Thomas algorithm + !! for both periodic and non-periodic boundary conditions. Each thread + !! handles one pencil line through the domain. + !! + !! Variants: + !! + !! - `der_univ_thom`: Non-periodic boundaries with explicit near-boundary stencils + !! - `der_univ_thom_per`: Periodic boundaries with cyclic reduction use cudafor use m_common, only: dp @@ -11,18 +21,24 @@ attributes(global) subroutine der_univ_thom( & du, u, n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, & thom_f, thom_s, thom_w, strch & ) + !! Compute derivatives using Thomas algorithm with non-periodic boundaries. + !! + !! 
Forward pass: Apply compact stencil and eliminate sub-diagonal. + !! Backward pass: Back-substitution to solve tridiagonal system. + !! Near-boundary points use explicit stencils from coeffs_s/coeffs_e. implicit none - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n_tds, n_rhs - real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, strch + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Derivative field + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: Field to differentiate + integer, value, intent(in) :: n_tds, n_rhs !! Number of unknowns and RHS points + real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Start/end explicit stencil coefficients + real(dp), device, intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients (9-point) + real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, & + thom_w, strch !! Thomas algorithm coefficients and stretching - integer :: i, j, b + integer :: i, j, b !! Thread, loop, and block indices - real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, temp_du + real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, temp_du !! Stencil coefficients and temporary i = threadIdx%x b = blockIdx%x @@ -120,21 +136,26 @@ end subroutine der_univ_thom attributes(global) subroutine der_univ_thom_per( & du, u, n, coeffs, alpha, thom_f, thom_s, thom_w, thom_p, strch & ) + !! Compute derivatives using Thomas algorithm with periodic boundaries. + !! + !! Forward pass: Apply periodic compact stencil with modulo indexing. + !! Backward pass: Standard back-substitution. + !! Periodic correction: Sherman-Morrison formula for cyclic system. 
implicit none - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n - real(dp), device, intent(in), dimension(:) :: coeffs - real(dp), value, intent(in) :: alpha + real(dp), device, intent(out), dimension(:, :, :) :: du !! Output: Derivative field + real(dp), device, intent(in), dimension(:, :, :) :: u !! Input: Field to differentiate + integer, value, intent(in) :: n !! Number of points in periodic direction + real(dp), device, intent(in), dimension(:) :: coeffs !! Stencil coefficients (9-point) + real(dp), value, intent(in) :: alpha !! Periodic coupling coefficient real(dp), device, intent(in), dimension(:) :: thom_f, thom_s, thom_w, & - thom_p, strch + thom_p, strch !! Thomas and periodic correction coefficients - integer :: i, j, b - integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 + integer :: i, j, b !! Thread, loop, and block indices + integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 !! Periodic neighbor indices - real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 - real(dp) :: temp_du, ss + real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 !! Stencil coefficients + real(dp) :: temp_du, ss !! Temporary derivative and Sherman-Morrison correction i = threadIdx%x b = blockIdx%x diff --git a/src/backend/cuda/poisson_fft.f90 b/src/backend/cuda/poisson_fft.f90 index 32a362f10..5b421a3ef 100644 --- a/src/backend/cuda/poisson_fft.f90 +++ b/src/backend/cuda/poisson_fft.f90 @@ -1,4 +1,9 @@ module m_cuda_poisson_fft + !! FFT-based Poisson solver on GPU using cuFFT. + !! + !! Extends `poisson_fft_t` with device-resident spectral data and cuFFT plans. + !! Handles forward/backward transforms, spectral post-processing for different + !! boundary conditions, and periodic extensions. 
use iso_c_binding, only: c_loc, c_ptr, c_f_pointer, c_int, c_float, & c_double_complex, c_float_complex use iso_fortran_env, only: stderr => error_unit @@ -24,7 +29,7 @@ module m_cuda_poisson_fft implicit none type, extends(poisson_fft_t) :: cuda_poisson_fft_t - !! FFT based Poisson solver + !! GPU-accelerated FFT-based Poisson solver with device-resident spectral data. !> Local domain sized array storing the spectral equivalence constants complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev @@ -149,20 +154,28 @@ end subroutine create_fft_plan function init(mesh, xdirps, ydirps, zdirps, lowmem) & result(poisson_fft) + !! Initialise CUDA Poisson FFT solver with cuFFT plans and spectral arrays. + !! + !! Sets up 3D FFT plans, allocates device storage for wave numbers and + !! stretching operators, and configures 1D decomposition (Z in real space, + !! Y in spectral space). implicit none - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + type(mesh_t), intent(in) :: mesh !! Computational mesh + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + logical, optional, intent(in) :: lowmem !! Low memory mode flag - type(cuda_poisson_fft_t) :: poisson_fft + type(cuda_poisson_fft_t) :: poisson_fft !! Initialised solver - integer :: nx, ny, nz + integer :: nx, ny, nz !! Global grid dimensions - integer :: ierr - integer(int_ptr_kind()) :: worksize + integer :: ierr !! Error code + integer(int_ptr_kind()) :: worksize !! cuFFT workspace size - integer :: dims_glob(3), dims_loc(3), n_spec(3), n_sp_st(3) + integer :: dims_glob(3) !! Global domain dimensions + integer :: dims_loc(3) !! Local domain dimensions + integer :: n_spec(3) !! Spectral space dimensions + integer :: n_sp_st(3) !! Spectral space start indices ! 
1D decomposition along Z in real domain, and along Y in spectral space if (mesh%par%nproc_dir(2) /= 1) print *, 'nproc_dir in y-dir must be 1' @@ -282,19 +295,25 @@ function init(mesh, xdirps, ydirps, zdirps, lowmem) & end function init subroutine fft_forward_cuda(self, f) + !! Execute forward 3D FFT on device field. + !! + !! Copies padded field data into cuFFT descriptor storage and performs + !! forward transform using cuFFTMp. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(in) :: f + class(field_t), intent(in) :: f !! Input field in real space - real(dp), device, pointer :: padded_dev(:, :, :), d_dev(:, :, :) - real(dp), device, pointer :: f_ptr - type(c_ptr) :: f_c_ptr + real(dp), device, pointer :: padded_dev(:, :, :) !! Padded field data + real(dp), device, pointer :: d_dev(:, :, :) !! cuFFT descriptor data + real(dp), device, pointer :: f_ptr !! Workaround device pointer for cuFFT + type(c_ptr) :: f_c_ptr !! Intermediate C pointer for workaround - type(cudaXtDesc), pointer :: descriptor + type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor - integer :: tsize, ierr - type(dim3) :: blocks, threads + integer :: tsize !! Thread block size + integer :: ierr !! Error code + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f) type is (cuda_field_t) @@ -340,19 +359,25 @@ subroutine fft_forward_cuda(self, f) end subroutine fft_forward_cuda subroutine fft_backward_cuda(self, f) + !! Execute backward 3D FFT and copy result to device field. + !! + !! Performs inverse transform using cuFFTMp and copies result from + !! descriptor storage back to field's device array. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f + class(field_t), intent(inout) :: f !! Output field in real space - real(dp), device, pointer :: padded_dev(:, :, :), d_dev(:, :, :) - real(dp), device, pointer :: f_ptr - type(c_ptr) :: f_c_ptr + real(dp), device, pointer :: padded_dev(:, :, :) !! 
Padded field data + real(dp), device, pointer :: d_dev(:, :, :) !! cuFFT descriptor data + real(dp), device, pointer :: f_ptr !! Workaround device pointer for cuFFT + type(c_ptr) :: f_c_ptr !! Intermediate C pointer for workaround - type(cudaXtDesc), pointer :: descriptor + type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor - integer :: tsize, ierr - type(dim3) :: blocks, threads + integer :: tsize !! Thread block size + integer :: ierr !! Error code + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f) type is (cuda_field_t) @@ -399,15 +424,19 @@ end subroutine fft_backward_cuda subroutine fft_postprocess_000_cuda(self) + !! Post-process spectral data for Periodic-Periodic-Periodic boundaries. + !! + !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with + !! periodic boundaries in all directions. implicit none class(cuda_poisson_fft_t) :: self - type(cudaXtDesc), pointer :: descriptor + type(cudaXtDesc), pointer :: descriptor !! cuFFTMp descriptor - complex(dp), device, dimension(:, :, :), pointer :: c_dev - type(dim3) :: blocks, threads - integer :: tsize + complex(dp), device, dimension(:, :, :), pointer :: c_dev !! Spectral data + type(dim3) :: blocks, threads !! CUDA kernel configuration + integer :: tsize !! Thread block size ! tsize is different than SZ, because here we work on a 3D Cartesian ! data structure, and free to specify any suitable thread/block size. @@ -438,15 +467,22 @@ end subroutine fft_postprocess_000_cuda subroutine fft_postprocess_010_cuda(self) + !! Post-process spectral data for Periodic-Non-periodic-Periodic boundaries. + !! + !! Solves Poisson equation $\nabla^2 p = f$ in spectral space with periodic + !! boundaries in X and Z, non-periodic in Y. Handles stretched meshes with + !! matrix solves in spectral space.
implicit none class(cuda_poisson_fft_t) :: self type(cudaXtDesc), pointer :: descriptor - complex(dp), device, dimension(:, :, :), pointer :: c_dev - type(dim3) :: blocks, threads - integer :: tsize, off, inc + complex(dp), device, dimension(:, :, :), pointer :: c_dev !! Spectral data + type(dim3) :: blocks, threads !! CUDA kernel configuration + integer :: tsize !! Thread block size + integer :: off !! Array offset for odd/even modes + integer :: inc !! Array increment stride ! tsize is different than SZ, because here we work on a 3D Cartesian ! data structure, and free to specify any suitable thread/block size. @@ -542,14 +578,19 @@ subroutine fft_postprocess_010_cuda(self) end subroutine fft_postprocess_010_cuda subroutine enforce_periodicity_y_cuda(self, f_out, f_in) + !! Enforce periodic extension in Y for non-periodic boundaries. + !! + !! Extends field from physical domain size to doubled periodic domain + !! by symmetry (f(y+L) = f(L-y)) for non-periodic boundary FFTs. implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(field_t), intent(inout) :: f_out !! Extended periodic field + class(field_t), intent(in) :: f_in !! Original physical field - real(dp), device, pointer, dimension(:, :, :) :: f_out_dev, f_in_dev - type(dim3) :: blocks, threads + real(dp), device, pointer, dimension(:, :, :) :: f_out_dev !! Output device data + real(dp), device, pointer, dimension(:, :, :) :: f_in_dev !! Input device data + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f_out) type is (cuda_field_t) @@ -569,14 +610,19 @@ subroutine enforce_periodicity_y_cuda(self, f_out, f_in) end subroutine enforce_periodicity_y_cuda subroutine undo_periodicity_y_cuda(self, f_out, f_in) + !! Extract physical domain from periodic extension in Y. + !! + !! Reverses enforce_periodicity_y by extracting original domain size + !! from doubled periodic field after inverse FFT. 
implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(field_t), intent(inout) :: f_out !! Physical domain field + class(field_t), intent(in) :: f_in !! Extended periodic field - real(dp), device, pointer, dimension(:, :, :) :: f_out_dev, f_in_dev - type(dim3) :: blocks, threads + real(dp), device, pointer, dimension(:, :, :) :: f_out_dev !! Output device data + real(dp), device, pointer, dimension(:, :, :) :: f_in_dev !! Input device data + type(dim3) :: blocks, threads !! CUDA kernel configuration select type (f_out) type is (cuda_field_t) diff --git a/src/backend/cuda/sendrecv.f90 b/src/backend/cuda/sendrecv.f90 index 4df5861d3..37ed2184e 100644 --- a/src/backend/cuda/sendrecv.f90 +++ b/src/backend/cuda/sendrecv.f90 @@ -1,4 +1,18 @@ module m_cuda_sendrecv + !! MPI communication for CUDA backend using device pointers. + !! + !! Passes device pointers directly to MPI calls. With GPU-aware MPI + !! implementations (e.g., OpenMPI with CUDA support, MVAPICH2-GDR), + !! data transfers directly between GPU memories without staging through + !! host, reducing latency and increasing bandwidth. + !! + !! Without GPU-aware MPI, the implementation may stage through host + !! memory automatically, still functional but with additional overhead. + !! + !! - `sendrecv_fields`: Single field halo exchange + !! - `sendrecv_3fields`: Batch exchange for three fields (velocity components + !! or derivatives). Batching amortises MPI overhead and enables better + !! network utilisation. use cudafor use mpi @@ -10,11 +24,21 @@ module m_cuda_sendrecv subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & n_data, nproc, prev, next) + !! Exchange boundary halos using MPI with device pointers. + !! + !! MPI_Isend/Irecv allows all four communications (send to prev/next, + !! receive from prev/next) to proceed concurrently, enabling network + !! pipelining. 
MPI_Waitall synchronises only when results needed. + !! + !! When nproc=1, data copied directly on device without MPI. implicit none - real(dp), device, dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e - real(dp), device, dimension(:, :, :), intent(in) :: f_send_s, f_send_e - integer, intent(in) :: n_data, nproc, prev, next + real(dp), device, dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e !! Device receive buffers + real(dp), device, dimension(:, :, :), intent(in) :: f_send_s, f_send_e !! Device send buffers + integer, intent(in) :: n_data !! Number of data elements + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: prev !! Previous neighbour rank + integer, intent(in) :: next !! Next neighbour rank integer :: req(4), err(4), ierr, tag = 1234 @@ -41,13 +65,22 @@ subroutine sendrecv_3fields( & f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e, & n_data, nproc, prev, next & ) + !! Exchange three fields simultaneously using batched MPI communication. + !! + !! Used for: (1) velocity component halos (u, v, w) before computing transport + !! equation, (2) derivative field halos (du, dud, d2u) in distributed compact + !! schemes. Batching all three fields amortises MPI setup overhead. Single + !! MPI_Waitall for all 12 operations reduces synchronisation points. implicit none real(dp), device, dimension(:, :, :), intent(out) :: & - f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e + f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e !! Device receive buffers real(dp), device, dimension(:, :, :), intent(in) :: & - f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e - integer, intent(in) :: n_data, nproc, prev, next + f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e !! Device send buffers + integer, intent(in) :: n_data !! Number of data elements per field + integer, intent(in) :: nproc !! Number of processes + integer, intent(in) :: prev !! 
Previous neighbour rank + integer, intent(in) :: next !! Next neighbour rank integer :: req(12), err(12), ierr, tag = 1234 diff --git a/src/backend/cuda/tdsops.f90 b/src/backend/cuda/tdsops.f90 index b14cf5614..2ce9e47e8 100644 --- a/src/backend/cuda/tdsops.f90 +++ b/src/backend/cuda/tdsops.f90 @@ -1,4 +1,9 @@ module m_cuda_tdsops + !! GPU-resident tridiagonal operator coefficients. + !! + !! Extends base `tdsops_t` with device memory copies of all coefficient + !! arrays. One-time upload to GPU avoids repeated host-device transfers + !! during kernel execution, critical for performance. use iso_fortran_env, only: stderr => error_unit use m_common, only: dp @@ -7,18 +12,15 @@ module m_cuda_tdsops implicit none type, extends(tdsops_t) :: cuda_tdsops_t - !! CUDA extension of the Tridiagonal Solver Operators class. - !! - !! Regular tdsops_t class is initiated and the coefficient arrays are - !! copied into device arrays so that cuda kernels can use them. + !! Tridiagonal operators with device-resident coefficients. real(dp), device, allocatable :: dist_fw_dev(:), dist_bw_dev(:), & dist_sa_dev(:), dist_sc_dev(:), & - dist_af_dev(:) + dist_af_dev(:) !! Distributed compact scheme coefficients real(dp), device, allocatable :: thom_f_dev(:), thom_s_dev(:), & - thom_w_dev(:), thom_p_dev(:) - real(dp), device, allocatable :: stretch_dev(:), stretch_correct_dev(:) + thom_w_dev(:), thom_p_dev(:) !! Thomas algorithm coefficients + real(dp), device, allocatable :: stretch_dev(:), stretch_correct_dev(:) !! Grid stretching factors real(dp), device, allocatable :: coeffs_dev(:), & - coeffs_s_dev(:, :), coeffs_e_dev(:, :) + coeffs_s_dev(:, :), coeffs_e_dev(:, :) !! Finite difference stencils contains end type cuda_tdsops_t @@ -32,11 +34,13 @@ function cuda_tdsops_init( & n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) result(tdsops) - !! Constructor function for the cuda_tdsops_t class. - !! 
See tdsops_t for details. + !! Initialise tridiagonal operators and upload to GPU. + !! + !! Computes coefficients on CPU via base tdsops_init, then copies + !! to device arrays for kernel access. See tdsops_t for parameters. implicit none - type(cuda_tdsops_t) :: tdsops !! return value of the function + type(cuda_tdsops_t) :: tdsops integer, intent(in) :: n_tds real(dp), intent(in) :: delta diff --git a/src/backend/omp/backend.f90 b/src/backend/omp/backend.f90 index 370108a5a..5a812b16f 100644 --- a/src/backend/omp/backend.f90 +++ b/src/backend/omp/backend.f90 @@ -1,4 +1,39 @@ module m_omp_backend + !! OpenMP/CPU backend implementation for X3D2 solver operations. + !! + !! This module provides the CPU-based backend using OpenMP for shared-memory + !! parallelism and MPI for distributed-memory parallelism. It implements all + !! abstract backend operations defined in `base_backend_t`. + !! + !! **Parallelisation Strategy:** + !! + !! - **MPI**: Domain decomposition across nodes/processes + !! - **OpenMP**: Thread parallelism within each MPI rank + !! - **Hybrid MPI+OpenMP**: Enables efficient use of multi-core clusters + !! + !! **Key Features:** + !! + !! - Compact finite difference operators (tridiagonal solves) + !! - Halo exchange for distributed derivatives + !! - FFT-based Poisson solver integration + !! - Vectorised array operations + !! - Optimised data reordering between decomposition directions + !! + !! **Memory Management:** + !! + !! - Send/receive buffers for MPI halo exchange (`u`, `v`, `w`, `du`, `dud`, `d2u`) + !! - Buffers sized based on largest decomposition direction + !! - Persistent buffers to avoid repeated allocation + !! + !! **Solver Operations:** + !! + !! - `transeq`: Transport equation terms with halo exchange + !! - `tds_solve`: Tridiagonal system solves (Thomas algorithm) + !! - `reorder`: Data layout transformations (`DIR_X`, `DIR_Y`, `DIR_Z`) + !! - Field operations: copy, add, multiply, integrate, etc. + !! + !! 
**Note:** This backend requires 2DECOMP&FFT library for FFT operations + !! when using the spectral Poisson solver. use mpi use m_allocator, only: allocator_t @@ -20,37 +55,53 @@ module m_omp_backend private :: transeq_halo_exchange, transeq_dist_component type, extends(base_backend_t) :: omp_backend_t - !character(len=*), parameter :: name = 'omp' + !! OpenMP/CPU backend for solver operations. + !! + !! Extends `base_backend_t` with CPU-specific implementations using + !! OpenMP for threading and MPI for distributed parallelism. Maintains + !! communication buffers for halo exchange operations. + !! + !! **Communication Buffers:** + !! Arrays sized (SZ, n_halo, n_groups) where: + !! - SZ: maximum pencil size for data reordering + !! - n_halo: halo region depth (typically 4 for compact schemes) + !! - n_groups: maximum number of groups across all directions + !! + !! Buffer naming convention: {field}_{send/recv}_{s/e} + !! - field: u, v, w (velocity), du, dud, d2u (derivatives) + !! - send/recv: data direction + !! - s/e: start/end of domain (neighbouring ranks) + !character(len=*), parameter :: name = 'omp' !! Backend identifier real(dp), allocatable, dimension(:, :, :) :: & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - v_recv_s, v_recv_e, v_send_s, v_send_e, & - w_recv_s, w_recv_e, w_send_s, w_send_e, & - du_send_s, du_send_e, du_recv_s, du_recv_e, & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + u_recv_s, u_recv_e, u_send_s, u_send_e, & !! Velocity u halo buffers + v_recv_s, v_recv_e, v_send_s, v_send_e, & !! Velocity v halo buffers + w_recv_s, w_recv_e, w_send_s, w_send_e, & !! Velocity w halo buffers + du_send_s, du_send_e, du_recv_s, du_recv_e, & !! First derivative buffers + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & !! Mixed derivative buffers + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e !! 
Second derivative buffers contains - procedure :: alloc_tdsops => alloc_omp_tdsops - procedure :: transeq_x => transeq_x_omp - procedure :: transeq_y => transeq_y_omp - procedure :: transeq_z => transeq_z_omp - procedure :: transeq_species => transeq_species_omp - procedure :: tds_solve => tds_solve_omp - procedure :: reorder => reorder_omp - procedure :: sum_yintox => sum_yintox_omp - procedure :: sum_zintox => sum_zintox_omp - procedure :: veccopy => veccopy_omp - procedure :: vecadd => vecadd_omp - procedure :: vecmult => vecmult_omp - procedure :: scalar_product => scalar_product_omp - procedure :: field_max_mean => field_max_mean_omp - procedure :: field_scale => field_scale_omp - procedure :: field_shift => field_shift_omp - procedure :: field_set_face => field_set_face_omp - procedure :: field_volume_integral => field_volume_integral_omp - procedure :: copy_data_to_f => copy_data_to_f_omp - procedure :: copy_f_to_data => copy_f_to_data_omp - procedure :: init_poisson_fft => init_omp_poisson_fft - procedure :: transeq_omp_dist + procedure :: alloc_tdsops => alloc_omp_tdsops !! Allocate tridiagonal operators + procedure :: transeq_x => transeq_x_omp !! Transport equation in X + procedure :: transeq_y => transeq_y_omp !! Transport equation in Y + procedure :: transeq_z => transeq_z_omp !! Transport equation in Z + procedure :: transeq_species => transeq_species_omp !! Transport for species/scalars + procedure :: tds_solve => tds_solve_omp !! Tridiagonal solve + procedure :: reorder => reorder_omp !! Data reordering + procedure :: sum_yintox => sum_yintox_omp !! Sum Y data into X + procedure :: sum_zintox => sum_zintox_omp !! Sum Z data into X + procedure :: veccopy => veccopy_omp !! Vector copy + procedure :: vecadd => vecadd_omp !! Vector add + procedure :: vecmult => vecmult_omp !! Vector multiply + procedure :: scalar_product => scalar_product_omp !! Scalar product + procedure :: field_max_mean => field_max_mean_omp !! 
Compute max and mean + procedure :: field_scale => field_scale_omp !! Scale field + procedure :: field_shift => field_shift_omp !! Shift field values + procedure :: field_set_face => field_set_face_omp !! Set face values + procedure :: field_volume_integral => field_volume_integral_omp !! Volume integral + procedure :: copy_data_to_f => copy_data_to_f_omp !! Copy data to field + procedure :: copy_f_to_data => copy_f_to_data_omp !! Copy field to data + procedure :: init_poisson_fft => init_omp_poisson_fft !! Initialise FFT Poisson + procedure :: transeq_omp_dist !! Distributed transeq (internal) end type omp_backend_t interface omp_backend_t @@ -60,11 +111,21 @@ module m_omp_backend contains function init(mesh, allocator) result(backend) + !! Initialise OpenMP backend with mesh and allocator. + !! + !! Sets up the backend by: + !! 1. Calling base initialisation + !! 2. Linking mesh and allocator + !! 3. Determining maximum number of groups across directions + !! 4. Allocating communication buffers for halo exchange + !! + !! **Buffer Sizing:** Buffers are sized based on the largest decomposition + !! direction to handle all reordering operations efficiently. implicit none - type(mesh_t), target, intent(inout) :: mesh - class(allocator_t), target, intent(inout) :: allocator - type(omp_backend_t) :: backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + class(allocator_t), target, intent(inout) :: allocator !! Memory allocator + type(omp_backend_t) :: backend !! Initialised backend instance integer :: n_groups @@ -113,19 +174,24 @@ subroutine alloc_omp_tdsops( & self, tdsops, n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) + !! Allocate and initialise tridiagonal operator for OMP backend. + !! + !! Creates a `tdsops_t` object configured for the specified operation + !! (derivative, interpolation) with chosen compact scheme and boundary + !! conditions. 
Handles grid stretching and viscous corrections. implicit none - class(omp_backend_t) :: self - class(tdsops_t), allocatable, intent(inout) :: tdsops - integer, intent(in) :: n_tds - real(dp), intent(in) :: delta - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end - real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) - integer, optional, intent(in) :: n_halo - character(*), optional, intent(in) :: from_to - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(omp_backend_t) :: self !! Backend instance + class(tdsops_t), allocatable, intent(inout) :: tdsops !! Tridiagonal operator to allocate + integer, intent(in) :: n_tds !! Number of points in direction + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation, scheme !! Operation type and scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary condition codes + real(dp), optional, intent(in) :: stretch(:), stretch_correct(:) !! Grid stretching + integer, optional, intent(in) :: n_halo !! Halo depth + character(*), optional, intent(in) :: from_to !! Data location transition + logical, optional, intent(in) :: sym !! Symmetry flag + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Viscous correction parameters allocate (tdsops_t :: tdsops) @@ -139,26 +205,40 @@ subroutine alloc_omp_tdsops( & end subroutine alloc_omp_tdsops subroutine transeq_x_omp(self, du, dv, dw, u, v, w, nu, dirps) + !! Compute transport equation RHS in X direction. + !! + !! Evaluates convection and diffusion terms for momentum equations: + !! \( du/dt = -u \cdot \nabla u + \nu \nabla^2 u \) + !! + !! Delegates to `transeq_omp_dist` which handles halo exchange and + !! distributed compact schemes. 
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: du, dv, dw
- class(field_t), intent(in) :: u, v, w
- real(dp), intent(in) :: nu
- type(dirps_t), intent(in) :: dirps
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: du, dv, dw !! Output: velocity RHS
+ class(field_t), intent(in) :: u, v, w !! Input: velocity fields
+ real(dp), intent(in) :: nu !! Kinematic viscosity
+ type(dirps_t), intent(in) :: dirps !! Directional operators

call self%transeq_omp_dist(du, dv, dw, u, v, w, nu, dirps)

end subroutine transeq_x_omp

subroutine transeq_y_omp(self, du, dv, dw, u, v, w, nu, dirps)
+ !! Compute transport equation RHS in Y direction.
+ !!
+ !! Calculates convective and viscous terms for Y-pencil decomposition.
+ !! Velocity components are reordered (v, u, w) to align primary
+ !! direction with pencil orientation before calling distributed kernel.
+ !!
+ !! See [[transeq_x_omp]] for transport equation formulation.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: du, dv, dw
- class(field_t), intent(in) :: u, v, w
- real(dp), intent(in) :: nu
- type(dirps_t), intent(in) :: dirps
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output)
+ class(field_t), intent(in) :: u, v, w !! Velocity components
+ real(dp), intent(in) :: nu !! Kinematic viscosity
+ type(dirps_t), intent(in) :: dirps !! Directional operators

! u, v, w is reordered so that we pass v, u, w
call self%transeq_omp_dist(dv, du, dw, v, u, w, nu, dirps)
@@ -166,13 +246,20 @@ subroutine transeq_y_omp(self, du, dv, dw, u, v, w, nu, dirps)
end subroutine transeq_y_omp

subroutine transeq_z_omp(self, du, dv, dw, u, v, w, nu, dirps)
+ !! Compute transport equation RHS in Z direction.
+ !!
+ !! Calculates convective and viscous terms for Z-pencil decomposition.
+ !! Velocity components are reordered (w, u, v) to align primary
+ !! 
direction with pencil orientation before calling distributed kernel.
+ !!
+ !! See [[transeq_x_omp]] for transport equation formulation.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: du, dv, dw
- class(field_t), intent(in) :: u, v, w
- real(dp), intent(in) :: nu
- type(dirps_t), intent(in) :: dirps
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output)
+ class(field_t), intent(in) :: u, v, w !! Velocity components
+ real(dp), intent(in) :: nu !! Kinematic viscosity
+ type(dirps_t), intent(in) :: dirps !! Directional operators

! u, v, w is reordered so that we pass w, u, v
call self%transeq_omp_dist(dw, du, dv, w, u, v, nu, dirps)
@@ -180,18 +267,24 @@ subroutine transeq_z_omp(self, du, dv, dw, u, v, w, nu, dirps)
end subroutine transeq_z_omp

subroutine transeq_species_omp(self, dspec, uvw, spec, nu, dirps, sync)
- !! Compute the convection and diffusion for the given field
- !! in the given direction.
- !! Halo exchange for the given field is necessary
- !! When sync is true, halo exchange of momentum is necessary
+ !! Compute transport equation RHS for scalar species.
+ !!
+ !! Calculates convective and diffusive terms for a passive scalar
+ !! (temperature, concentration, etc.) transported by velocity field.
+ !!
+ !! **Equation:** `$\partial\phi/\partial t = -\mathbf{u}\cdot\nabla\phi + \nu\nabla^2\phi$` where $\phi$ is the scalar species.
+ !!
+ !! **Synchronisation:** When `sync=.true.`, performs halo exchange
+ !! for velocity field before computation. Always exchanges scalar halos.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: dspec
- class(field_t), intent(in) :: uvw, spec
- real(dp), intent(in) :: nu
- type(dirps_t), intent(in) :: dirps
- logical, intent(in) :: sync
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: dspec !! 
Time derivative of species (output) + class(field_t), intent(in) :: uvw !! Velocity component in pencil direction + class(field_t), intent(in) :: spec !! Species concentration/temperature + real(dp), intent(in) :: nu !! Diffusivity coefficient + type(dirps_t), intent(in) :: dirps !! Spectral operators + logical, intent(in) :: sync !! Perform velocity halo exchange if true integer :: n_groups @@ -229,13 +322,21 @@ subroutine transeq_species_omp(self, dspec, uvw, spec, nu, dirps, sync) end subroutine transeq_species_omp subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, nu, dirps) + !! Internal: Distributed transport equation implementation. + !! + !! Orchestrates the complete transport equation calculation for + !! all velocity components. First performs halo exchange for + !! distributed compact derivatives, then computes each component's + !! RHS using transeq_dist_component. + !! + !! **Called by:** transeq_x/y/z_omp after velocity reordering implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - real(dp), intent(in) :: nu - type(dirps_t), intent(in) :: dirps + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du, dv, dw !! Time derivatives (output) + class(field_t), intent(in) :: u, v, w !! Velocity components (reordered for pencil direction) + real(dp), intent(in) :: nu !! Kinematic viscosity + type(dirps_t), intent(in) :: dirps !! Spectral operators call transeq_halo_exchange(self, u, v, w, dirps%dir) @@ -258,9 +359,17 @@ subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, nu, dirps) end subroutine transeq_omp_dist subroutine transeq_halo_exchange(self, u, v, w, dir) - class(omp_backend_t) :: self - class(field_t), intent(in) :: u, v, w - integer, intent(in) :: dir + !! Internal: Perform halo exchange for all velocity components. + !! + !! Exchanges 4-point halos between neighbouring MPI processes for + !! 
distributed compact finite difference stencils. Copies boundary + !! data into send buffers, performs MPI sendrecv, stores in receive + !! buffers for use in derivative calculations. + !! + !! **Operation:** Copy to buffers $\rightarrow$ MPI_Sendrecv $\rightarrow$ Store halos + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: u, v, w !! Velocity components + integer, intent(in) :: dir !! Communication direction integer :: n, nproc_dir, pprev, pnext integer :: n_groups @@ -296,20 +405,27 @@ subroutine transeq_dist_component(self, rhs_du, u, conv, nu, & u_recv_s, u_recv_e, & conv_recv_s, conv_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, dir) - !! Computes RHS_x^u following: + !! Internal: Compute single component of transport equation RHS. + !! + !! Calculates RHS for one velocity component using skew-symmetric form: + !! + !! **Formula:** `rhs = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2` + !! + !! Uses distributed compact FD kernels with halo data from neighbours. + !! Allocates temporary storage for derivatives and releases after use. !! - !! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2 - class(omp_backend_t) :: self + !! **Skew-symmetric:** Reduces aliasing errors in nonlinear convection. + class(omp_backend_t) :: self !! Backend instance !> The result field, it is also used as temporary storage - class(field_t), intent(inout) :: rhs_du - class(field_t), intent(in) :: u, conv - real(dp), intent(in) :: nu + class(field_t), intent(inout) :: rhs_du !! RHS output (also temp storage) + class(field_t), intent(in) :: u, conv !! Velocity component and convecting velocity + real(dp), intent(in) :: nu !! Kinematic viscosity real(dp), dimension(:, :, :), intent(in) :: u_recv_s, u_recv_e, & - conv_recv_s, conv_recv_e - class(tdsops_t), intent(in) :: tdsops_du - class(tdsops_t), intent(in) :: tdsops_dud - class(tdsops_t), intent(in) :: tdsops_d2u - integer, intent(in) :: dir + conv_recv_s, conv_recv_e !! 
Halo data from neighbours + class(tdsops_t), intent(in) :: tdsops_du !! First derivative operator + class(tdsops_t), intent(in) :: tdsops_dud !! Product derivative operator + class(tdsops_t), intent(in) :: tdsops_d2u !! Second derivative operator + integer, intent(in) :: dir !! Direction index class(field_t), pointer :: d2u, dud dud => self%allocator%get_block(dir) @@ -334,12 +450,20 @@ subroutine transeq_dist_component(self, rhs_du, u, conv, nu, & end subroutine transeq_dist_component subroutine tds_solve_omp(self, du, u, tdsops) + !! Solve tridiagonal system for compact finite difference operation. + !! + !! Applies compact scheme operator to field using Thomas algorithm. + !! Handles both local (single-process) and distributed (multi-process) + !! solves depending on decomposition configuration. + !! + !! **Data Location:** Updates output data location based on operator's + !! `move` specification (e.g., CELL to VERT for interpolation). implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - class(tdsops_t), intent(in) :: tdsops + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: du !! Output field + class(field_t), intent(in) :: u !! Input field + class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator ! Check if direction matches for both in/out fields if (u%dir /= du%dir) then @@ -355,12 +479,22 @@ subroutine tds_solve_omp(self, du, u, tdsops) end subroutine tds_solve_omp subroutine tds_solve_dist(self, du, u, tdsops) + !! Internal: Distributed tridiagonal solve with halo exchange. + !! + !! Solves compact finite difference system across multiple MPI processes. + !! Performs halo exchange before calling distributed Thomas algorithm + !! kernel. Used when domain decomposition splits the pencil direction. + !! + !! **Algorithm:** + !! 1. Copy boundary data into send buffers + !! 2. MPI_Sendrecv for halo exchange + !! 3. 
Distributed Thomas algorithm with boundary coupling
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: du
- class(field_t), intent(in) :: u
- class(tdsops_t), intent(in) :: tdsops
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: du !! Solution field (output)
+ class(field_t), intent(in) :: u !! RHS field
+ class(tdsops_t), intent(in) :: tdsops !! Tridiagonal operator

integer :: n_groups, dir

dir = u%dir
@@ -387,12 +521,24 @@ subroutine tds_solve_dist(self, du, u, tdsops)
end subroutine tds_solve_dist

subroutine reorder_omp(self, u_, u, direction)
+ !! Reorder field data between different pencil decompositions.
+ !!
+ !! Transforms field layout from one decomposition direction to another
+ !! (e.g., X-pencils to Y-pencils). Data is rearranged locally via
+ !! index mapping between the two pencil layouts.
+ !!
+ !! **Directions:** DIR_X, DIR_Y, DIR_Z specify pencil orientations.
+ !! Each pencil is contiguous along its direction and distributed in
+ !! the other two dimensions.
+ !!
+ !! **Performance:** Critical operation for multi-dimensional algorithms.
+ !! Uses `get_index_reordering` for efficient cache-friendly reordering.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: u_
- class(field_t), intent(in) :: u
- integer, intent(in) :: direction
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: u_ !! Output field (reordered)
+ class(field_t), intent(in) :: u !! Input field
+ integer, intent(in) :: direction !! Reordering direction code

integer, dimension(3) :: dims, cart_padded
integer :: i, j, k
integer :: out_i, out_j, out_k
@@ -420,33 +566,50 @@ subroutine reorder_omp(self, u_, u, direction)
end subroutine reorder_omp

subroutine sum_yintox_omp(self, u, u_)
+ !! Sum Y-pencils into X-pencils through reordering.
+ !!
+ !! Performs directional reduction by reordering from Y to X pencils
+ !! 
and summing the result into the destination field. Used to
+ !! accumulate contributions computed in the Y-pencil layout.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: u
- class(field_t), intent(in) :: u_
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: u !! Destination field (X-pencils, accumulates result)
+ class(field_t), intent(in) :: u_ !! Source field (Y-pencils)

call sum_intox_omp(self, u, u_, DIR_Y)

end subroutine sum_yintox_omp

subroutine sum_zintox_omp(self, u, u_)
+ !! Sum Z-pencils into X-pencils through reordering.
+ !!
+ !! Performs directional reduction by reordering from Z to X pencils
+ !! and summing the result into the destination field. Used to
+ !! accumulate contributions computed in the Z-pencil layout.
implicit none
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: u
- class(field_t), intent(in) :: u_
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: u !! Destination field (X-pencils, accumulates result)
+ class(field_t), intent(in) :: u_ !! Source field (Z-pencils)

call sum_intox_omp(self, u, u_, DIR_Z)

end subroutine sum_zintox_omp

subroutine sum_intox_omp(self, u, u_, dir_to)
+ !! Internal helper: Sum reordered field into X-pencils.
+ !!
+ !! Reorders the source field from the specified direction into
+ !! X-pencils, then accumulates into the destination field. Called by
+ !! sum_yintox_omp and sum_zintox_omp to accumulate contributions.
+ !!
+ !! **Algorithm:** Reorder with index mapping, accumulate with +=
- class(omp_backend_t) :: self
- class(field_t), intent(inout) :: u
- class(field_t), intent(in) :: u_
- integer, intent(in) :: dir_to
+ class(omp_backend_t) :: self !! Backend instance
+ class(field_t), intent(inout) :: u !! Destination field (accumulates result)
+ class(field_t), intent(in) :: u_ !! Source field
+ integer, intent(in) :: dir_to !! 
Target direction (DIR_Y or DIR_Z) integer :: dir_from integer, dimension(3) :: dims, cart_padded @@ -473,12 +636,16 @@ subroutine sum_intox_omp(self, u, u_, dir_to) end subroutine sum_intox_omp subroutine veccopy_omp(self, dst, src) + !! Copy field data from source to destination. + !! + !! Element-wise copy with OpenMP parallelisation. Both fields + !! must have the same decomposition direction and dimensions. implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: dst - class(field_t), intent(in) :: src - integer :: i, j, k + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: dst !! Destination field + class(field_t), intent(in) :: src !! Source field + integer :: i, j, k !! Loop indices if (src%dir /= dst%dir) then error stop "Called vector copy with incompatible fields" @@ -501,13 +668,19 @@ subroutine veccopy_omp(self, dst, src) end subroutine veccopy_omp subroutine vecadd_omp(self, a, x, b, y) + !! Vector addition: y = a*x + b*y (in-place AXPBY). + !! + !! Scaled in-place vector addition with OpenMP parallelisation + !! and SIMD vectorisation. Implements the BLAS AXPBY operation. + !! + !! **Formula:** `y := a*x + b*y` where a, b are scalars. implicit none - class(omp_backend_t) :: self - real(dp), intent(in) :: a - class(field_t), intent(in) :: x - real(dp), intent(in) :: b - class(field_t), intent(inout) :: y + class(omp_backend_t) :: self !! Backend instance + real(dp), intent(in) :: a !! Scalar multiplier for x + class(field_t), intent(in) :: x !! First input field + real(dp), intent(in) :: b !! Scalar multiplier for y + class(field_t), intent(inout) :: y !! Second input field (overwritten with result) integer :: i, j, k if (x%dir /= y%dir) then @@ -531,13 +704,18 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp subroutine vecmult_omp(self, y, x) + !! Element-wise multiplication: y = y * x (in-place). + !! + !! In-place element-wise multiplication with OpenMP parallelisation + !! 
and SIMD vectorisation. Often used for applying masks or + !! multiplying solution components. !! [[m_base_backend(module):vecmult(interface)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: y - class(field_t), intent(in) :: x - integer :: i, j, k + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: y !! Field to multiply and store result + class(field_t), intent(in) :: x !! Multiplier field + integer :: i, j, k !! Loop indices if (x%dir /= y%dir) then error stop "Called vector multiply with incompatible fields" @@ -560,16 +738,24 @@ subroutine vecmult_omp(self, y, x) end subroutine vecmult_omp real(dp) function scalar_product_omp(self, x, y) result(s) + !! Compute global scalar product (dot product) of two fields. + !! + !! Calculates the dot product $\sum(x_i \times y_i)$ across all grid points + !! and all MPI processes. Uses OpenMP parallelisation with reduction + !! and MPI_Allreduce for global sum. + !! + !! **Algorithm:** Local parallel reduction $\rightarrow$ MPI_Allreduce + !! **Data location:** Both fields must be at the same location (CELL/VERT). !! [[m_base_backend(module):scalar_product(interface)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: x, y - class(field_t), pointer :: x_, y_ - integer, dimension(3) :: dims - integer :: i, j, k, ii - integer :: nvec, remstart - integer :: ierr + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: x, y !! Input fields + class(field_t), pointer :: x_, y_ !! Pointers for data access + integer, dimension(3) :: dims !! Field dimensions + integer :: i, j, k, ii !! Loop indices + integer :: nvec, remstart !! Vectorisation variables + integer :: ierr !! MPI error code if ((x%data_loc == NULL_LOC) .or. 
(y%data_loc == NULL_LOC)) then error stop "You must set the data_loc before calling scalar product" @@ -623,14 +809,22 @@ real(dp) function scalar_product_omp(self, x, y) result(s) end function scalar_product_omp subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_groups) + !! Internal helper: Copy halo data into send buffers. + !! + !! Extracts 4-point halos from start and end of domain for + !! MPI communication. Used in transeq_halo_exchange to prepare + !! boundary data for neighbour processes. + !! + !! **Buffer layout:** (SZ, 4, n_groups) for cache efficiency implicit none - real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e - real(dp), dimension(:, :, :), intent(in) :: u - integer, intent(in) :: n - integer, intent(in) :: n_groups - integer :: i, j, k - integer :: n_halo = 4 + real(dp), dimension(:, :, :), intent(out) :: u_send_s !! Send buffer for start boundary + real(dp), dimension(:, :, :), intent(out) :: u_send_e !! Send buffer for end boundary + real(dp), dimension(:, :, :), intent(in) :: u !! Field data + integer, intent(in) :: n !! Domain size in communication direction + integer, intent(in) :: n_groups !! Number of pencil groups + integer :: i, j, k !! Loop indices + integer :: n_halo = 4 !! Halo width (compact scheme stencil) !$omp parallel do do k = 1, n_groups @@ -648,13 +842,25 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_groups) end subroutine copy_into_buffers subroutine field_max_mean_omp(self, max_val, mean_val, f, enforced_data_loc) + !! Compute global maximum and mean of a field. + !! + !! Calculates maximum and mean values across all grid points and + !! MPI processes. Uses data location (CELL/VERT) to determine + !! valid domain extents, excluding padding and ghost cells. + !! + !! **Algorithm:** + !! 1. Local parallel max/sum reduction with OpenMP + !! 2. MPI_Allreduce for global max/sum + !! 3. Mean = global_sum / global_count + !! + !! **Data location:** Can be enforced or read from field metadata. !! 
[[m_base_backend(module):field_max_mean(interface)]] implicit none - class(omp_backend_t) :: self - real(dp), intent(out) :: max_val, mean_val - class(field_t), intent(in) :: f - integer, optional, intent(in) :: enforced_data_loc + class(omp_backend_t) :: self !! Backend instance + real(dp), intent(out) :: max_val, mean_val !! Global maximum and mean values + class(field_t), intent(in) :: f !! Input field + integer, optional, intent(in) :: enforced_data_loc !! Override data location if provided real(dp) :: val, max_p, sum_p, max_pncl, sum_pncl integer :: data_loc, dims(3), dims_padded(3), n, n_i, n_i_pad, n_j @@ -721,33 +927,48 @@ subroutine field_max_mean_omp(self, max_val, mean_val, f, enforced_data_loc) end subroutine field_max_mean_omp subroutine field_scale_omp(self, f, a) + !! Scale field by constant: f = a * f. + !! + !! Multiplies all field values by scalar a in-place. + !! Uses Fortran array syntax for simplicity. implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to scale (modified in-place) + real(dp), intent(in) :: a !! Scaling factor f%data = a*f%data end subroutine field_scale_omp subroutine field_shift_omp(self, f, a) + !! Shift field by constant: f = f + a. + !! + !! Adds scalar a to all field values in-place. + !! Uses Fortran array syntax for simplicity. implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f - real(dp), intent(in) :: a + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to shift (modified in-place) + real(dp), intent(in) :: a !! Shift amount f%data = f%data + a end subroutine field_shift_omp subroutine field_set_face_omp(self, f, c_start, c_end, face) + !! Set boundary face values to specified constants. + !! + !! Sets values on a specified domain face (X/Y/Z start/end) + !! to given constants. 
Used for boundary condition enforcement. + !! + !! **Faces:** VERT_START_FACE, VERT_END_FACE, etc. !! [[m_base_backend(module):field_set_face(subroutine)]] implicit none - class(omp_backend_t) :: self - class(field_t), intent(inout) :: f - real(dp), intent(in) :: c_start, c_end - integer, intent(in) :: face + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(inout) :: f !! Field to modify + real(dp), intent(in) :: c_start !! Value for start side of face + real(dp), intent(in) :: c_end !! Value for end side of face + integer, intent(in) :: face !! Face identifier constant integer :: dims(3), k, j, i_mod, k_end @@ -784,11 +1005,18 @@ subroutine field_set_face_omp(self, f, c_start, c_end, face) end subroutine field_set_face_omp real(dp) function field_volume_integral_omp(self, f) result(s) - !! volume integral of a field + !! Compute volume integral of field over domain. + !! + !! Calculates $\int f \,dV$ by summing all field values (at cell centres) + !! and multiplying by grid cell volumes. Uses MPI_Allreduce for + !! global sum across all processes. + !! + !! **Formula:** $\int f \,dV = \sum(f_i \times \Delta V_i)$ where $\Delta V$ from mesh + !! **Assumption:** Field at cell centres (data_loc = CELL) implicit none - class(omp_backend_t) :: self - class(field_t), intent(in) :: f + class(omp_backend_t) :: self !! Backend instance + class(field_t), intent(in) :: f !! Field to integrate real(dp) :: sum_p, sum_pncl integer :: dims(3), stacked, i, j, k, k_i, k_j, ierr @@ -829,32 +1057,48 @@ real(dp) function field_volume_integral_omp(self, f) result(s) end function field_volume_integral_omp subroutine copy_data_to_f_omp(self, f, data) - class(omp_backend_t), intent(inout) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(in) :: data + !! Copy raw array into field structure. + !! + !! Simple wrapper for field initialisation from external data. + !! Uses Fortran array assignment for efficiency. 
+ class(omp_backend_t), intent(inout) :: self !! Backend instance + class(field_t), intent(inout) :: f !! Target field + real(dp), dimension(:, :, :), intent(in) :: data !! Source data array f%data = data end subroutine copy_data_to_f_omp subroutine copy_f_to_data_omp(self, data, f) - class(omp_backend_t), intent(inout) :: self - real(dp), dimension(:, :, :), intent(out) :: data - class(field_t), intent(in) :: f + !! Copy field structure into raw array. + !! + !! Simple wrapper for field extraction to external data. + !! Uses Fortran array assignment for efficiency. + class(omp_backend_t), intent(inout) :: self !! Backend instance + real(dp), dimension(:, :, :), intent(out) :: data !! Destination data array + class(field_t), intent(in) :: f !! Source field data = f%data end subroutine copy_f_to_data_omp subroutine init_omp_poisson_fft(self, mesh, xdirps, ydirps, zdirps, lowmem) + !! Initialise FFT-based Poisson solver for OMP backend. + !! + !! Creates and configures omp_poisson_fft_t solver for pressure + !! correction step. Uses 2DECOMP&FFT library for parallel FFTs + !! in pencil decomposition. + !! + !! **Requirement:** WITH_2DECOMPFFT must be defined at compile time. + !! **Low-memory mode:** Optional flag to reduce memory footprint. #ifdef WITH_2DECOMPFFT use m_omp_poisson_fft, only: omp_poisson_fft_t #endif implicit none - class(omp_backend_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem + class(omp_backend_t) :: self !! Backend instance + type(mesh_t), intent(in) :: mesh !! Mesh with grid spacing + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Spectral operators for each direction + logical, optional, intent(in) :: lowmem !! 
Enable low-memory mode #ifdef WITH_2DECOMPFFT allocate (omp_poisson_fft_t :: self%poisson_fft) diff --git a/src/backend/omp/common.f90 b/src/backend/omp/common.f90 index 6d3df179a..b687782d8 100644 --- a/src/backend/omp/common.f90 +++ b/src/backend/omp/common.f90 @@ -1,6 +1,15 @@ module m_omp_common + !! Common constants for OpenMP backend implementation. + !! + !! Defines compile-time constants used throughout the OMP backend + !! for performance tuning and buffer sizing. + !! + !! **SZ (pencil size):** Maximum pencil dimension for data reordering + !! operations. Set to 16 for optimal cache utilisation and vectorisation + !! on typical CPU architectures. Larger values may improve performance + !! for very large problems but increase memory overhead. implicit none - integer, parameter :: SZ = 16 + integer, parameter :: SZ = 16 !! Maximum pencil size for reordering buffers end module m_omp_common diff --git a/src/backend/omp/exec_dist.f90 b/src/backend/omp/exec_dist.f90 index d1334512e..e56c3a61f 100644 --- a/src/backend/omp/exec_dist.f90 +++ b/src/backend/omp/exec_dist.f90 @@ -1,4 +1,23 @@ module m_omp_exec_dist + !! Distributed compact finite difference execution for OMP backend. + !! + !! Orchestrates parallel execution of distributed compact schemes across + !! MPI processes. Manages OpenMP threading, halo exchanges, forward/backward + !! sweeps, and boundary system solves for multi-process compact operators. + !! + !! **Key features:** + !! + !! - Forward/backward elimination with boundary coupling + !! - Non-blocking MPI communication for 2x2 boundary systems + !! - OpenMP parallelisation over pencil groups + !! - Fused kernels for transport equation efficiency + !! + !! **Distributed algorithm:** + !! + !! 1. Forward/backward sweep on local domain \(\rightarrow\) generate boundary systems + !! 2. MPI exchange boundary data between neighbours + !! 3. Solve coupled 2x2 systems at process interfaces + !! 4. 
Substitution sweep to complete solution use mpi use m_common, only: dp @@ -15,21 +34,34 @@ module m_omp_exec_dist subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & tdsops, nproc, pprev, pnext, n_groups) + !! Execute distributed compact finite difference operation. + !! + !! Applies compact scheme operator across multiple MPI processes using + !! distributed Thomas algorithm. Performs forward/backward elimination, + !! exchanges boundary systems via MPI, then completes with substitution. + !! + !! **Algorithm:** + !! 1. `der_univ_dist`: Forward/backward sweep $\rightarrow$ boundary 2x2 systems + !! 2. `sendrecv_fields`: MPI exchange boundary data with neighbours + !! 3. `der_univ_subs`: Solve boundaries $\rightarrow$ back-substitution + !! + !! **Parallelisation:** OpenMP over pencil groups, MPI across processes implicit none ! du = d(u) - real(dp), dimension(:, :, :), intent(out) :: du - real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e + real(dp), dimension(:, :, :), intent(out) :: du !! Derivative output + real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Field and halos ! The ones below are intent(out) just so that we can write data in them, ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care about the data they pass back real(dp), dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e !! Boundary system buffers (scratch) - type(tdsops_t), intent(in) :: tdsops - integer, intent(in) :: nproc, pprev, pnext - integer, intent(in) :: n_groups + type(tdsops_t), intent(in) :: tdsops !! Compact scheme operator + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: pprev, pnext !! Previous/next neighbour ranks + integer, intent(in) :: n_groups !! 
Number of pencil groups integer :: n_data integer :: k @@ -71,31 +103,49 @@ subroutine exec_dist_transeq_compact( & u, u_recv_s, u_recv_e, & v, v_recv_s, v_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, n_groups) + !! Execute distributed transport equation RHS calculation. + !! + !! Computes three compact derivative operations required for transport + !! equation in skew-symmetric form, then fuses final RHS assembly. + !! All three derivatives (du, d(u*v), d2u) computed in parallel with + !! single halo exchange pass. + !! + !! **Derivatives computed:** + !! - `du`: First derivative of u + !! - `dud`: First derivative of u*v (product computed locally with halos) + !! - `d2u`: Second derivative of u (viscous term) + !! + !! **Fused assembly:** Final RHS combines all three derivatives with + !! viscosity scaling in single kernel (der_univ_fused_subs). + !! + !! **Optimisation:** Product u*v computed on-the-fly to avoid storing + !! extra field. Reduces memory footprint. implicit none !> The result array, it is also used as temporary storage - real(dp), dimension(:, :, :), intent(out) :: rhs_du + real(dp), dimension(:, :, :), intent(out) :: rhs_du !! Transport equation RHS output !> Temporary storage arrays - real(dp), dimension(:, :, :), intent(out) :: dud, d2u + real(dp), dimension(:, :, :), intent(out) :: dud, d2u !! Product derivative and second derivative ! The ones below are intent(out) just so that we can write data in them, ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care about the data they pass back real(dp), dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e !! Boundary buffers for du (scratch) real(dp), dimension(:, :, :), intent(out) :: & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e !! 
Boundary buffers for dud (scratch) real(dp), dimension(:, :, :), intent(out) :: & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e !! Boundary buffers for d2u (scratch) - real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e + real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e !! Velocity component and halos + real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e !! Convecting velocity and halos - type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u - real(dp), intent(in) :: nu - integer, intent(in) :: nproc, pprev, pnext - integer, intent(in) :: n_groups + type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u !! Operators for each derivative + real(dp), intent(in) :: nu !! Kinematic viscosity + integer, intent(in) :: nproc !! Number of processes in direction + integer, intent(in) :: pprev, pnext !! Previous/next neighbour ranks + integer, intent(in) :: n_groups !! Number of pencil groups real(dp), dimension(:, :), allocatable :: ud, ud_recv_s, ud_recv_e diff --git a/src/backend/omp/exec_thom.f90 b/src/backend/omp/exec_thom.f90 index b1f0c6028..575da1f8b 100644 --- a/src/backend/omp/exec_thom.f90 +++ b/src/backend/omp/exec_thom.f90 @@ -1,4 +1,19 @@ module m_exec_thom + !! Local Thomas algorithm execution for OMP backend. + !! + !! Provides parallel execution of compact finite difference schemes using + !! standard Thomas algorithm (tridiagonal solver). Used when domain is not + !! decomposed in the derivative direction (all data local to process). + !! + !! **Two variants:** + !! + !! - **Non-periodic:** Standard Thomas with arbitrary boundary conditions + !! - **Periodic:** Modified Thomas for cyclic tridiagonal systems + !! + !! **Parallelisation:** OpenMP over pencil groups (no MPI needed) + !! + !! **Contrast with distributed:** exec_dist handles multi-process case, + !! 
this module handles single-process-per-direction case. use m_common, only: dp use m_tdsops, only: tdsops_t @@ -13,11 +28,22 @@ module m_exec_thom contains subroutine exec_thom_tds_compact(du, u, tdsops, n_groups) + !! Execute local Thomas algorithm for compact scheme. + !! + !! Applies compact finite difference operator using tridiagonal solver. + !! Chooses periodic or non-periodic variant based on operator configuration. + !! All computation local to process (no MPI communication). + !! + !! **Algorithm selection:** + !! - `periodic=.true.`: Sherman-Morrison formula for cyclic system + !! - `periodic=.false.`: Standard forward/backward Thomas algorithm + !! + !! **Parallelisation:** OpenMP parallel loop over pencil groups - real(dp), dimension(:, :, :), intent(out) :: du - real(dp), dimension(:, :, :), intent(in) :: u - type(tdsops_t), intent(in) :: tdsops - integer, intent(in) :: n_groups + real(dp), dimension(:, :, :), intent(out) :: du !! Derivative output + real(dp), dimension(:, :, :), intent(in) :: u !! Input field + type(tdsops_t), intent(in) :: tdsops !! Compact scheme operator + integer, intent(in) :: n_groups !! Number of pencil groups integer :: k diff --git a/src/backend/omp/kernels/distributed.f90 b/src/backend/omp/kernels/distributed.f90 index a02f39f8d..8d0b85866 100644 --- a/src/backend/omp/kernels/distributed.f90 +++ b/src/backend/omp/kernels/distributed.f90 @@ -1,4 +1,31 @@ module m_omp_kernels_dist + !! Distributed compact finite difference kernels for OpenMP backend. + !! + !! This module implements high-performance kernels for distributed compact + !! finite difference operators. These operators require halo exchange across + !! MPI ranks to compute derivatives near subdomain boundaries. + !! + !! **Key Features:** + !! + !! - 9-point stencil compact schemes (4th-6th order accuracy) + !! - Explicit vectorisation with OpenMP SIMD directives + !! - Near and far boundary treatments for non-periodic domains + !! 
- Forward and backward elimination phases for distributed solves + !! + !! **Kernels:** + !! + !! - `der_univ_dist`: Universal derivative (1st/2nd) with halo exchange + !! - `interpl_dist`: Interpolation from cell to vertices or vice versa + !! + !! **Distributed Algorithm:** + !! Compact schemes couple neighbouring points via implicit systems. + !! In distributed memory: + !! + !! 1. Near-boundary points use special coefficients incorporating halo data + !! 2. Interior points use standard bulk coefficients + !! 3. Modified Thomas algorithm handles cross-process dependencies + !! + !! **Performance:** Explicitly vectorized inner loops for SIMD execution. use omp_lib use m_common, only: dp @@ -12,15 +39,24 @@ subroutine der_univ_dist( & du, send_u_s, send_u_e, u, u_s, u_e, & n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, ffr, fbc, faf & ) + !! Compute distributed compact derivative (1st or 2nd order). + !! + !! Evaluates derivative using compact finite difference scheme across + !! distributed domain. Handles boundary points with halo data and applies + !! appropriate scaling factors. + !! + !! **Stencil:** 9-point compact scheme requiring 4-point halo on each side. + !! Near boundaries (first/last 4 points): use boundary-specific coefficients. + !! Interior: use uniform bulk coefficients for efficiency. implicit none ! Arguments - real(dp), intent(out), dimension(:, :) :: du, send_u_s, send_u_e - real(dp), intent(in), dimension(:, :) :: u, u_s, u_e - integer, intent(in) :: n_tds, n_rhs - real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e ! start/end - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in), dimension(:) :: ffr, fbc, faf + real(dp), intent(out), dimension(:, :) :: du, send_u_s, send_u_e !! Output derivative and send buffers + real(dp), intent(in), dimension(:, :) :: u, u_s, u_e !! Input field and halo data (start/end) + integer, intent(in) :: n_tds, n_rhs !! 
System sizes + real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Boundary coefficients + real(dp), intent(in), dimension(:) :: coeffs !! Bulk stencil coefficients + real(dp), intent(in), dimension(:) :: ffr, fbc, faf !! Scaling factors ! Local variables integer :: i, j diff --git a/src/backend/omp/kernels/spectral_processing.f90 b/src/backend/omp/kernels/spectral_processing.f90 index 75d3bbd1a..fbc188928 100644 --- a/src/backend/omp/kernels/spectral_processing.f90 +++ b/src/backend/omp/kernels/spectral_processing.f90 @@ -1,4 +1,20 @@ module m_omp_spectral + !! Spectral space processing for FFT-based Poisson solver. + !! + !! Provides kernels for solving Poisson equation in Fourier space with + !! spectral equivalence transformations. Handles different boundary + !! condition combinations: fully periodic (000) and non-periodic in Y (010). + !! + !! **Spectral equivalence:** Modified wavenumbers for finite-difference + !! grid (Lele 1992). Ensures spectral solver matches compact FD schemes. + !! + !! **Reference:** JCP 228 (2009), 5989-6015, Section 4 + !! + !! **Processing steps:** + !! + !! 1. Forward spectral equivalence transform (physical \(\rightarrow\) modified wavenumbers) + !! 2. Solve: \(\hat{\phi}_k = -\hat{f}_k / k^2\) + !! 3. Backward spectral equivalence transform (modified wavenumbers \(\rightarrow\) physical) use m_common, only: dp implicit none @@ -8,22 +24,34 @@ subroutine process_spectral_000( & div_u, waves, nx_spec, ny_spec, nz_spec, x_sp_st, y_sp_st, z_sp_st, & nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-process div U* in spectral space for all periodic BCs. + !! Solve Poisson in spectral space for (0,0,0) boundary conditions. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! Processes fully periodic case. Applies spectral equivalence transforms + !! in all three directions, divides by squared wavenumber, then applies + !! inverse transforms. + !! + !! **Algorithm:** + !! 1. Normalise by grid size (FFT convention) + !! 2. 
Forward spectral equivalence: physical $\rightarrow$ modified waves (Z, Y, X order) + !! 3. Solve: $\phi_k = -f_k / k^2$ (handle zero mode specially) + !! 4. Backward spectral equivalence: modified waves $\rightarrow$ physical + !! + !! **Special case:** Zero wavenumber (k=0) set to zero to remove constant mode. + !! + !! **Ref.** JCP 228 (2009), 5989–6015, Sec 4 implicit none !> Divergence of velocity in spectral space - complex(dp), intent(inout), dimension(:, :, :) :: div_u + complex(dp), intent(inout), dimension(:, :, :) :: div_u !! In: RHS, Out: Solution !> Spectral equivalence constants - complex(dp), intent(in), dimension(:, :, :) :: waves - real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), intent(in), dimension(:, :, :) :: waves !! Modified wavenumbers squared + real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Spectral equivalence coefficients !> Grid size in spectral space - integer, intent(in) :: nx_spec, ny_spec, nz_spec + integer, intent(in) :: nx_spec, ny_spec, nz_spec !! Local spectral dimensions !> Offsets in the permuted pencils in spectral space - integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st + integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st !! Global offsets !> Global cell size - integer, intent(in) :: nx, ny, nz + integer, intent(in) :: nx, ny, nz !! Global grid dimensions integer :: i, j, k, ix, iy, iz real(dp) :: tmp_r, tmp_c, div_r, div_c @@ -109,22 +137,37 @@ subroutine process_spectral_010( & div_u, waves, nx_spec, ny_spec, nz_spec, x_sp_st, y_sp_st, z_sp_st, & nx, ny, nz, ax, bx, ay, by, az, bz & ) - !! Post-process div U* in spectral space, for non-periodic BC in y-dir. + !! Solve Poisson in spectral space for (0,1,0) boundary conditions. + !! + !! Processes non-periodic in Y, periodic in X and Z. Uses sine series + !! in Y-direction (symmetry/antisymmetry transform) combined with + !! Fourier in X and Z. + !! + !! **Algorithm:** + !! 1. Normalise by grid size + !! 2. 
Forward spectral equivalence in Z and X (not Y, handled separately) + !! 3. Apply Y symmetry transform (combine left/right halves) + !! 4. Solve: $\phi_k = -f_k / k^2$ + !! 5. Inverse Y symmetry transform + !! 6. Backward spectral equivalence in X and Z + !! + !! **Y-direction:** Sine series requires special symmetric processing + !! to maintain real-valued solution with non-periodic BCs. !! - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + !! **Ref.** JCP 228 (2009), 5989–6015, Sec 4 implicit none !> Divergence of velocity in spectral space - complex(dp), intent(inout), dimension(:, :, :) :: div_u + complex(dp), intent(inout), dimension(:, :, :) :: div_u !! In: RHS, Out: Solution !> Spectral equivalence constants - complex(dp), intent(in), dimension(:, :, :) :: waves - real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz + complex(dp), intent(in), dimension(:, :, :) :: waves !! Modified wavenumbers squared + real(dp), intent(in), dimension(:) :: ax, bx, ay, by, az, bz !! Spectral equivalence coefficients !> Grid size in spectral space - integer, intent(in) :: nx_spec, ny_spec, nz_spec + integer, intent(in) :: nx_spec, ny_spec, nz_spec !! Local spectral dimensions !> Offsets in the permuted pencils in spectral space - integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st + integer, intent(in) :: x_sp_st, y_sp_st, z_sp_st !! Global offsets !> Global cell size - integer, intent(in) :: nx, ny, nz + integer, intent(in) :: nx, ny, nz !! Global grid dimensions integer :: i, j, k, ix, iy, iz, iy_r real(dp) :: tmp_r, tmp_c, div_r, div_c, l_r, l_c, r_r, r_c diff --git a/src/backend/omp/kernels/thomas.f90 b/src/backend/omp/kernels/thomas.f90 index 88ec8d771..bc0d5c69f 100644 --- a/src/backend/omp/kernels/thomas.f90 +++ b/src/backend/omp/kernels/thomas.f90 @@ -1,4 +1,18 @@ module m_omp_kernels_thom + !! Thomas algorithm kernels for local compact finite differences. + !! + !! Implements tridiagonal solvers for compact schemes when domain is + !! 
not decomposed in derivative direction. Provides both standard + !! (non-periodic) and cyclic (periodic) Thomas algorithm variants. + !! + !! **Thomas algorithm:** Standard forward elimination and backward + !! substitution for tridiagonal systems, O(n) complexity. + !! + !! **Periodic Thomas:** Sherman-Morrison formula to handle cyclic + !! tridiagonal systems arising from periodic boundary conditions. + !! + !! **Vectorisation:** Explicit SIMD directives for SZ-wide vectors, + !! processing multiple pencils simultaneously. use m_common, only: dp use m_omp_common, only: SZ @@ -8,14 +22,32 @@ module m_omp_kernels_thom subroutine der_univ_thom(du, u, n_tds, n_rhs, coeffs_s, coeffs_e, coeffs, & thom_f, thom_s, thom_w, strch) + !! Thomas algorithm for non-periodic compact finite differences. + !! + !! Solves tridiagonal system arising from compact scheme with arbitrary + !! boundary conditions. Uses standard forward elimination followed by + !! backward substitution. + !! + !! **Algorithm:** + !! 1. Forward pass: Eliminate lower diagonal, form modified RHS + !! 2. Backward pass: Back-substitution with grid stretching correction + !! + !! **Boundary treatment:** Special stencils at start (j=1..4) and + !! end (j=n-3..n) to handle non-periodic boundaries. + !! + !! **Stretching:** Applied during backward pass via `strch` array. implicit none - real(dp), dimension(:, :), intent(out) :: du - real(dp), dimension(:, :), intent(in) :: u - integer, intent(in) :: n_tds, n_rhs - real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e ! start/end - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in), dimension(:) :: thom_f, thom_s, thom_w, strch + real(dp), dimension(:, :), intent(out) :: du !! Solution (derivative) + real(dp), dimension(:, :), intent(in) :: u !! Input field + integer, intent(in) :: n_tds !! Number of unknowns (tridiagonal size) + integer, intent(in) :: n_rhs !! 
Number of RHS points (stencil size) + real(dp), intent(in), dimension(:, :) :: coeffs_s, coeffs_e !! Start/end stencil coefficients + real(dp), intent(in), dimension(:) :: coeffs !! Interior stencil coefficients (9-point) + real(dp), intent(in), dimension(:) :: thom_f !! Forward elimination factors + real(dp), intent(in), dimension(:) :: thom_s !! Subdiagonal elimination factors + real(dp), intent(in), dimension(:) :: thom_w !! Diagonal weights for back-substitution + real(dp), intent(in), dimension(:) :: strch !! Grid stretching correction factors integer :: i, j real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4 @@ -132,14 +164,34 @@ end subroutine der_univ_thom subroutine der_univ_thom_per( & du, u, n, coeffs, alpha, thom_f, thom_s, thom_w, thom_p, strch & ) + !! Periodic Thomas algorithm for cyclic tridiagonal systems. + !! + !! Solves compact scheme with periodic boundary conditions using + !! Sherman-Morrison formula. Handles wraparound coupling between + !! first and last grid points. + !! + !! **Algorithm:** + !! 1. Forward pass: Standard elimination with periodic indexing + !! 2. Backward pass: Standard back-substitution + !! 3. Periodic correction: Sherman-Morrison adjustment for cyclic coupling + !! + !! **Periodic indexing:** Uses modulo arithmetic for stencil access + !! to handle wraparound at domain boundaries. + !! + !! **Sherman-Morrison:** Adds rank-1 correction to handle tridiagonal + !! system modified by periodic coupling terms. implicit none - real(dp), dimension(:, :), intent(out) :: du - real(dp), dimension(:, :), intent(in) :: u - integer, intent(in) :: n - real(dp), intent(in), dimension(:) :: coeffs - real(dp), intent(in) :: alpha - real(dp), intent(in), dimension(:) :: thom_f, thom_s, thom_w, thom_p, strch + real(dp), dimension(:, :), intent(out) :: du !! Solution (derivative) + real(dp), dimension(:, :), intent(in) :: u !! Input field + integer, intent(in) :: n !! 
Number of grid points + real(dp), intent(in), dimension(:) :: coeffs !! Stencil coefficients (9-point) + real(dp), intent(in) :: alpha !! Tridiagonal sub/super-diagonal value + real(dp), intent(in), dimension(:) :: thom_f !! Forward elimination factors + real(dp), intent(in), dimension(:) :: thom_s !! Subdiagonal elimination factors + real(dp), intent(in), dimension(:) :: thom_w !! Diagonal weights + real(dp), intent(in), dimension(:) :: thom_p !! Periodic correction vector + real(dp), intent(in), dimension(:) :: strch !! Grid stretching correction factors integer :: i, j integer :: jm4, jm3, jm2, jm1, jp1, jp2, jp3, jp4 diff --git a/src/backend/omp/poisson_fft.f90 b/src/backend/omp/poisson_fft.f90 index 72a94521c..db5241803 100644 --- a/src/backend/omp/poisson_fft.f90 +++ b/src/backend/omp/poisson_fft.f90 @@ -1,4 +1,24 @@ module m_omp_poisson_fft + !! FFT-based Poisson solver for OMP backend. + !! + !! Solves \(\nabla^2 \phi = f\) using spectral methods with 2DECOMP&FFT library. + !! Transforms to Fourier space, solves diagonal system in spectral space, + !! then transforms back to physical space. + !! + !! **Algorithm:** + !! + !! 1. Forward FFT: physical \(\rightarrow\) spectral space + !! 2. Spectral solve: \(\hat{\phi}_k = \hat{f}_k / k^2\) (with modifications for boundary conditions) + !! 3. Backward FFT: spectral \(\rightarrow\) physical space + !! + !! **Boundary conditions:** + !! + !! - (0,0,0): Periodic in all directions + !! - (0,1,0): Non-periodic in Y, periodic in X/Z (uses symmetry transform) + !! + !! **Parallelisation:** MPI via 2DECOMP&FFT pencil decomposition + !! + !! **Limitation:** Does not support Y-direction grid stretching use decomp_2d_constants, only: PHYSICAL_IN_X use decomp_2d_fft, only: decomp_2d_fft_init, decomp_2d_fft_3d, & @@ -16,14 +36,14 @@ module m_omp_poisson_fft type, extends(poisson_fft_t) :: omp_poisson_fft_t !! 
FFT based Poisson solver - complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z + complex(dp), allocatable, dimension(:, :, :) :: c_x !! Spectral space buffer (X-pencil oriented) contains - procedure :: fft_forward => fft_forward_omp - procedure :: fft_backward => fft_backward_omp - procedure :: fft_postprocess_000 => fft_postprocess_000_omp - procedure :: fft_postprocess_010 => fft_postprocess_010_omp - procedure :: enforce_periodicity_y => enforce_periodicity_y_omp - procedure :: undo_periodicity_y => undo_periodicity_y_omp + procedure :: fft_forward => fft_forward_omp !! Transform to spectral space + procedure :: fft_backward => fft_backward_omp !! Transform to physical space + procedure :: fft_postprocess_000 => fft_postprocess_000_omp !! Spectral solve for (0,0,0) BCs + procedure :: fft_postprocess_010 => fft_postprocess_010_omp !! Spectral solve for (0,1,0) BCs + procedure :: enforce_periodicity_y => enforce_periodicity_y_omp !! Symmetry transform for Y non-periodic + procedure :: undo_periodicity_y => undo_periodicity_y_omp !! Inverse symmetry transform end type omp_poisson_fft_t interface omp_poisson_fft_t @@ -35,15 +55,22 @@ module m_omp_poisson_fft contains function init(mesh, xdirps, ydirps, zdirps, lowmem) result(poisson_fft) + !! Initialise FFT-based Poisson solver. + !! + !! Sets up 2DECOMP&FFT library and allocates spectral space buffers. + !! Computes wavenumbers and coefficients for spectral solve. + !! + !! **Error checking:** Fails if Y-direction grid stretching requested + !! (not supported by FFT method). implicit none - type(mesh_t), intent(in) :: mesh - class(dirps_t), intent(in) :: xdirps, ydirps, zdirps - logical, optional, intent(in) :: lowmem - integer, dimension(3) :: istart, iend, isize - integer :: dims(3) + type(mesh_t), intent(in) :: mesh !! Mesh with grid spacing + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Spectral operators + logical, optional, intent(in) :: lowmem !! 
Low-memory flag (ignored for OMP) + integer, dimension(3) :: istart, iend, isize !! Local spectral dimensions + integer :: dims(3) !! Global grid dimensions - type(omp_poisson_fft_t) :: poisson_fft + type(omp_poisson_fft_t) :: poisson_fft !! Initialised solver if (mesh%par%is_root()) then print *, "Initialising 2decomp&fft" @@ -75,29 +102,43 @@ function init(mesh, xdirps, ydirps, zdirps, lowmem) result(poisson_fft) end function init subroutine fft_forward_omp(self, f_in) + !! Forward FFT: physical space to spectral space. + !! + !! Transforms input field from physical (real) to spectral (complex) + !! representation using 2DECOMP&FFT. Result stored in `self%c_x`. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(in) :: f_in !! Physical space field (RHS) call decomp_2d_fft_3d(f_in%data, self%c_x) end subroutine fft_forward_omp subroutine fft_backward_omp(self, f_out) + !! Backward FFT: spectral space to physical space. + !! + !! Transforms spectral solution back to physical (real) space using + !! inverse FFT. Reads from `self%c_x`, writes to output field. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Physical space solution call decomp_2d_fft_3d(self%c_x, f_out%data) end subroutine fft_backward_omp subroutine fft_postprocess_000_omp(self) + !! Spectral solve for (0,0,0) boundary conditions. + !! + !! Solves Poisson equation in spectral space for fully periodic domain. + !! Divides each Fourier mode by its corresponding $k^2$ eigenvalue. + !! + !! **Formula:** $\hat{\phi}_k = \hat{f}_k / (k_x^2 + k_y^2 + k_z^2)$ implicit none - class(omp_poisson_fft_t) :: self + class(omp_poisson_fft_t) :: self !! 
Solver instance call process_spectral_000( & self%c_x, self%waves, self%nx_spec, self%ny_spec, self%nz_spec, & @@ -109,9 +150,16 @@ subroutine fft_postprocess_000_omp(self) end subroutine fft_postprocess_000_omp subroutine fft_postprocess_010_omp(self) + !! Spectral solve for (0,1,0) boundary conditions. + !! + !! Solves Poisson equation with non-periodic BCs in Y-direction, + !! periodic in X and Z. Uses modified wavenumbers accounting for + !! symmetry transformation (sine series in Y). + !! + !! **Formula:** Modified $k_y$ for sine series representation implicit none - class(omp_poisson_fft_t) :: self + class(omp_poisson_fft_t) :: self !! Solver instance call process_spectral_010( & self%c_x, self%waves, self%nx_spec, self%ny_spec, self%nz_spec, & @@ -123,11 +171,18 @@ subroutine fft_postprocess_010_omp(self) end subroutine fft_postprocess_010_omp subroutine enforce_periodicity_y_omp(self, f_out, f_in) + !! Apply symmetry transform for Y non-periodic boundary conditions. + !! + !! Converts physical field to symmetric/antisymmetric representation + !! suitable for sine series FFT. Used before forward FFT when Y-direction + !! has non-periodic BCs. + !! + !! **Transformation:** Maps domain to symmetric extension for sine basis. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Transformed field + class(field_t), intent(in) :: f_in !! Original field integer :: i, j, k @@ -149,11 +204,17 @@ subroutine enforce_periodicity_y_omp(self, f_out, f_in) end subroutine enforce_periodicity_y_omp subroutine undo_periodicity_y_omp(self, f_out, f_in) + !! Inverse symmetry transform for Y non-periodic boundary conditions. + !! + !! Converts symmetric/antisymmetric representation back to physical + !! field. Used after backward FFT when Y-direction has non-periodic BCs. + !! + !! 
**Transformation:** Extracts physical domain from symmetric extension. implicit none - class(omp_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(omp_poisson_fft_t) :: self !! Solver instance + class(field_t), intent(inout) :: f_out !! Physical field + class(field_t), intent(in) :: f_in !! Transformed field integer :: i, j, k diff --git a/src/backend/omp/sendrecv.f90 b/src/backend/omp/sendrecv.f90 index 3aba6e82c..39d8c6caa 100644 --- a/src/backend/omp/sendrecv.f90 +++ b/src/backend/omp/sendrecv.f90 @@ -1,4 +1,14 @@ module m_omp_sendrecv + !! MPI halo exchange utilities for OMP backend. + !! + !! Provides non-blocking point-to-point communication for exchanging + !! boundary halos between neighbouring MPI processes. Used in distributed + !! compact finite difference schemes that require off-process data. + !! + !! **Communication pattern:** Bidirectional simultaneous send/recv with + !! neighbours in one decomposition direction. + !! + !! **Single-process optimisation:** Direct copy when no MPI communication needed. use mpi use m_common, only: dp, MPI_X3D2_DP @@ -9,11 +19,31 @@ module m_omp_sendrecv subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & n_data, nproc, prev, next) + !! Exchange boundary halos with neighbouring MPI processes. + !! + !! Performs bidirectional halo exchange using non-blocking MPI + !! communication (MPI_Isend/MPI_Irecv). Sends data to both neighbours + !! simultaneously and receives from both, then waits for all operations + !! to complete. + !! + !! **Special case:** Single-process (nproc=1) uses direct memory copy + !! for periodic boundaries without MPI overhead. + !! + !! **Communication pattern:** + !! - Send start halo to previous process + !! - Receive end halo from next process + !! - Send end halo to next process + !! - Receive start halo from previous process + !! + !! **Non-blocking:** All 4 operations initiated before waiting for completion. 
implicit none - real(dp), dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e - real(dp), dimension(:, :, :), intent(in) :: f_send_s, f_send_e - integer, intent(in) :: n_data, nproc, prev, next + real(dp), dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e !! Receive buffers (start/end halos) + real(dp), dimension(:, :, :), intent(in) :: f_send_s, f_send_e !! Send buffers (start/end halos) + integer, intent(in) :: n_data !! Number of data elements to transfer + integer, intent(in) :: nproc !! Number of processes in this direction + integer, intent(in) :: prev !! Rank of previous neighbour + integer, intent(in) :: next !! Rank of next neighbour integer :: req(4), err(4), ierr, tag = 1234 diff --git a/src/case/base_case.f90 b/src/case/base_case.f90 index e7a673d81..8efadcaf8 100644 --- a/src/case/base_case.f90 +++ b/src/case/base_case.f90 @@ -1,8 +1,39 @@ module m_base_case - !! Provides the base case for running a simulation. New cases are - !! implemented by extending this to specify the initial and boundary - !! conditions, forcing terms and case-specific postprocessing and analysis. - + !! Base class for flow simulation cases. + !! + !! This abstract base class provides the framework for implementing specific + !! flow cases (channel, TGV, generic, etc.). New cases extend this class and + !! override deferred procedures to specify: + !! + !! - **Initial conditions**: Set velocity and other field initial states + !! - **Boundary conditions**: Apply physical boundary conditions each timestep + !! - **Forcing terms**: Add body forces or model-specific source terms + !! - **Pre-correction**: Modify velocity before pressure correction (e.g., IBM) + !! - **Postprocessing**: Compute statistics, output diagnostics, etc. + !! + !! **Simulation Workflow:** + !! + !! The `run()` method orchestrates the time integration loop: + !! + !! 1. Apply boundary conditions + !! 2. Advance solution one timestep via solver%step() + !! 3. 
Write checkpoints/snapshots (via checkpoint_mgr) + !! 4. Perform case-specific postprocessing + !! 5. Repeat until final time reached + !! + !! **Time Integration:** + !! + !! Each timestep involves multiple stages (for RK) or steps (for AB): + !! + !! - Transport equation (transeq) computes velocity derivatives + !! - Forcing terms applied after transeq + !! - Pre-correction modifies velocity (e.g., for immersed boundaries) + !! - Pressure correction enforces incompressibility + !! + !! **Restart Capability:** + !! + !! The checkpoint manager handles restart from saved states automatically + !! if a restart file is detected. use m_allocator, only: allocator_t use m_base_backend, only: base_backend_t use m_common, only: dp, DIR_X, DIR_Z, DIR_C, VERT @@ -15,82 +46,109 @@ module m_base_case implicit none type, abstract :: base_case_t - class(solver_t), allocatable :: solver - type(io_manager_t) :: checkpoint_mgr + !! Abstract base type for flow cases. + !! + !! Derived types must implement all deferred procedures to define + !! case-specific behaviour. + class(solver_t), allocatable :: solver !! Incompressible Navier-Stokes solver + type(io_manager_t) :: checkpoint_mgr !! Checkpoint and snapshot manager contains - procedure(boundary_conditions), deferred :: boundary_conditions - procedure(initial_conditions), deferred :: initial_conditions - procedure(forcings), deferred :: forcings - procedure(pre_correction), deferred :: pre_correction - procedure(postprocess), deferred :: postprocess - procedure :: case_init - procedure :: case_finalise - procedure :: set_init - procedure :: run - procedure :: print_enstrophy - procedure :: print_div_max_mean + procedure(boundary_conditions), deferred :: boundary_conditions !! Apply BCs (deferred) + procedure(initial_conditions), deferred :: initial_conditions !! Set ICs (deferred) + procedure(forcings), deferred :: forcings !! Add forcing terms (deferred) + procedure(pre_correction), deferred :: pre_correction !! 
Pre-pressure correction (deferred) + procedure(postprocess), deferred :: postprocess !! Case-specific analysis (deferred) + procedure :: case_init !! Initialise case and solver + procedure :: case_finalise !! Clean up and finalise + procedure :: set_init !! Set initial conditions and prepare for run + procedure :: run !! Main time integration loop + procedure :: print_enstrophy !! Print enstrophy diagnostic + procedure :: print_div_max_mean !! Print divergence diagnostics end type base_case_t abstract interface subroutine boundary_conditions(self) - !! Applies case-specific boundary coinditions + !! Abstract interface for applying boundary conditions. + !! + !! Called each timestep before computing derivatives. Implementations + !! should set velocity and scalar values at domain boundaries according + !! to the physical boundary conditions (Dirichlet, Neumann, periodic, etc.). import :: base_case_t implicit none - class(base_case_t) :: self + class(base_case_t) :: self !! Case instance end subroutine boundary_conditions subroutine initial_conditions(self) - !! Sets case-specific initial conditions + !! Abstract interface for setting initial conditions. + !! + !! Called once during initialisation to set the initial state of velocity + !! and scalar fields. Implementations should populate \(u, v, w\) (and species + !! if present) with case-appropriate initial values. import :: base_case_t implicit none - class(base_case_t) :: self + class(base_case_t) :: self !! Case instance end subroutine initial_conditions subroutine forcings(self, du, dv, dw, iter) - !! Applies case-specific or model realated forcings after transeq + !! Abstract interface for applying forcing terms. + !! + !! Called after transport equation (transeq) but before pressure correction. + !! Add body forces, source terms, or model-specific forcings (e.g., mean + !! pressure gradient for channel flow, immersed boundary forces, etc.). 
import :: base_case_t import :: field_t implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: du, dv, dw - integer, intent(in) :: iter + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: du, dv, dw !! Velocity derivatives to modify + integer, intent(in) :: iter !! Current iteration number end subroutine forcings subroutine pre_correction(self, u, v, w) - !! Applies case-specific pre-correction to the velocity fields before - !! pressure correction + !! Abstract interface for pre-pressure correction modifications. + !! + !! Called after forcings but before pressure correction. Used for operations + !! that need to modify the velocity field before enforcing incompressibility, + !! such as immersed boundary method (IBM) velocity corrections. import :: base_case_t import :: field_t implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: u, v, w + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: u, v, w !! Velocity fields to modify end subroutine pre_correction subroutine postprocess(self, iter, t) - !! Triggers case-specific postprocessings at user specified intervals + !! Abstract interface for case-specific postprocessing. + !! + !! Called at user-specified intervals during time integration. Implement + !! this to compute statistics, output diagnostics, write custom data files, + !! or perform any case-specific analysis. import :: base_case_t import :: dp implicit none - class(base_case_t) :: self - integer, intent(in) :: iter - real(dp), intent(in) :: t + class(base_case_t) :: self !! Case instance + integer, intent(in) :: iter !! Current iteration number + real(dp), intent(in) :: t !! Current simulation time end subroutine postprocess end interface contains subroutine case_init(self, backend, mesh, host_allocator) + !! Initialise case with solver and checkpoint manager. + !! + !! Creates the solver instance and initialises the checkpoint/snapshot + !! manager. 
If a restart file is detected, loads the saved state. implicit none - class(base_case_t) :: self - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator + class(base_case_t) :: self !! Case instance + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator self%solver = init(backend, mesh, host_allocator) @@ -104,7 +162,14 @@ subroutine case_init(self, backend, mesh, host_allocator) end subroutine case_init subroutine case_finalise(self) - class(base_case_t) :: self + !! Finalise the case and clean up resources. + !! + !! Performs cleanup operations at the end of a simulation run: + !! - Finalises the checkpoint manager (closes files, flushes buffers) + !! - Prints completion message on root process + !! + !! This should be called after the main time integration loop completes. + class(base_case_t) :: self !! Case instance to finalise if (self%solver%mesh%par%is_root()) print *, 'run end' @@ -112,18 +177,35 @@ subroutine case_finalise(self) end subroutine case_finalise subroutine set_init(self, field, field_func) + !! Initialise a field using an analytical function. + !! + !! This utility subroutine sets a field's values by evaluating a + !! user-provided pure function at each grid point. The function + !! is evaluated on the host, then transferred to the backend device + !! (if using GPU backend). + !! + !! **Usage Example:** + !! ```fortran + !! call self%set_init(self%solver%u, u_initial) + !! ``` + !! where `u_initial` is a pure function taking coordinates [x,y,z] + !! and returning the initial velocity value. + !! + !! This is commonly used in `initial_conditions()` implementations + !! to set velocity or scalar fields from analytical expressions. 
implicit none - class(base_case_t) :: self - class(field_t), intent(inout) :: field + class(base_case_t) :: self !! Case instance + class(field_t), intent(inout) :: field !! Field to initialise interface pure function field_func(coords) result(r) + !! Pure function defining field values at each point. import dp implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Spatial coordinates [x, y, z] + real(dp) :: r !! Field value at this location end function field_func end interface @@ -151,11 +233,24 @@ end function field_func end subroutine set_init subroutine print_enstrophy(self, u, v, w) - !! Reports the enstrophy + !! Compute and print the volume-averaged enstrophy. + !! + !! Enstrophy is a measure of the rotational kinetic energy density: + !! \[ E = \frac{1}{2V} \int_V |\omega|^2 \, dV = \frac{1}{2V} \int_V |\nabla \times \mathbf{u}|^2 \, dV \] + !! + !! where \( \omega = \nabla \times \mathbf{u} \) is the vorticity. + !! + !! This diagnostic is useful for monitoring: + !! - Flow transition to turbulence (enstrophy increases) + !! - Energy cascade to small scales + !! - Numerical stability (sudden spikes indicate problems) + !! - Comparison with theoretical predictions (e.g., TGV decay) + !! + !! Only the root MPI rank prints the result. implicit none - class(base_case_t), intent(in) :: self - class(field_t), intent(in) :: u, v, w + class(base_case_t), intent(in) :: self !! Case instance + class(field_t), intent(in) :: u, v, w !! Velocity components class(field_t), pointer :: du, dv, dw real(dp) :: enstrophy @@ -180,11 +275,31 @@ subroutine print_enstrophy(self, u, v, w) end subroutine print_enstrophy subroutine print_div_max_mean(self, u, v, w) - !! Reports the div(u) at cell centres + !! Compute and print maximum and mean divergence. + !! + !! For incompressible flow, the velocity divergence should be zero: + !! \[ \nabla \cdot \mathbf{u} = 0 \] + !! + !! This diagnostic reports: + !! + !! 
- **Maximum divergence**: Largest local violation of incompressibility + !! - **Mean divergence**: Volume-averaged divergence (should be near machine zero) + !! + !! **Purpose:** + !! + !! - Monitor quality of pressure correction (divergence should be ~ 1e-10 or smaller) + !! - Detect numerical issues (large divergence indicates solver problems) + !! - Verify proper boundary condition implementation + !! - Check convergence of iterative Poisson solvers + !! + !! Divergence is computed at cell centres from vertex velocities using + !! staggered derivatives and interpolation. + !! + !! Only the root MPI rank prints the result. implicit none - class(base_case_t), intent(in) :: self - class(field_t), intent(in) :: u, v, w + class(base_case_t), intent(in) :: self !! Case instance + class(field_t), intent(in) :: u, v, w !! Velocity components class(field_t), pointer :: div_u real(dp) :: div_u_max, div_u_mean @@ -202,11 +317,33 @@ subroutine print_div_max_mean(self, u, v, w) end subroutine print_div_max_mean subroutine run(self) - !! Runs the solver forwards in time from t=t_0 to t=T, performing - !! postprocessing/IO and reporting diagnostics. + !! Main time integration loop for the simulation. + !! + !! Advances the solution from initial time t=t_0 to final time t=T, + !! orchestrating all aspects of the simulation: + !! + !! **Each Timestep:** + !! + !! 1. Apply boundary conditions + !! 2. Compute derivatives and advance via time_integrator%step() + !! 3. Handle checkpointing and snapshot output (via checkpoint_mgr) + !! 4. Perform case-specific postprocessing + !! 5. Print diagnostics (divergence, enstrophy) + !! + !! **Time Integration Stages:** + !! + !! For multi-stage methods (RK), each timestep involves multiple stages. + !! The solver%step() method handles the stage-by-stage advancement, + !! calling transeq, forcings, pre_correction, and pressure_correction + !! at appropriate points. + !! + !! **Restart Support:** + !! + !! 
If a restart file is detected, continues from the saved iteration + !! and time rather than starting from t=0. implicit none - class(base_case_t), intent(inout) :: self + class(base_case_t), intent(inout) :: self !! Case instance type(flist_t), allocatable :: curr(:) type(flist_t), allocatable :: deriv(:) diff --git a/src/case/channel.f90 b/src/case/channel.f90 index 846dcf93e..b20222234 100644 --- a/src/case/channel.f90 +++ b/src/case/channel.f90 @@ -1,4 +1,32 @@ module m_case_channel + !! Turbulent channel flow case with optional rotation. + !! + !! This module implements a turbulent channel flow simulation between + !! two parallel walls. The flow is driven by a mean pressure gradient + !! to maintain a target bulk velocity. + !! + !! **Flow Configuration:** + !! + !! - Domain: Periodic in \(X\) and \(Z\), wall-bounded in \(Y\) + !! - Walls at \(y = 0\) and \(y = L_y\) with no-slip boundary conditions + !! - Mean pressure gradient maintains constant bulk velocity + !! - Optional rotation forcing (Coriolis-like terms) for rotating channel + !! + !! **Initial Conditions:** + !! + !! - Parabolic base profile: \( u = 1 - y^2 \) + !! - Random perturbations with configurable amplitude (noise parameter) + !! - Perturbations concentrated near centreline for faster transition + !! + !! **Boundary Conditions:** + !! + !! - No-slip walls: \( u = v = w = 0 \) at \( y = 0 \) and \( y = L_y \) + !! - Enforces mean bulk velocity via volume shift (simulates pressure gradient) + !! + !! **Forcing:** + !! + !! - Mean pressure gradient (constant in time, via bulk velocity constraint) + !! - Optional Coriolis forcing for rotating channel flows use iso_fortran_env, only: stderr => error_unit use mpi @@ -13,13 +41,14 @@ module m_case_channel implicit none type, extends(base_case_t) :: case_channel_t - type(channel_config_t) :: channel_cfg + !! Channel flow case with optional rotation forcing. + type(channel_config_t) :: channel_cfg !! 
Channel-specific configuration contains - procedure :: boundary_conditions => boundary_conditions_channel - procedure :: initial_conditions => initial_conditions_channel - procedure :: forcings => forcings_channel - procedure :: pre_correction => pre_correction_channel - procedure :: postprocess => postprocess_channel + procedure :: boundary_conditions => boundary_conditions_channel !! Apply bulk velocity constraint + procedure :: initial_conditions => initial_conditions_channel !! Set perturbed parabolic profile + procedure :: forcings => forcings_channel !! Apply rotation forcing (if enabled) + procedure :: pre_correction => pre_correction_channel !! Enforce wall boundary conditions + procedure :: postprocess => postprocess_channel !! Compute statistics end type case_channel_t interface case_channel_t @@ -29,12 +58,15 @@ module m_case_channel contains function case_channel_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise channel flow case. + !! + !! Reads channel-specific configuration and initialises the base case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_channel_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_channel_t) :: flow_case !! Initialised channel case call flow_case%channel_cfg%read(nml_file=get_argument(1)) @@ -43,9 +75,14 @@ function case_channel_init(backend, mesh, host_allocator) result(flow_case) end function case_channel_init subroutine boundary_conditions_channel(self) + !! Apply boundary conditions to enforce target bulk velocity. + !! + !! Computes the current bulk (volume-averaged) velocity and applies + !! 
a uniform shift to maintain the target value of 2/3. This simulates + !! the effect of a mean pressure gradient driving the flow. implicit none - class(case_channel_t) :: self + class(case_channel_t) :: self !! Channel case instance real(dp) :: can, ub integer :: ierr @@ -63,9 +100,18 @@ subroutine boundary_conditions_channel(self) end subroutine boundary_conditions_channel subroutine initial_conditions_channel(self) + !! Set initial velocity field with perturbed parabolic profile. + !! + !! Creates a laminar parabolic profile \( u = 1 - y^2 \) and adds random + !! perturbations scaled by the noise parameter. Perturbations are + !! amplitude-modulated with a Gaussian centred at the channel centreline + !! to concentrate disturbances where they are most effective for + !! triggering turbulent transition. + !! + !! No-slip conditions (u = v = w = 0) are enforced at walls (y=0, y=L_y). implicit none - class(case_channel_t) :: self + class(case_channel_t) :: self !! Channel case instance class(field_t), pointer :: u_init, v_init, w_init @@ -119,13 +165,32 @@ subroutine initial_conditions_channel(self) end subroutine initial_conditions_channel subroutine forcings_channel(self, du, dv, dw, iter) + !! Apply rotation forcing (Coriolis-like terms) if enabled. + !! + !! For rotating channel flows, adds Coriolis-like forcing terms that + !! couple the streamwise (u) and wall-normal (v) velocities: + !! + !! \[ \frac{du}{dt} = \ldots - \Omega v \] + !! \[ \frac{dv}{dt} = \ldots + \Omega u \] + !! + !! where \( \Omega \) is the rotation rate (omega_rot). + !! + !! **Configuration:** + !! - Activated via `channel_cfg%rotation = .true.` + !! - Rotation rate set by `channel_cfg%omega_rot` + !! - Applied only for first `n_rotate` iterations to allow spin-up + !! + !! **Physical Interpretation:** + !! Mimics effects of system rotation (e.g., rotating reference frame) + !! without explicitly implementing Coriolis force. Useful for studying + !! 
rotation effects on turbulent channel flows. implicit none - class(case_channel_t) :: self - class(field_t), intent(inout) :: du, dv, dw - integer, intent(in) :: iter + class(case_channel_t) :: self !! Channel case instance + class(field_t), intent(inout) :: du, dv, dw !! Velocity derivatives to modify + integer, intent(in) :: iter !! Current iteration number - real(dp) :: rot + real(dp) :: rot !! Rotation rate for current forcing application if (self%channel_cfg%rotation .and. iter < self%channel_cfg%n_rotate) then rot = self%channel_cfg%omega_rot @@ -136,10 +201,30 @@ subroutine forcings_channel(self, du, dv, dw, iter) end subroutine forcings_channel subroutine pre_correction_channel(self, u, v, w) + !! Enforce no-slip boundary conditions at channel walls. + !! + !! Sets all velocity components to zero at the wall boundaries (Y-faces): + !! - Lower wall: y = 0 + !! - Upper wall: y = L_y + !! + !! This implements the no-slip condition: + !! \[ u = v = w = 0 \quad \text{at walls} \] + !! + !! **Implementation:** + !! Uses `field_set_face` to directly set values on Y-direction faces + !! (boundaries perpendicular to Y-axis). This is applied after the + !! time integration step but before pressure correction, ensuring that + !! the corrected velocity field satisfies both incompressibility and + !! no-slip boundary conditions. + !! + !! **Note:** + !! This is the standard approach for wall-bounded flows. For periodic + !! or other boundary conditions, this subroutine would be modified or + !! left empty. implicit none - class(case_channel_t) :: self - class(field_t), intent(inout) :: u, v, w + class(case_channel_t) :: self !! Channel case instance + class(field_t), intent(inout) :: u, v, w !! 
Velocity components to correct call self%solver%backend%field_set_face(u, 0._dp, 0._dp, Y_FACE) call self%solver%backend%field_set_face(v, 0._dp, 0._dp, Y_FACE) diff --git a/src/case/generic.f90 b/src/case/generic.f90 index 193767999..a1659361a 100644 --- a/src/case/generic.f90 +++ b/src/case/generic.f90 @@ -1,6 +1,26 @@ module m_case_generic - !! An example case set up to run and sustain a freestream flow. - !! This is a good place to start for adding a new flow case. + !! Generic freestream flow case for general-purpose simulations. + !! + !! This module provides a minimal template for setting up custom flow + !! cases. It implements a simple uniform freestream flow (\(u=1, v=0, w=0\)) + !! with no forcing or boundary corrections. + !! + !! **Use Cases:** + !! - Starting point for implementing new flow cases + !! - Testing solver functionality with simple initial conditions + !! - Freestream simulations with immersed boundaries (add IBM via forcings) + !! - Custom flow setups requiring minimal default behaviour + !! + !! **Default Configuration:** + !! - Initial condition: Uniform flow \(u=1, v=0, w=0\) + !! - No boundary condition corrections + !! - No forcing terms + !! - No pre-correction + !! - Minimal postprocessing + !! + !! **Customisation:** + !! Users can extend this case or modify the procedures directly to implement + !! specific flow physics, boundary conditions, or forcing terms. use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -14,12 +34,13 @@ module m_case_generic implicit none type, extends(base_case_t) :: case_generic_t + !! Generic case with minimal default behaviour. 
contains - procedure :: boundary_conditions => boundary_conditions_generic - procedure :: initial_conditions => initial_conditions_generic - procedure :: forcings => forcings_generic - procedure :: pre_correction => pre_correction_generic - procedure :: postprocess => postprocess_generic + procedure :: boundary_conditions => boundary_conditions_generic !! No action (use domain BCs) + procedure :: initial_conditions => initial_conditions_generic !! Uniform freestream + procedure :: forcings => forcings_generic !! No forcing + procedure :: pre_correction => pre_correction_generic !! No correction + procedure :: postprocess => postprocess_generic !! Minimal diagnostics end type case_generic_t interface case_generic_t @@ -29,12 +50,13 @@ module m_case_generic contains function case_generic_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise generic flow case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_generic_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_generic_t) :: flow_case !! Initialised generic case call flow_case%case_init(backend, mesh, host_allocator) @@ -48,9 +70,19 @@ subroutine boundary_conditions_generic(self) end subroutine boundary_conditions_generic subroutine initial_conditions_generic(self) + !! Set initial velocity field for generic freestream case. + !! + !! Initialises a uniform flow field with: + !! - \( u = 1 \) (streamwise velocity) + !! - \( v = 0 \) (cross-stream velocity) + !! - \( w = 0 \) (spanwise velocity) + !! + !! All velocity components are located at vertices (VERT). + !! This simple uniform flow serves as a starting point that users + !! 
can modify for their specific applications. implicit none - class(case_generic_t) :: self + class(case_generic_t) :: self !! Generic case instance call self%solver%u%fill(1._dp) call self%solver%v%fill(0._dp) diff --git a/src/case/tgv.f90 b/src/case/tgv.f90 index 971ee29fb..bd094717a 100644 --- a/src/case/tgv.f90 +++ b/src/case/tgv.f90 @@ -1,4 +1,40 @@ module m_case_tgv + !! Taylor-Green vortex (TGV) case for validation and benchmarking. + !! + !! The Taylor-Green vortex is a canonical test case for incompressible + !! Navier-Stokes solvers. It features an analytically-defined initial + !! condition that transitions from laminar to turbulent flow, providing + !! a rigorous test of: + !! + !! - Spatial discretisation accuracy + !! - Time integration stability + !! - Energy conservation properties + !! - Transition to turbulence physics + !! + !! **Initial Conditions:** + !! + !! \[ u = \sin(x) \cos(y) \cos(z) \] + !! \[ v = -\cos(x) \sin(y) \cos(z) \] + !! \[ w = 0 \] + !! + !! This satisfies incompressibility (\( \nabla \cdot \mathbf{u} = 0 \)) exactly and is periodic + !! in all three directions. + !! + !! **Domain:** + !! + !! Typically \( [0, 2\pi]^3 \) with periodic boundary conditions in all directions. + !! + !! **Validation Metrics:** + !! + !! - Kinetic energy decay rate + !! - Enstrophy evolution + !! - Dissipation rate + !! - Vorticity dynamics + !! + !! **Reference:** + !! + !! Taylor, G. I., & Green, A. E. (1937). Mechanism of the production of + !! small eddies from large ones. Proc. R. Soc. Lond. A, 158(895), 499-521. use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -12,12 +48,13 @@ module m_case_tgv implicit none type, extends(base_case_t) :: case_tgv_t + !! Taylor-Green vortex case (no additional state needed beyond base). 
contains - procedure :: boundary_conditions => boundary_conditions_tgv - procedure :: initial_conditions => initial_conditions_tgv - procedure :: forcings => forcings_tgv - procedure :: pre_correction => pre_correction_tgv - procedure :: postprocess => postprocess_tgv + procedure :: boundary_conditions => boundary_conditions_tgv !! No action (periodic BCs) + procedure :: initial_conditions => initial_conditions_tgv !! Set TGV velocity field + procedure :: forcings => forcings_tgv !! No forcing + procedure :: pre_correction => pre_correction_tgv !! No correction + procedure :: postprocess => postprocess_tgv !! Compute diagnostics end type case_tgv_t interface case_tgv_t @@ -27,21 +64,27 @@ module m_case_tgv contains function case_tgv_init(backend, mesh, host_allocator) result(flow_case) + !! Initialise Taylor-Green vortex case. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(case_tgv_t) :: flow_case + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(mesh_t), target, intent(inout) :: mesh !! Mesh with decomposition + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(case_tgv_t) :: flow_case !! Initialised TGV case call flow_case%case_init(backend, mesh, host_allocator) end function case_tgv_init subroutine initial_conditions_tgv(self) + !! Set Taylor-Green vortex initial velocity field. + !! + !! Initialises the three velocity components according to the TGV + !! analytical solution. The field is exactly divergence-free and + !! periodic, making it ideal for testing solver accuracy. implicit none - class(case_tgv_t) :: self + class(case_tgv_t) :: self !! 
TGV case instance call self%set_init(self%solver%u, u_func) call self%set_init(self%solver%v, v_func) @@ -54,19 +97,25 @@ subroutine initial_conditions_tgv(self) end subroutine initial_conditions_tgv pure function u_func(coords) result(r) + !! Compute x-velocity component of TGV at given coordinates. + !! + !! \[ u = \sin(x) \cos(y) \cos(z) \] implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Position [x, y, z] + real(dp) :: r !! Velocity component u r = sin(coords(1))*cos(coords(2))*cos(coords(3)) end function u_func pure function v_func(coords) result(r) + !! Compute y-velocity component of TGV at given coordinates. + !! + !! \[ v = -\cos(x) \sin(y) \cos(z) \] implicit none - real(dp), intent(in) :: coords(3) - real(dp) :: r + real(dp), intent(in) :: coords(3) !! Position [x, y, z] + real(dp) :: r !! Velocity component v r = -cos(coords(1))*sin(coords(2))*cos(coords(3)) end function v_func diff --git a/src/common.f90 b/src/common.f90 index 8eba41c6f..3c5c98ff0 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -1,42 +1,62 @@ module m_common + !! Common module containing global constants, parameters, and utility functions. + !! + !! This module provides: + !! + !! - Precision definitions (single or double precision based on compilation flags) + !! - Mathematical constants (e.g., \(\pi\)) + !! - Direction and reordering constants for domain decomposition + !! - Data location flags (vertex, cell, face, edge centered) + !! - Boundary condition type constants + !! - Utility functions for argument parsing and data manipulation use mpi implicit none #ifdef SINGLE_PREC - integer, parameter :: dp = kind(0.0e0) - integer, parameter :: nbytes = 4 - integer, parameter :: MPI_X3D2_DP = MPI_REAL - logical, parameter :: is_sp = .true. + integer, parameter :: dp = kind(0.0e0) !! Working-precision real kind (single precision in this build) + integer, parameter :: nbytes = 4 !! 
Number of bytes for real numbers + integer, parameter :: MPI_X3D2_DP = MPI_REAL !! MPI datatype for real numbers + logical, parameter :: is_sp = .true. !! Flag indicating single precision #else - integer, parameter :: dp = kind(0.0d0) - integer, parameter :: nbytes = 8 - integer, parameter :: MPI_X3D2_DP = MPI_DOUBLE_PRECISION - logical, parameter :: is_sp = .false. + integer, parameter :: dp = kind(0.0d0) !! Double precision kind parameter (double precision) + integer, parameter :: nbytes = 8 !! Number of bytes for real numbers + integer, parameter :: MPI_X3D2_DP = MPI_DOUBLE_PRECISION !! MPI datatype for real numbers + logical, parameter :: is_sp = .false. !! Flag indicating double precision #endif - integer, parameter :: sp = kind(0.0e0) - integer, parameter :: i8 = selected_int_kind(18) + integer, parameter :: sp = kind(0.0e0) !! Single precision kind parameter + integer, parameter :: i8 = selected_int_kind(18) !! Integer kind for 64-bit integers - real(dp), parameter :: pi = 4*atan(1.0_dp) + real(dp), parameter :: pi = 4*atan(1.0_dp) !! Mathematical constant \(\pi\) + !> Reordering constants for data layout transformations between directions. + !! Format: RDR_2 where directions are X, Y, Z, or C (complete/cell-centered) integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, & RDR_C2X = 41, RDR_C2Y = 42, RDR_C2Z = 43, & RDR_X2C = 14, RDR_Y2C = 24, RDR_Z2C = 34 - integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4 - integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 - integer, parameter :: VERT = 0000, & ! Vertex centered data - CELL = 1110, & ! Cell centered data - X_FACE = 1100, & ! Data on faces normal to X - Y_FACE = 1010, & ! Data on faces normal to Y - Z_FACE = 0110, & ! Data on faces normal to Z - X_EDGE = 0010, & ! Data on edges along X - Y_EDGE = 0100, & ! Data on edges along Y - Z_EDGE = 1000, & ! Data on edges along Z - NULL_LOC = -0001 ! 
The location of data isn't specified - integer, parameter :: BC_PERIODIC = 0, BC_NEUMANN = 1, BC_DIRICHLET = 2, & - BC_HALO = -1 + integer, parameter :: DIR_X = 1 !! X direction index + integer, parameter :: DIR_Y = 2 !! Y direction index + integer, parameter :: DIR_Z = 3 !! Z direction index + integer, parameter :: DIR_C = 4 !! Complete/cell-centered direction index + integer, parameter :: POISSON_SOLVER_FFT = 0 !! FFT-based Poisson solver + integer, parameter :: POISSON_SOLVER_CG = 1 !! Conjugate gradient Poisson solver + integer, parameter :: VERT = 0000, & !! Vertex centered data + CELL = 1110, & !! Cell centered data + X_FACE = 1100, & !! Data on faces normal to X + Y_FACE = 1010, & !! Data on faces normal to Y + Z_FACE = 0110, & !! Data on faces normal to Z + X_EDGE = 0010, & !! Data on edges along X + Y_EDGE = 0100, & !! Data on edges along Y + Z_EDGE = 1000, & !! Data on edges along Z + NULL_LOC = -0001 !! The location of data isn't specified + integer, parameter :: BC_PERIODIC = 0 !! Periodic boundary condition + integer, parameter :: BC_NEUMANN = 1 !! Neumann boundary condition + integer, parameter :: BC_DIRICHLET = 2 !! Dirichlet boundary condition + integer, parameter :: BC_HALO = -1 !! Halo/ghost cell boundary condition + !> Reordering map matrix for direction transformations. + !! Maps from direction (row) to direction (column), yielding the reordering constant. integer, protected :: & rdr_map(4, 4) = reshape([0, RDR_Y2X, RDR_Z2X, RDR_C2X, & RDR_X2Y, 0, RDR_Z2Y, RDR_C2Y, & @@ -46,8 +66,13 @@ module m_common contains pure subroutine get_dirs_from_rdr(dir_from, dir_to, rdr_dir) - integer, intent(out) :: dir_from, dir_to - integer, intent(in) :: rdr_dir + !! Extract source and destination directions from a reordering constant. + !! + !! Given a reordering constant (e.g., RDR_X2Y), this subroutine determines + !! the source direction and destination direction. + integer, intent(out) :: dir_from !! 
Source direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(out) :: dir_to !! Destination direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(in) :: rdr_dir !! Reordering constant (e.g., RDR_X2Y) integer, dimension(2) :: dirs dirs = findloc(rdr_map, rdr_dir) @@ -57,15 +82,23 @@ pure subroutine get_dirs_from_rdr(dir_from, dir_to, rdr_dir) end subroutine pure integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir) - !! Returns RDR_?2? value based on two direction inputs - integer, intent(in) :: dir_from, dir_to + !! Returns reordering constant based on two direction inputs. + !! + !! Given a source and destination direction, this function returns the + !! corresponding reordering constant (e.g., RDR_X2Y for X to Y). + integer, intent(in) :: dir_from !! Source direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer, intent(in) :: dir_to !! Destination direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) rdr_dir = rdr_map(dir_from, dir_to) end function get_rdr_from_dirs function get_argument(pos) result(arg) - integer, intent(in) :: pos - character(:), allocatable :: arg + !! Retrieve a command-line argument at the specified position. + !! + !! This function wraps the intrinsic get_command_argument with error checking + !! and automatic string trimming. + integer, intent(in) :: pos !! Position of the command-line argument (1-indexed) + character(:), allocatable :: arg !! The retrieved command-line argument character(len=200) :: temp integer :: stat @@ -82,7 +115,14 @@ function get_argument(pos) result(arg) end function get_argument integer function move_data_loc(in_data_loc, dir, move) result(out_data_loc) - integer, intent(in) :: in_data_loc, dir, move + !! Update data location by shifting along a specified direction. + !! + !! This function modifies a data location flag by moving it along one direction + !! (X, Y, or Z) by a specified amount. The data location encoding uses powers of 10 + !! to represent positions in each direction. 
+ integer, intent(in) :: in_data_loc !! Input data location flag + integer, intent(in) :: dir !! Direction to move (DIR_X, DIR_Y, or DIR_Z) + integer, intent(in) :: move !! Amount to move (typically -1, 0, or 1) out_data_loc = in_data_loc + move*(10**dir) end function move_data_loc diff --git a/src/config.f90 b/src/config.f90 index 076de2232..ce28d6e98 100644 --- a/src/config.f90 +++ b/src/config.f90 @@ -6,54 +6,86 @@ module m_config implicit none - integer, parameter :: n_species_max = 99 + integer, parameter :: n_species_max = 99 !! Maximum number of transported species type, abstract :: base_config_t - !! All config types have a method read to initialise their data + !! Base abstract type for all configuration types. + !! + !! All config types have a deferred read method to initialise their data + !! from either a namelist file or a namelist string. contains procedure(read), deferred :: read end type base_config_t type, extends(base_config_t) :: domain_config_t - character(len=30) :: flow_case_name - real(dp) :: L_global(3) - integer :: dims_global(3), nproc_dir(3) - character(len=20) :: BC_x(2), BC_y(2), BC_z(2) - character(len=20) :: stretching(3) - real(dp) :: beta(3) + !! Domain configuration type containing mesh and decomposition settings. + !! + !! This type stores all parameters related to the computational domain, + !! including global dimensions, boundary conditions, mesh stretching, + !! and MPI decomposition. + character(len=30) :: flow_case_name !! Name of the flow case (e.g., 'channel', 'tgv', 'generic') + real(dp) :: L_global(3) !! Global domain lengths in each direction + integer :: dims_global(3) !! Global number of grid points in each direction + integer :: nproc_dir(3) !! Number of processors in each direction + character(len=20) :: BC_x(2) !! Boundary conditions in x-direction (lower, upper) + character(len=20) :: BC_y(2) !! Boundary conditions in y-direction (lower, upper) + character(len=20) :: BC_z(2) !! 
Boundary conditions in z-direction (lower, upper) + character(len=20) :: stretching(3) !! Mesh stretching type in each direction + real(dp) :: beta(3) !! Stretching parameters in each direction contains procedure :: read => read_domain_nml end type domain_config_t type, extends(base_config_t) :: solver_config_t - real(dp) :: Re, dt - logical :: ibm_on - real(dp), dimension(:), allocatable :: pr_species - integer :: n_iters, n_output, n_species - logical :: lowmem_transeq, lowmem_fft - character(3) :: poisson_solver_type, time_intg - character(30) :: der1st_scheme, der2nd_scheme, & - interpl_scheme, stagder_scheme + !! Solver configuration type containing numerical and physical parameters. + !! + !! This type stores parameters related to the numerical solver including + !! Reynolds number, time step, iteration counts, discretisation schemes, + !! and solver options. + real(dp) :: Re !! Reynolds number + real(dp) :: dt !! Time step size + logical :: ibm_on !! Flag to enable immersed boundary method + real(dp), dimension(:), allocatable :: pr_species !! Prandtl numbers for each species + integer :: n_iters !! Total number of iterations + integer :: n_output !! Output frequency (every n_output iterations) + integer :: n_species !! Number of transported scalar species + logical :: lowmem_transeq !! Use low-memory implementation for transport equation + logical :: lowmem_fft !! Use low-memory implementation for FFT + character(3) :: poisson_solver_type !! Poisson solver type ('FFT' or 'CG') + character(3) :: time_intg !! Time integration scheme (e.g., 'RK3', 'AB2') + character(30) :: der1st_scheme !! First derivative scheme (e.g., 'compact6') + character(30) :: der2nd_scheme !! Second derivative scheme (e.g., 'compact6') + character(30) :: interpl_scheme !! Interpolation scheme (e.g., 'classic') + character(30) :: stagder_scheme !! 
Staggered derivative scheme (e.g., 'compact6') contains procedure :: read => read_solver_nml end type solver_config_t type, extends(base_config_t) :: channel_config_t - real(dp) :: noise, omega_rot - logical :: rotation - integer :: n_rotate + !! Channel flow configuration type. + !! + !! This type contains parameters specific to channel flow simulations, + !! including initial perturbations and rotation effects. + real(dp) :: noise !! Initial noise amplitude for perturbations + real(dp) :: omega_rot !! Rotation rate for rotating channel flow + logical :: rotation !! Flag to enable rotation + integer :: n_rotate !! Number of directions to rotate contains procedure :: read => read_channel_nml end type channel_config_t type, extends(base_config_t) :: checkpoint_config_t + !! Checkpoint and snapshot configuration type. + !! + !! This type manages simulation restart and output settings including + !! checkpoint frequency, snapshot frequency, and file naming conventions. integer :: checkpoint_freq = 0 !! Frequency of checkpointing (0 = off) integer :: snapshot_freq = 0 !! Frequency of snapshots (0 = off) logical :: keep_checkpoint = .true. !! If false, only keep latest checkpoint - character(len=256) :: checkpoint_prefix = "checkpoint" - character(len=256) :: snapshot_prefix = "snapshot" - logical :: restart_from_checkpoint = .false. - character(len=256) :: restart_file = "" + character(len=256) :: checkpoint_prefix = "checkpoint" !! Filename prefix for checkpoint files + character(len=256) :: snapshot_prefix = "snapshot" !! Filename prefix for snapshot files + logical :: restart_from_checkpoint = .false. !! Flag to restart from a checkpoint + character(len=256) :: restart_file = "" !! Path to checkpoint file for restart integer, dimension(3) :: output_stride = [2, 2, 2] !! Spatial stride for snapshot output logical :: snapshot_sp = .false. !! 
if true, snapshot in single precision contains @@ -64,9 +96,9 @@ module m_config subroutine read(self, nml_file, nml_string) !& !! Assigns the member variables either from a file or text source. !! - !! nml_file can be an absolute or relative path - !! nml_string is a character string that contains the namelist. - !! For example, nml_string="&foobar_nml foo=0, bar='this'/" + !! `nml_file` can be an absolute or relative path + !! `nml_string` is a character string that contains the namelist. + !! For example, `nml_string="&foobar_nml foo=0, bar='this'/"` import :: base_config_t class(base_config_t) :: self @@ -78,11 +110,16 @@ end subroutine read contains subroutine read_domain_nml(self, nml_file, nml_string) + !! Read domain configuration from a namelist file or string. + !! + !! This subroutine reads the domain_settings namelist containing mesh + !! and domain decomposition parameters. Exactly one of nml_file or + !! nml_string must be provided. implicit none - class(domain_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(domain_config_t) :: self !! Domain configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit @@ -96,6 +133,8 @@ subroutine read_domain_nml(self, nml_file, nml_string) namelist /domain_settings/ flow_case_name, L_global, dims_global, & nproc_dir, BC_x, BC_y, BC_z, stretching, beta + !! Specifies the computational domain geometry, mesh resolution, boundary conditions, + !! and MPI decomposition for the simulation. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading domain config failed! & @@ -124,11 +163,16 @@ subroutine read_domain_nml(self, nml_file, nml_string) end subroutine read_domain_nml subroutine read_solver_nml(self, nml_file, nml_string) + !! Read solver configuration from a namelist file or string. 
+ !! + !! This subroutine reads the solver_params namelist containing numerical + !! and physical parameters for the solver. Exactly one of nml_file or + !! nml_string must be provided. implicit none - class(solver_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(solver_config_t) :: self !! Solver configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit @@ -147,6 +191,8 @@ subroutine read_solver_nml(self, nml_file, nml_string) n_species, pr_species, lowmem_transeq, lowmem_fft, & time_intg, der1st_scheme, der2nd_scheme, interpl_scheme, & stagder_scheme, ibm_on + !! Specifies numerical solver settings including Reynolds number, time integration, + !! discretisation schemes, and solver options for the Navier-Stokes equations. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading solver config failed! & @@ -181,11 +227,16 @@ subroutine read_solver_nml(self, nml_file, nml_string) end subroutine read_solver_nml subroutine read_channel_nml(self, nml_file, nml_string) + !! Read channel flow configuration from a namelist file or string. + !! + !! This subroutine reads the channel_nml namelist containing parameters + !! specific to channel flow simulations. Exactly one of nml_file or + !! nml_string must be provided. implicit none - class(channel_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(channel_config_t) :: self !! Channel configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! 
Namelist as a string integer :: unit @@ -194,6 +245,8 @@ subroutine read_channel_nml(self, nml_file, nml_string) integer :: n_rotate namelist /channel_nml/ noise, rotation, omega_rot, n_rotate + !! Specifies parameters specific to turbulent channel flow simulations, + !! including initial perturbations and optional rotation effects. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading channel config failed! & @@ -217,11 +270,16 @@ subroutine read_channel_nml(self, nml_file, nml_string) end subroutine read_channel_nml subroutine read_checkpoint_nml(self, nml_file, nml_string) + !! Read checkpoint/snapshot configuration from a namelist file or string. + !! + !! This subroutine reads the checkpoint_params namelist containing settings + !! for checkpointing and snapshot output. Exactly one of nml_file or + !! nml_string must be provided. Uses default values if namelist is missing. implicit none - class(checkpoint_config_t) :: self - character(*), optional, intent(in) :: nml_file - character(*), optional, intent(in) :: nml_string + class(checkpoint_config_t) :: self !! Checkpoint configuration object to populate + character(*), optional, intent(in) :: nml_file !! Path to namelist file + character(*), optional, intent(in) :: nml_string !! Namelist as a string integer :: unit, ierr @@ -238,6 +296,8 @@ subroutine read_checkpoint_nml(self, nml_file, nml_string) namelist /checkpoint_params/ checkpoint_freq, snapshot_freq, & keep_checkpoint, checkpoint_prefix, snapshot_prefix, & restart_from_checkpoint, restart_file, output_stride, snapshot_sp + !! Specifies checkpoint and snapshot settings for simulation output and restart, + !! including file naming, frequency, and spatial output stride. if (present(nml_file) .and. present(nml_string)) then error stop 'Reading checkpoint config failed! & &Provide only a file name or source, not both.' 
diff --git a/src/field.f90 b/src/field.f90 index 878b5d247..7d2e69c4a 100644 --- a/src/field.f90 +++ b/src/field.f90 @@ -1,25 +1,32 @@ module m_field + !! Field data structure module for managing computational grid data. + !! + !! This module provides the `field_t` type for storing 3D scalar fields + !! on the computational grid. Fields can be organised in linked lists + !! for memory management and support different data orientations + !! (x-pencil, y-pencil, z-pencil). use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C type :: field_t - !! Memory block type holding both a data field and a pointer - !! to the next block. The `field_t` type also holds a integer - !! `refcount` that counts the number of references to this - !! field. User code is currently responsible for incrementing - !! the reference count. - class(field_t), pointer :: next - real(dp), pointer, private :: p_data(:) - real(dp), pointer, contiguous :: data(:, :, :) - integer :: dir - integer :: data_loc - integer :: refcount = 0 - integer :: id !! An integer identifying the memory block. + !! Memory block type holding a 3D scalar field with metadata. + !! + !! The field_t type stores both a data field and a pointer to the next + !! block, enabling linked list structures for memory management. The type + !! tracks a reference count (currently managed by user code), data + !! orientation (x-, y-, or z-pencil), and data location on the staggered grid. + class(field_t), pointer :: next !! Pointer to next field in linked list + real(dp), pointer, private :: p_data(:) !! 1D array storage for data + real(dp), pointer, contiguous :: data(:, :, :) !! 3D view of data array + integer :: dir !! Data direction (DIR_X, DIR_Y, DIR_Z, or DIR_C) + integer :: data_loc !! Data location flag (VERT, CELL, etc.) + integer :: refcount = 0 !! Reference count for memory management + integer :: id !! 
Unique identifier for this memory block contains - procedure :: fill - procedure :: get_shape - procedure :: set_shape - procedure :: set_data_loc + procedure :: fill !! Fill field with a constant value + procedure :: get_shape !! Get 3D dimensions of data array + procedure :: set_shape !! Set 3D dimensions by reshaping p_data + procedure :: set_data_loc !! Set data location flag end type field_t interface field_t @@ -27,16 +34,25 @@ module m_field end interface field_t type :: flist_t - !! Use for creating a list of field pointers - class(field_t), pointer :: ptr + !! Wrapper type for creating arrays of field pointers. + !! + !! This type is used to create lists or arrays of field pointers, + !! useful for managing multiple fields such as velocity components + !! or transported scalar species. + class(field_t), pointer :: ptr !! Pointer to a field end type flist_t contains function field_init(ngrid, next, id) result(f) - integer, intent(in) :: ngrid, id - type(field_t), pointer, intent(in) :: next - type(field_t) :: f + !! Initialise a new field with allocated memory. + !! + !! Creates a new field_t instance with allocated storage for ngrid points. + !! The field is linked to the next field in the list and assigned a unique ID. + integer, intent(in) :: ngrid !! Total number of grid points to allocate + type(field_t), pointer, intent(in) :: next !! Pointer to next field in linked list + integer, intent(in) :: id !! Unique identifier for this field + type(field_t) :: f !! Initialised field allocate (f%p_data(ngrid)) f%refcount = 0 @@ -45,38 +61,52 @@ function field_init(ngrid, next, id) result(f) end function field_init subroutine fill(self, c) + !! Fill the entire field with a constant value. + !! + !! Sets all grid points in the field to the specified constant value. implicit none - class(field_t) :: self - real(dp), intent(in) :: c + class(field_t) :: self !! Field to fill + real(dp), intent(in) :: c !! 
Constant value to fill with self%p_data(:) = c end subroutine fill subroutine set_data_loc(self, data_loc) - class(field_t) :: self - integer, intent(in) :: data_loc + !! Set the data location flag for this field. + !! + !! The data location specifies where on the staggered grid the data + !! is located (e.g., VERT, CELL, X_FACE, etc.). + class(field_t) :: self !! Field to modify + integer, intent(in) :: data_loc !! Data location flag self%data_loc = data_loc end subroutine function get_shape(self) result(dims) + !! Get the 3D dimensions of the field data. + !! + !! Returns the current shape of the 3D data array. implicit none - class(field_t) :: self - integer :: dims(3) + class(field_t) :: self !! Field to query + integer :: dims(3) !! Array dimensions [nx, ny, nz] dims = shape(self%data) end function get_shape subroutine set_shape(self, dims) + !! Reshape the field data to specified 3D dimensions. + !! + !! Maps the 1D storage array (p_data) to a 3D view with the specified + !! dimensions. The total size must match the allocated storage. implicit none - class(field_t) :: self - integer, intent(in) :: dims(3) + class(field_t) :: self !! Field to reshape + integer, intent(in) :: dims(3) !! Target dimensions [nx, ny, nz] self%data(1:dims(1), 1:dims(2), 1:dims(3)) => self%p_data diff --git a/src/io/adios2/io.f90 b/src/io/adios2/io.f90 index 26976b98d..613db079f 100644 --- a/src/io/adios2/io.f90 +++ b/src/io/adios2/io.f90 @@ -1,24 +1,40 @@ module m_io_backend -!! @brief Provides ADIOS2-specific implementation of the I/O backend interface -!! -!! @details This module contains the concrete backend implementation for ADIOS2 -!! (ADaptive Input Output System v2) library. It acts as a translation layer -!! converting generic I/O calls from the session interface into specific calls -!! to the ADIOS2 API. -!! -!! The `adios2_reader_t` and `adios2_writer_t` types defined here extend the -!! abstract base types from `m_io_base` and implement required procedures -!! -!! 
This backend leverages several key features of the underlying ADIOS2 library -!! - engine abstraction - the same API can be used for different transport -!! methods (e.g. BP4, BP5, HDF5) -!! - Asynchronous I/O - by default ADIOS2 uses a deferred transport mode -!! which can improve performance by overlapping computation and I/O -!! - MPI integration - it is designed for large-scale paralle I/O and -!! integrates with MPI, though serial operation is also supported -!! -!! @note This is an internal backend module and should never be used directly. -!! All user interaction must go through `m_io_session`. + !! ADIOS2-specific implementation of the I/O backend interface. + !! + !! This module provides the concrete backend implementation for ADIOS2 + !! (Adaptable Input Output System v2), a high-performance parallel I/O + !! library. It acts as a translation layer converting generic I/O calls + !! from the session interface into specific ADIOS2 API calls. + !! + !! **Architecture:** + !! + !! - Extends abstract base types from `m_io_base` + !! - Implements all required I/O procedures (init, open, read, write, etc.) + !! - Manages ADIOS2-specific objects (adios, io, engine) + !! - Handles step-based I/O for time-series data + !! + !! **ADIOS2 Features Leveraged:** + !! + !! - **Engine Abstraction**: Same API for different formats (BP4, BP5, HDF5) + !! - **Asynchronous I/O**: Deferred transport mode overlaps computation and I/O + !! - **MPI Integration**: Designed for large-scale parallel I/O + !! - **Variable/Attribute Management**: Efficient metadata handling + !! - **Hyperslab Selection**: Parallel distributed array I/O + !! + !! **Type Hierarchy:** + !! + !! ``` + !! io_base (abstract) + !! |-- io_reader_t (abstract) + !! | |-- io_adios2_reader_t (concrete) + !! |-- io_writer_t (abstract) + !! | |-- io_adios2_writer_t (concrete) + !! |-- io_file_t (abstract) + !! |-- io_adios2_file_t (concrete) + !! ``` + !! + !! 
**Note:** This is an internal backend module and should never be used + !! directly. All user interaction must go through `m_io_session`. use adios2, only: adios2_adios, adios2_io, adios2_engine, & adios2_variable, adios2_attribute, & adios2_mode_sync, adios2_mode_write, & @@ -45,56 +61,71 @@ module m_io_backend public :: allocate_io_reader, allocate_io_writer public :: get_default_backend, IO_BACKEND_DUMMY, IO_BACKEND_ADIOS2 - integer, parameter :: IO_BACKEND_DUMMY = 0 - integer, parameter :: IO_BACKEND_ADIOS2 = 1 + integer, parameter :: IO_BACKEND_DUMMY = 0 !! Dummy backend identifier + integer, parameter :: IO_BACKEND_ADIOS2 = 1 !! ADIOS2 backend identifier type, extends(io_reader_t) :: io_adios2_reader_t + !! ADIOS2 reader implementation for reading data from files. + !! + !! Manages ADIOS2 objects required for reading operations including + !! the global ADIOS handler, I/O object, and tracks step state for + !! time-series data reading. private type(adios2_adios) :: adios !! ADIOS2 global handler - type(adios2_io) :: io_handle !! ADIOS2 IO object for managing I/O - logical :: is_step_active = .false. !! Flag to track if a step is active - integer :: comm = MPI_COMM_NULL !! MPI communicator + type(adios2_io) :: io_handle !! ADIOS2 I/O object for managing variables + logical :: is_step_active = .false. !! Flag tracking if a step is active + integer :: comm = MPI_COMM_NULL !! MPI communicator for parallel I/O contains - procedure :: init => reader_init_adios2 - procedure :: open => reader_open_adios2 - procedure :: read_data_i8 => read_data_i8_adios2 - procedure :: read_data_integer => read_data_integer_adios2 - procedure :: read_data_real => read_data_real_adios2 - procedure :: read_data_array_3d => read_data_array_3d_adios2 - procedure :: finalise => finalise_reader_adios2 - procedure, private :: handle_error => handle_error_reader + procedure :: init => reader_init_adios2 !! Initialise reader + procedure :: open => reader_open_adios2 !! 
Open file for reading + procedure :: read_data_i8 => read_data_i8_adios2 !! Read 64-bit integer + procedure :: read_data_integer => read_data_integer_adios2 !! Read default integer + procedure :: read_data_real => read_data_real_adios2 !! Read double precision real + procedure :: read_data_array_3d => read_data_array_3d_adios2 !! Read 3D array with hyperslab + procedure :: finalise => finalise_reader_adios2 !! Finalise and clean up + procedure, private :: handle_error => handle_error_reader !! Error handling (internal) end type io_adios2_reader_t type, extends(io_writer_t) :: io_adios2_writer_t + !! ADIOS2 writer implementation for writing data to files. + !! + !! Manages ADIOS2 objects required for writing operations including + !! the global ADIOS handler, I/O object, and tracks step state for + !! time-series data writing. private type(adios2_adios) :: adios !! ADIOS2 global handler - type(adios2_io) :: io_handle !! ADIOS2 IO object for managing I/O - logical :: is_step_active = .false. !! Flag to track if a step is active - integer :: comm = MPI_COMM_NULL !! MPI communicator + type(adios2_io) :: io_handle !! ADIOS2 I/O object for managing variables + logical :: is_step_active = .false. !! Flag tracking if a step is active + integer :: comm = MPI_COMM_NULL !! MPI communicator for parallel I/O contains - procedure :: init => writer_init_adios2 - procedure :: open => writer_open_adios2 - procedure :: write_data_i8 => write_data_i8_adios2 - procedure :: write_data_integer => write_data_integer_adios2 - procedure :: write_data_real => write_data_real_adios2 - procedure :: write_data_array_3d => write_data_array_3d_adios2 - procedure :: write_attribute_string => write_attribute_string_adios2 + procedure :: init => writer_init_adios2 !! Initialise writer + procedure :: open => writer_open_adios2 !! Open file for writing + procedure :: write_data_i8 => write_data_i8_adios2 !! Write 64-bit integer + procedure :: write_data_integer => write_data_integer_adios2 !! 
Write default integer + procedure :: write_data_real => write_data_real_adios2 !! Write double precision real + procedure :: write_data_array_3d => write_data_array_3d_adios2 !! Write 3D array with hyperslab + procedure :: write_attribute_string => write_attribute_string_adios2 !! Write string attribute procedure :: write_attribute_array_1d_real => & - write_attribute_array_1d_real_adios2 - procedure :: finalise => finalise_writer_adios2 - procedure, private :: handle_error => handle_error_writer + write_attribute_array_1d_real_adios2 !! Write 1D real array attribute + procedure :: finalise => finalise_writer_adios2 !! Finalise and clean up + procedure, private :: handle_error => handle_error_writer !! Error handling (internal) end type io_adios2_writer_t type, extends(io_file_t) :: io_adios2_file_t + !! ADIOS2 file handle for open file operations. + !! + !! Wraps the ADIOS2 engine object and manages step-based I/O for + !! time-series data. Tracks whether file is opened for reading or + !! writing and current step state. private - type(adios2_engine) :: engine !! ADIOS2 engine for data reading/writing - logical :: is_step_active = .false. !! Flag to track if a step is active - logical :: is_writer = .false. !! Flag to track if this is for writing + type(adios2_engine) :: engine !! ADIOS2 engine for data transport + logical :: is_step_active = .false. !! Flag tracking if a step is active + logical :: is_writer = .false. !! True if file opened for writing contains - procedure :: close => file_close_adios2 - procedure :: begin_step => file_begin_step_adios2 - procedure :: end_step => file_end_step_adios2 - procedure, private :: handle_error => handle_error_file + procedure :: close => file_close_adios2 !! Close file and engine + procedure :: begin_step => file_begin_step_adios2 !! Begin new I/O step + procedure :: end_step => file_end_step_adios2 !! End current I/O step + procedure, private :: handle_error => handle_error_file !! 
Error handling (internal) end type io_adios2_file_t contains diff --git a/src/io/checkpoint_manager.f90 b/src/io/checkpoint_manager.f90 index 8bd2ed97f..205f08d0e 100644 --- a/src/io/checkpoint_manager.f90 +++ b/src/io/checkpoint_manager.f90 @@ -1,20 +1,36 @@ module m_checkpoint_manager -! @brief Manages the creation and restoration of simulation checkpoints -!! for restart capabilities. -!! -!! @details This module is responsible for periodically saving the full, unstrided -!! simulation state to a file. This allows a simulation to be stopped and resumed -!! from the exact state it was in. -!! -!! Key features include: -!! - Reading all checkpoint settings from a configuration file -!! - Periodically writing the full-resolution simulation state -!! - Handling the full logic for restarting a simulation from -!! a specified checkpoint file. -!! - A safe-write strategy that writes to a temporary file first, -!! then atomically renames it to the final filename to -!! prevent corrupted checkpoints. -!! - Optional cleanup of old checkpoint files to conserve disk space. + !! Manages creation and restoration of simulation checkpoints for restart. + !! + !! This module is responsible for periodically saving the full simulation + !! state to checkpoint files and restoring from them for restarts. This + !! allows simulations to be stopped and resumed from the exact state. + !! + !! **Key Features:** + !! + !! - Configuration via namelist (checkpoint frequency, prefix, etc.) + !! - Periodic writing of full-resolution simulation state + !! - Complete restart logic from specified checkpoint file + !! - Safe-write strategy: temporary file then atomic rename + !! - Optional cleanup of old checkpoints to conserve disk space + !! - Stores velocity fields (\(u, v, w\)), timestep, and simulation time + !! + !! **Safe-Write Strategy:** + !! + !! To prevent corrupted checkpoints from crashes during write: + !! + !! 1. Write to temporary file (e.g., `checkpoint_0001000.tmp.bp`) + !! 2. 
Atomic rename to final name (`checkpoint_0001000.bp`) + !! 3. Optionally delete previous checkpoint if `keep_checkpoint=false` + !! + !! **Configuration:** + !! + !! Controlled via `checkpoint_config_t` read from input namelist: + !! + !! - `checkpoint_freq`: write interval (iterations) + !! - `keep_checkpoint`: retain all checkpoints vs overwrite old ones + !! - `checkpoint_prefix`: filename prefix + !! - `restart_from_checkpoint`: enable restart + !! - `restart_file`: checkpoint file to restart from use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank, MPI_Abort use m_common, only: dp, i8, DIR_X, get_argument use m_field, only: field_t @@ -30,38 +46,48 @@ module m_checkpoint_manager implicit none type :: raw_old_field_buffer_t - real(dp), allocatable :: data(:, :, :) + !! Temporary buffer for field data (used internally). + real(dp), allocatable :: data(:, :, :) !! 3D array storage end type raw_old_field_buffer_t private public :: checkpoint_manager_t type :: checkpoint_manager_t - type(checkpoint_config_t) :: config - integer :: last_checkpoint_step = -1 - integer, dimension(3) :: full_resolution = [1, 1, 1] - type(field_buffer_map_t), allocatable :: field_buffers(:) - integer(i8), dimension(3) :: last_shape_dims = 0 - integer, dimension(3) :: last_stride_factors = 0 - integer(i8), dimension(3) :: last_output_shape = 0 + !! Manager for checkpoint file operations (writing and reading). + !! + !! Handles all aspects of checkpoint I/O including periodic writes + !! during simulation and restoration during restart. Maintains state + !! needed for consistent checkpoint operations across multiple writes. + type(checkpoint_config_t) :: config !! Checkpoint configuration settings + integer :: last_checkpoint_step = -1 !! Timestep of last checkpoint written + integer, dimension(3) :: full_resolution = [1, 1, 1] !! Global domain resolution [nx, ny, nz] + type(field_buffer_map_t), allocatable :: field_buffers(:) !! 
Buffers for field data I/O + integer(i8), dimension(3) :: last_shape_dims = 0 !! Shape dimensions from last write + integer, dimension(3) :: last_stride_factors = 0 !! Stride factors from last write + integer(i8), dimension(3) :: last_output_shape = 0 !! Output shape from last write contains - procedure :: init - procedure :: handle_restart - procedure :: handle_checkpoint_step - procedure :: is_restart - procedure :: finalise - procedure, private :: write_checkpoint - procedure, private :: restart_checkpoint - procedure, private :: write_fields - procedure, private :: cleanup_output_buffers + procedure :: init !! Initialise checkpoint manager + procedure :: handle_restart !! Restore from checkpoint file + procedure :: handle_checkpoint_step !! Write checkpoint if needed at timestep + procedure :: is_restart !! Check if this is a restart run + procedure :: finalise !! Clean up and finalise + procedure, private :: write_checkpoint !! Write checkpoint file (internal) + procedure, private :: restart_checkpoint !! Read checkpoint file (internal) + procedure, private :: write_fields !! Write field data to file (internal) + procedure, private :: cleanup_output_buffers !! Free output buffers (internal) end type checkpoint_manager_t contains subroutine init(self, comm) - !! Initialise checkpoint manager - class(checkpoint_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise checkpoint manager from configuration. + !! + !! Reads checkpoint settings from input namelist and configures + !! output if checkpoint frequency is positive. Prints checkpoint + !! settings on root process. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + integer, intent(in) :: comm !! MPI communicator self%config = checkpoint_config_t() call self%config%read(nml_file=get_argument(1)) @@ -72,10 +98,13 @@ subroutine init(self, comm) end subroutine init subroutine configure_output(self, comm) - !! Configure checkpoint output settings + !! 
Configure and print checkpoint output settings. + !! + !! Displays checkpoint configuration on root process including + !! frequency, retention policy, and file prefix. use m_io_backend, only: get_default_backend, IO_BACKEND_DUMMY - class(checkpoint_manager_t), intent(inout) :: self - integer, intent(in) :: comm + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + integer, intent(in) :: comm !! MPI communicator integer :: myrank, ierr @@ -89,18 +118,25 @@ subroutine configure_output(self, comm) end subroutine configure_output function is_restart(self) result(restart) - !! Check if this is a restart run - class(checkpoint_manager_t), intent(in) :: self - logical :: restart + !! Check if this is a restart run. + !! + !! Queries configuration to determine if simulation should restart + !! from an existing checkpoint file. + class(checkpoint_manager_t), intent(in) :: self !! Checkpoint manager instance + logical :: restart !! True if restarting from checkpoint restart = self%config%restart_from_checkpoint end function is_restart subroutine handle_restart(self, solver, comm) - !! Handle restart from checkpoint - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - integer, intent(in), optional :: comm + !! Restore solver state from checkpoint file. + !! + !! Reads velocity fields, timestep, and time from the checkpoint file + !! specified in configuration. Updates solver's current iteration counter. + !! Prints restart information on root process. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(inout) :: solver !! Solver to restore state into + integer, intent(in), optional :: comm !! 
MPI communicator (optional) character(len=256) :: restart_file integer :: restart_timestep @@ -123,11 +159,15 @@ subroutine handle_restart(self, solver, comm) end subroutine handle_restart subroutine handle_checkpoint_step(self, solver, timestep, comm) - !! Handle checkpoint writing at a given timestep - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Write checkpoint if frequency condition is met. + !! + !! Checks if current timestep is a checkpoint interval (divisible by + !! checkpoint_freq) and writes checkpoint file if so. Called each + !! timestep from main simulation loop. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) integer :: comm_to_use @@ -138,11 +178,26 @@ subroutine handle_checkpoint_step(self, solver, timestep, comm) end subroutine handle_checkpoint_step subroutine write_checkpoint(self, solver, timestep, comm) - !! Write a checkpoint file for simulation restart - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in) :: comm + !! Write checkpoint file using safe-write strategy (internal). + !! + !! Implements the checkpoint writing logic with atomic file operations + !! to prevent corruption. The procedure: + !! 1. Check if checkpoint is due (frequency condition) + !! 2. Write to temporary file (_temp.bp) + !! 3. Write metadata (timestep, time, dt, data location) + !! 4. Write velocity fields (u, v, w) via write_fields + !! 5. Write time integrator state (AB scheme coefficients if applicable) + !! 6. Close temporary file + !! 7. Atomic rename: temp file to final name + !! 8. 
Optionally delete previous checkpoint if keep_checkpoint=false + !! + !! **Safe-Write Strategy:** Writing to a temporary file and then renaming + !! ensures that if a crash occurs during write, the previous valid + !! checkpoint remains intact. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + class(solver_t), intent(in) :: solver !! Solver with state to save + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in) :: comm !! MPI communicator character(len=256) :: filename, temp_filename, old_filename integer :: ierr, myrank @@ -307,13 +362,29 @@ end subroutine write_checkpoint subroutine restart_checkpoint( & self, solver, filename, timestep, restart_time, comm & ) - !! Restart simulation state from checkpoint file - class(checkpoint_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - character(len=*), intent(in) :: filename - integer, intent(out) :: timestep - real(dp), intent(out) :: restart_time - integer, intent(in) :: comm + !! Restore simulation state from checkpoint file (internal). + !! + !! Reads all data from checkpoint file and restores solver state: + !! 1. Verify checkpoint file exists (abort if missing) + !! 2. Open checkpoint file for reading + !! 3. Read metadata (timestep, time, dt, data location) + !! 4. Read time integrator state (AB coefficients, order, step counters) + !! 5. Read velocity fields (u, v, w) with correct dimensions + !! 6. Restore time integrator state including history (olds arrays) + !! 7. Set solver data location to match checkpoint + !! + !! **Data Location:** Checkpoint records whether fields were stored at + !! vertices (VERT) or cell centers (CELL), and restoration preserves this. + !! + !! **Time Integrator State:** For Adams-Bashforth schemes, restores the + !! history of old field values (du_olds, dv_olds, dw_olds) needed for + !! multi-step time integration. + class(checkpoint_manager_t), intent(inout) :: self !! 
Checkpoint manager instance + class(solver_t), intent(inout) :: solver !! Solver to restore state into + character(len=*), intent(in) :: filename !! Checkpoint file path + integer, intent(out) :: timestep !! Timestep from checkpoint + real(dp), intent(out) :: restart_time !! Simulation time from checkpoint + integer, intent(in) :: comm !! MPI communicator type(reader_session_t) :: reader_session integer :: ierr, myrank, data_loc @@ -456,13 +527,28 @@ end subroutine restart_checkpoint subroutine write_fields( & self, field_names, host_fields, solver, writer_session, data_loc & ) - !! Write field data for checkpoints (no striding) - class(checkpoint_manager_t), intent(inout) :: self - character(len=*), dimension(:), intent(in) :: field_names - class(field_ptr_t), dimension(:), target, intent(in) :: host_fields - class(solver_t), intent(in) :: solver - type(writer_session_t), intent(inout) :: writer_session - integer, intent(in) :: data_loc + !! Write velocity field data to checkpoint file (internal). + !! + !! Writes field data at full resolution (no striding for checkpoints). + !! The procedure: + !! 1. Prepare field buffers for full resolution output + !! 2. Calculate output dimensions and hyperslab selection + !! 3. For each field (u, v, w): + !! - Copy field data to output buffer + !! - Write buffer to file with proper hyperslab parameters + !! + !! **Full Resolution:** Unlike snapshots (which can be strided), + !! checkpoints always write full-resolution data to enable exact restart. + !! + !! **Parallel I/O:** Each MPI rank writes its local subdomain using + !! hyperslab selection (output_start, output_count) to assemble the + !! global field in the file. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance + character(len=*), dimension(:), intent(in) :: field_names !! Field names ["u", "v", "w"] + class(field_ptr_t), dimension(:), target, intent(in) :: host_fields !! Field pointers + class(solver_t), intent(in) :: solver !! 
Solver containing mesh info + type(writer_session_t), intent(inout) :: writer_session !! I/O writer session + integer, intent(in) :: data_loc !! Data location (VERT or CELL) integer :: i_field integer(i8), dimension(3) :: output_start, output_count @@ -505,15 +591,22 @@ subroutine write_fields( & end subroutine write_fields subroutine cleanup_output_buffers(self) - !! Clean up dynamic field buffers - class(checkpoint_manager_t), intent(inout) :: self + !! Clean up dynamically allocated field buffers (internal). + !! + !! Frees memory allocated for field I/O buffers. Called during + !! finalisation to prevent memory leaks. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance call cleanup_field_buffers(self%field_buffers) end subroutine cleanup_output_buffers subroutine finalise(self) - !! Clean up checkpoint manager - class(checkpoint_manager_t), intent(inout) :: self + !! Finalise checkpoint manager and free resources. + !! + !! Cleans up all dynamically allocated buffers. Should be called + !! at the end of simulation or when checkpoint manager is no longer + !! needed. + class(checkpoint_manager_t), intent(inout) :: self !! Checkpoint manager instance call self%cleanup_output_buffers() end subroutine finalise diff --git a/src/io/dummy/io.f90 b/src/io/dummy/io.f90 index 69dbb3c73..0e282fbbe 100644 --- a/src/io/dummy/io.f90 +++ b/src/io/dummy/io.f90 @@ -1,20 +1,31 @@ module m_io_backend -!! @brief Provides a dummy, non-functional I/O backend for when an I/O backend -!! is not available -!! -!! @details This module provides a fallback implementation of the I/O backend -!! interface. It is used when no real I/O backend (e.g. ADIOS2) is enabled at -!! compile time. -!! -!! The primary purpose of this dummy backend is to allow the full program to -!! compile and link against the session interface (`m_io_session`) without -!! requiring a functional I/O library. -!! -!! @warning This is a non-functional stub. 
Calling any of its I/O procedures -!! will immediately terminate the program with an error message. -!! -!! @note If you require file I/O, you must recompile the code with a functional -!! backend + !! Dummy (non-functional) I/O backend for when no real backend is available. + !! + !! This module provides a fallback implementation of the I/O backend + !! interface used when no real I/O backend (e.g., ADIOS2) is enabled at + !! compile time. It allows the code to compile and link without a functional + !! I/O library. + !! + !! **Purpose:** + !! + !! - Enables compilation without external I/O library dependencies + !! - Provides informative error messages when I/O operations are attempted + !! - Allows code structure to remain consistent regardless of I/O backend + !! + !! **Behaviour:** + !! + !! - Write operations are silently ignored (no-op) + !! - Read operations terminate with error message directing user to recompile + !! - File open/close operations are tracked but perform no actual I/O + !! + !! **Use Cases:** + !! + !! - Testing/debugging without I/O overhead + !! - Systems where ADIOS2 is unavailable + !! - Dry runs to validate simulation setup + !! + !! **Warning:** This is a non-functional stub. If you require actual file I/O, + !! recompile with `-DWITH_ADIOS2=ON` to enable the ADIOS2 backend. use iso_fortran_env, only: stderr => error_unit use m_io_base, only: io_reader_t, io_writer_t, io_file_t, io_mode_read, & io_mode_write @@ -26,45 +37,48 @@ module m_io_backend public :: allocate_io_reader, allocate_io_writer public :: get_default_backend, IO_BACKEND_DUMMY, IO_BACKEND_ADIOS2 - logical, save :: write_warning_shown = .false. + logical, save :: write_warning_shown = .false. !! Track if warning has been displayed - integer, parameter :: IO_BACKEND_DUMMY = 0 - integer, parameter :: IO_BACKEND_ADIOS2 = 1 + integer, parameter :: IO_BACKEND_DUMMY = 0 !! Dummy backend identifier + integer, parameter :: IO_BACKEND_ADIOS2 = 1 !! 
ADIOS2 backend identifier type, extends(io_file_t) :: io_dummy_file_t - logical :: is_open = .false. + !! Dummy file handle (tracks state but performs no I/O). + logical :: is_open = .false. !! File open state flag contains - procedure :: close => file_close_dummy - procedure :: begin_step => file_begin_step_dummy - procedure :: end_step => file_end_step_dummy - procedure :: is_file_functional => is_file_functional_dummy + procedure :: close => file_close_dummy !! Close file (no-op) + procedure :: begin_step => file_begin_step_dummy !! Begin step (no-op) + procedure :: end_step => file_end_step_dummy !! End step (no-op) + procedure :: is_file_functional => is_file_functional_dummy !! Check if functional end type io_dummy_file_t type, extends(io_reader_t) :: io_dummy_reader_t - logical :: initialised = .false. + !! Dummy reader (errors on read attempts). + logical :: initialised = .false. !! Initialisation state flag contains - procedure :: init => reader_init_dummy - procedure :: open => reader_open_dummy - procedure :: finalise => reader_finalise_dummy - procedure :: read_data_i8 => read_data_i8_dummy - procedure :: read_data_integer => read_data_integer_dummy - procedure :: read_data_real => read_data_real_dummy - procedure :: read_data_array_3d => read_data_array_3d_dummy + procedure :: init => reader_init_dummy !! Initialise reader + procedure :: open => reader_open_dummy !! Open file (returns non-functional handle) + procedure :: finalise => reader_finalise_dummy !! Finalise (no-op) + procedure :: read_data_i8 => read_data_i8_dummy !! Read i8 (errors) + procedure :: read_data_integer => read_data_integer_dummy !! Read integer (errors) + procedure :: read_data_real => read_data_real_dummy !! Read real (errors) + procedure :: read_data_array_3d => read_data_array_3d_dummy !! Read 3D array (errors) end type io_dummy_reader_t type, extends(io_writer_t) :: io_dummy_writer_t - logical :: initialised = .false. + !! Dummy writer (silently ignores write operations). 
+ logical :: initialised = .false. !! Initialisation state flag contains - procedure :: init => writer_init_dummy - procedure :: open => writer_open_dummy - procedure :: finalise => writer_finalise_dummy - procedure :: write_data_i8 => write_data_i8_dummy - procedure :: write_data_integer => write_data_integer_dummy - procedure :: write_data_real => write_data_real_dummy - procedure :: write_data_array_3d => write_data_array_3d_dummy - procedure :: write_attribute_string => write_attribute_string_dummy + procedure :: init => writer_init_dummy !! Initialise writer + procedure :: open => writer_open_dummy !! Open file (returns non-functional handle) + procedure :: finalise => writer_finalise_dummy !! Finalise (no-op) + procedure :: write_data_i8 => write_data_i8_dummy !! Write i8 (no-op) + procedure :: write_data_integer => write_data_integer_dummy !! Write integer (no-op) + procedure :: write_data_real => write_data_real_dummy !! Write real (no-op) + procedure :: write_data_array_3d => write_data_array_3d_dummy !! Write 3D array (no-op) + procedure :: write_attribute_string => write_attribute_string_dummy !! Write string attribute (no-op) procedure :: write_attribute_array_1d_real => & - write_attribute_array_1d_real_dummy + write_attribute_array_1d_real_dummy !! Write 1D real array attribute (no-op) end type io_dummy_writer_t contains diff --git a/src/io/io_base.f90 b/src/io/io_base.f90 index c6860409f..84537be06 100644 --- a/src/io/io_base.f90 +++ b/src/io/io_base.f90 @@ -1,32 +1,31 @@ module m_io_base -!! @brief Provides the abstract base types and interfaces for the session-based -!! I/O architecture. -!! -!! @details This internal module defines the fundamental building blocks of -!! the I/O system. It establishes a polymorphic layer that allows the -!! high-level user session to interact with various I/O backends through a -!! consistent interface. -!! -!! The architecture is designed in distinct layers: -!! User code -!! 
- interacts only with the Session layer -!! -!! Session layer (`m_io_session`) -!! - manages all I/O complexity (file handles, state, etc.) -!! - instantiates the I/O backend selected at compile-time -!! - provides `reader_session_t` and `writer_session_t` for users -!! -!! Backend layer (`m_io_backend`) -!! - concrete implementation of an I/O backed (e.g., ADIOS2) -!! - extends the abstract base types defined in this module -!! -!! Base layer (`m_io_base`, this module) -!! - provides abstract `reader_base_t` and `writer_base_t` types -!! - enforces a consistent interface for all backends -!! -!! @note This is an internal module and should not be used directly by users. -!! The sole public interface for I/O is the high-level session API provided in -!! `m_io_session`. + !! Abstract base types and interfaces for session-based I/O architecture. + !! + !! This internal module defines the fundamental building blocks of the I/O + !! system. It establishes a polymorphic layer that allows the high-level + !! user session to interact with various I/O backends (e.g., ADIOS2, dummy) + !! through a consistent interface. + !! + !! **Architecture Layers:** + !! + !! 1. **User Code** - interacts only with the Session layer + !! + !! 2. **Session Layer** (`m_io_session`) + !! - Manages all I/O complexity (file handles, state, etc.) + !! - Instantiates the I/O backend selected at compile-time + !! - Provides `reader_session_t` and `writer_session_t` for users + !! + !! 3. **Backend Layer** (`m_io_backend`) + !! - Concrete implementation of an I/O backend (e.g., ADIOS2) + !! - Extends the abstract base types defined in this module + !! + !! 4. **Base Layer** (`m_io_base`, this module) + !! - Provides abstract `io_reader_t` and `io_writer_t` types + !! - Enforces a consistent interface for all backends + !! + !! **Note:** This is an internal module and should not be used directly by + !! users. The sole public interface for I/O is the high-level session API + !! 
provided in `m_io_session`. use m_common, only: dp, i8 @@ -36,50 +35,64 @@ module m_io_base public :: io_reader_t, io_writer_t, io_file_t public :: io_mode_read, io_mode_write - integer, parameter :: io_mode_read = 1 - integer, parameter :: io_mode_write = 2 + integer, parameter :: io_mode_read = 1 !! Read mode flag for opening files + integer, parameter :: io_mode_write = 2 !! Write mode flag for opening files - !> Base file handle for I/O operations type :: io_file_t + !! Base file handle for I/O operations. + !! + !! This abstract type represents an open file handle. Concrete backends + !! extend this type to implement backend-specific file operations. + !! Provides step-based I/O for time-series data. contains - procedure :: close => base_close - procedure :: begin_step => base_begin_step - procedure :: end_step => base_end_step - procedure :: is_file_functional => base_is_file_functional + procedure :: close => base_close !! Close the file + procedure :: begin_step => base_begin_step !! Begin a new I/O step + procedure :: end_step => base_end_step !! End current I/O step + procedure :: is_file_functional => base_is_file_functional !! Check if file is operational end type io_file_t - !> Base I/O reader type for polymorphic usage type :: io_reader_t + !! Base I/O reader type for polymorphic usage. + !! + !! This abstract type provides the interface for reading data from files. + !! Concrete backends (e.g., ADIOS2) extend this type to implement + !! backend-specific reading operations. Supports reading scalars and + !! 3D arrays with optional hyperslab selection. contains - procedure :: init => base_reader_init - procedure :: open => base_reader_open - procedure :: finalise => base_reader_finalise + procedure :: init => base_reader_init !! Initialise reader + procedure :: open => base_reader_open !! Open file for reading + procedure :: finalise => base_reader_finalise !! Finalise and clean up ! 
Generic interfaces for session usage generic :: read_data => read_data_i8, read_data_integer, read_data_real, & - read_data_array_3d - procedure :: read_data_i8 - procedure :: read_data_integer - procedure :: read_data_real - procedure :: read_data_array_3d + read_data_array_3d !! Read data (generic interface) + procedure :: read_data_i8 !! Read 64-bit integer + procedure :: read_data_integer !! Read default integer + procedure :: read_data_real !! Read double precision real + procedure :: read_data_array_3d !! Read 3D array end type io_reader_t - !> Base I/O writer type for polymorphic usage type :: io_writer_t + !! Base I/O writer type for polymorphic usage. + !! + !! This abstract type provides the interface for writing data to files. + !! Concrete backends (e.g., ADIOS2) extend this type to implement + !! backend-specific writing operations. Supports writing scalars, + !! 3D arrays, and attributes. contains - procedure :: init => base_writer_init - procedure :: open => base_writer_open - procedure :: finalise => base_writer_finalise + procedure :: init => base_writer_init !! Initialise writer + procedure :: open => base_writer_open !! Open file for writing + procedure :: finalise => base_writer_finalise !! Finalise and clean up generic :: write_data => write_data_i8, write_data_integer, & write_data_real, & - write_data_array_3d - procedure :: write_data_i8 - procedure :: write_data_integer - procedure :: write_data_real - procedure :: write_data_array_3d + write_data_array_3d !! Write data (generic interface) + procedure :: write_data_i8 !! Write 64-bit integer + procedure :: write_data_integer !! Write default integer + procedure :: write_data_real !! Write double precision real + procedure :: write_data_array_3d !! Write 3D array generic :: write_attribute => write_attribute_string, & - write_attribute_array_1d_real - procedure :: write_attribute_string - procedure :: write_attribute_array_1d_real + write_attribute_array_1d_real !! 
Write attribute (generic interface) + procedure :: write_attribute_string !! Write string attribute + procedure :: write_attribute_array_1d_real !! Write 1D real array attribute end type io_writer_t contains diff --git a/src/io/io_field_utils.f90 b/src/io/io_field_utils.f90 index be8911a7b..8ebb3dc17 100644 --- a/src/io/io_field_utils.f90 +++ b/src/io/io_field_utils.f90 @@ -1,10 +1,11 @@ module m_io_field_utils -!! @brief Provides common utilities and helper routines for field I/O -!! operations +!! Common utilities and helper routines for field I/O operations. +!! +!! This module contains a collection of procedures and derived types that +!! handle the low-level tasks required for writing field data. +!! +!! **Primary functionalities:** !! -!! @details This module contains a collection of procedures and derived -!! types that handle the low-level tasks required for writing field data -!! Its primary functionalities include: !! - Data sub-sampling (striding) - applying a stride to data to reduce the !! size of the output files !! - Parallel I/O calculations - determining correct global shapes, @@ -27,14 +28,56 @@ module m_io_field_utils cleanup_field_buffers type :: field_buffer_map_t - ! Race-free field buffer mapping for async I/O operations. - ! Each field gets its own dedicated buffer to prevent data races - ! when multiple async write operations are in flight. + !! Named buffer for thread-safe asynchronous I/O operations. + !! + !! This type maps a field name to its dedicated memory buffer, preventing + !! data races when multiple asynchronous write operations are in flight + !! simultaneously. + !! + !! **Purpose:** + !! + !! During asynchronous I/O, fields are copied into persistent buffers that + !! remain valid while I/O operations execute in the background. Each field + !! gets its own buffer identified by name, ensuring: + !! + !! - **Thread safety**: No conflicts between concurrent writes + !! 
- **Data integrity**: Field data remains stable during async operations + !! - **Flexibility**: Supports strided/downsampled data for visualization + !! + !! **Workflow:** + !! + !! 1. `prepare_field_buffers`: Allocate buffers for all fields + !! 2. `write_single_field_to_buffer`: Copy field data into named buffer + !! 3. ADIOS2 writes from buffer (async, non-blocking) + !! 4. `cleanup_field_buffers`: Deallocate buffers when done + !! + !! **Components:** + !! + !! - `field_name`: Identifier for buffer lookup (e.g., "u", "v", "w", "p") + !! - `buffer`: 3D array holding field data (possibly strided) character(len=32) :: field_name real(dp), dimension(:, :, :), allocatable :: buffer end type field_buffer_map_t type :: field_ptr_t + !! Wrapper type for storing polymorphic field pointers in arrays. + !! + !! Fortran does not allow allocatable arrays of polymorphic pointers directly + !! (e.g., `class(field_t), pointer :: fields(:)`), so this wrapper type + !! enables creating arrays of field pointers: + !! + !! ```fortran + !! type(field_ptr_t), allocatable :: field_array(:) + !! ``` + !! + !! **Use cases:** + !! + !! - Managing multiple fields for I/O operations + !! - Storing references to velocity components (u, v, w) + !! - Building lists of fields to write/read simultaneously + !! + !! **Note:** Each `field_ptr_t` holds a pointer to a `field_t` object; + !! the pointer can be null if not yet associated. class(field_t), pointer :: ptr => null() end type field_ptr_t diff --git a/src/io/io_manager.f90 b/src/io/io_manager.f90 index a9b50a72f..18dab197c 100644 --- a/src/io/io_manager.f90 +++ b/src/io/io_manager.f90 @@ -1,12 +1,29 @@ module m_io_manager -!! @brief Provides a high-level manager that orchestrates all checkpoint and -!! snapshot operations. -!! -!! @details This module acts as a facade to the I/O subsystem. -!! Its purpose is to simplify the main simulation loop by providing -!! a single point of contact for all I/O-related actions. 
The mainprogram only -!! needs to interact with the `io_manager_t` type, which then delegates tasks -!! to the specialised checkpoint and snapshot managers. + !! High-level manager orchestrating checkpoint and snapshot operations. + !! + !! This module acts as a facade to the I/O subsystem, simplifying the main + !! simulation loop by providing a single point of contact for all I/O-related + !! actions. The main program only needs to interact with `io_manager_t`, which + !! delegates tasks to specialised checkpoint and snapshot managers. + !! + !! **Responsibilities:** + !! + !! - Initialise checkpoint and snapshot managers + !! - Coordinate restart from checkpoints + !! - Orchestrate periodic checkpoint and snapshot writes + !! - Finalise I/O operations and clean up resources + !! + !! **Usage Pattern:** + !! + !! ```fortran + !! type(io_manager_t) :: io_mgr + !! call io_mgr%init(comm) + !! if (io_mgr%is_restart()) call io_mgr%handle_restart(solver, comm) + !! do timestep = 1, n_steps + !! call io_mgr%handle_io_step(solver, timestep, comm) + !! end do + !! call io_mgr%finalise() + !! ``` use m_checkpoint_manager, only: checkpoint_manager_t use m_snapshot_manager, only: snapshot_manager_t use m_solver, only: solver_t @@ -17,53 +34,90 @@ module m_io_manager public :: io_manager_t type :: io_manager_t - type(checkpoint_manager_t) :: checkpoint_mgr - type(snapshot_manager_t) :: snapshot_mgr + !! Unified manager for checkpoint and snapshot operations. + !! + !! Contains both checkpoint and snapshot managers and provides + !! a simplified interface for the main simulation loop. + type(checkpoint_manager_t) :: checkpoint_mgr !! Manages restart and checkpoint files + type(snapshot_manager_t) :: snapshot_mgr !! 
Manages visualisation output files contains - procedure :: init => io_init - procedure :: handle_restart => io_handle_restart - procedure :: handle_io_step => io_handle_step - procedure :: finalise => io_finalise - procedure :: is_restart => io_is_restart + procedure :: init => io_init !! Initialise I/O managers + procedure :: handle_restart => io_handle_restart !! Load restart data if needed + procedure :: handle_io_step => io_handle_step !! Process checkpoints/snapshots for timestep + procedure :: finalise => io_finalise !! Finalise and clean up + procedure :: is_restart => io_is_restart !! Check if simulation is restarting end type io_manager_t contains subroutine io_init(self, comm) - class(io_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise checkpoint and snapshot managers. + !! + !! Sets up both managers by passing the MPI communicator. Each manager + !! reads its configuration and prepares for I/O operations. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + integer, intent(in) :: comm !! MPI communicator call self%checkpoint_mgr%init(comm) call self%snapshot_mgr%init(comm) end subroutine io_init subroutine io_handle_restart(self, solver, comm) - class(io_manager_t), intent(inout) :: self - class(solver_t), intent(inout) :: solver - integer, intent(in), optional :: comm + !! Handle restart by loading checkpoint data. + !! + !! Delegates to the checkpoint manager to load solver state from + !! the most recent checkpoint file. Should only be called if + !! `is_restart()` returns true. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + class(solver_t), intent(inout) :: solver !! Solver to load state into + integer, intent(in), optional :: comm !! 
MPI communicator (optional) call self%checkpoint_mgr%handle_restart(solver, comm) end subroutine io_handle_restart subroutine io_handle_step(self, solver, timestep, comm) - class(io_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Handle I/O operations for current timestep. + !! + !! Checks if checkpoint or snapshot output is required at this timestep + !! and writes data accordingly. Typically called at the end of each + !! timestep in the main simulation loop. + implicit none + + class(io_manager_t), intent(inout) :: self !! I/O manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) call self%checkpoint_mgr%handle_checkpoint_step(solver, timestep, comm) call self%snapshot_mgr%handle_snapshot_step(solver, timestep, comm) end subroutine io_handle_step function io_is_restart(self) result(is_restart) - class(io_manager_t), intent(in) :: self - logical :: is_restart + !! Check if simulation is restarting from checkpoint. + !! + !! Queries the checkpoint manager to determine if a restart file + !! exists and should be loaded. + implicit none + + class(io_manager_t), intent(in) :: self !! I/O manager instance + logical :: is_restart !! True if restarting from checkpoint is_restart = self%checkpoint_mgr%is_restart() end function io_is_restart subroutine io_finalise(self) - class(io_manager_t), intent(inout) :: self + !! Finalise I/O operations and clean up resources. + !! + !! Closes any open files and releases resources held by both + !! checkpoint and snapshot managers. Should be called at the end + !! of the simulation. + implicit none + + class(io_manager_t), intent(inout) :: self !! 
I/O manager instance call self%checkpoint_mgr%finalise() call self%snapshot_mgr%finalise() diff --git a/src/io/io_session.f90 b/src/io/io_session.f90 index 9eae14f7b..21c0de7e7 100644 --- a/src/io/io_session.f90 +++ b/src/io/io_session.f90 @@ -1,12 +1,12 @@ module m_io_session -!! @brief Provides high-level, session-based user interface for all I/O -!! operations +!! High-level, session-based user interface for all I/O operations. !! -!! @details This module is the sole entry point for file reading and writing. +!! This module is the sole entry point for file reading and writing. !! It abstracts away all backend details and provides a type-safe interface !! for all I/O tasks. !! -!! Key features: +!! **Key features:** +!! !! - Type-safe sessions: specialised `reader_session_t` and `writer_session_t` !! types for reading and writing operations, respectively. !! - Automatic backend selection: based on compile-time options @@ -16,10 +16,11 @@ module m_io_session !! `open -> read/write -> close` workflow, with no need for manual file handle !! management or explicit cleanup calls. !! -!! @example -!! A typical usage pattern for reading data and writing data: +!! **Usage Example:** +!! +!! A typical usage pattern for reading and writing data: !! -!! @code{.f90} +!! ```fortran !! use m_io_session, only: writer_session_t, reader_session_t !! !! implicit none @@ -39,9 +40,9 @@ module m_io_session !! call reader%read_data("temperature", temp_field) !! call reader%close() !! ! Note: reader is automatically cleaned up when it goes out of scope -!! @endcode +!! ``` !! -!! @note Users should only use the types provided by this module. The lower-level +!! **Note:** Users should only use the types provided by this module. The lower-level !! modules like `m_io_base` and `m_io_backend` are internal components and should !! never be used directly in user code. 
use m_common, only: dp, i8 @@ -68,16 +69,19 @@ module m_io_session procedure :: close => session_base_close end type io_session_base_t - !> **PRIMARY TYPE FOR READING DATA** - Use this for all file reading operations + !> PRIMARY TYPE FOR READING DATA - Use this for all file reading operations !! This is the only interface users should use for reading data. !! Provides type-safe reading operations with automatic backend selection. !! - !! Usage example: + !! **Usage example:** + !! + !! ```fortran !! type(reader_session_t) :: reader_session !! call reader_session%open("checkpoint.bp", MPI_COMM_WORLD) !! call reader_session%read_data("timestep", timestep) !! call reader_session%read_data("velocity_u", u_field, start_dims, count_dims) !! call reader_session%close() + !! ``` type, extends(io_session_base_t) :: reader_session_t private class(io_reader_t), allocatable :: reader @@ -94,18 +98,20 @@ module m_io_session final :: reader_session_finaliser end type reader_session_t - !> **PRIMARY TYPE FOR WRITING DATA** - Use this for all file writing operations + !> PRIMARY TYPE FOR WRITING DATA - Use this for all file writing operations !! This is the only interface users should use for writing data. !! Provides type-safe writing operations with automatic backend selection. !! - !! Usage example: - !! type(writer_session_t) :: writer_session - !! call writer_session%open("output.bp", MPI_COMM_WORLD) - !! call writer_session%write_data("timestep", current_step) - !! call writer_session%write_data("pressure", p_field, start_dims, count_dims) - !! call writer_session%close() - !! call writer_session%write_attribute("ParaView", "vtk_xml_content") - !! call writer_session%close() + !! **Usage example:** + !! + !! ```fortran + !! type(writer_session_t) :: writer_session + !! call writer_session%open("output.bp", MPI_COMM_WORLD) + !! call writer_session%write_data("timestep", current_step) + !! call writer_session%write_data("pressure", p_field, start_dims, count_dims) + !! 
call writer_session%close() + !! call writer_session%write_attribute("ParaView", "vtk_xml_content") + !! ``` type, extends(io_session_base_t) :: writer_session_t private class(io_writer_t), allocatable :: writer diff --git a/src/io/snapshot_manager.f90 b/src/io/snapshot_manager.f90 index 68ddfa841..1d03052a0 100644 --- a/src/io/snapshot_manager.f90 +++ b/src/io/snapshot_manager.f90 @@ -1,11 +1,33 @@ module m_snapshot_manager -!! @brief Manages the creation of simulation snapshots for post-processing -!! and visualisation. -!! -!! @details This module is responsible for periodically writing simulation -!! data to files intended for analysis and visualisation -!! Unlike checkpoints, which are always full-resolution for exact restarts, -!! snapshots can be strided to reduce file size. + !! Manages creation of simulation snapshots for post-processing and visualisation. + !! + !! This module periodically writes simulation data to files intended for + !! analysis and visualisation. Unlike checkpoints (full-resolution for exact + !! restarts), snapshots can be strided to reduce file size while retaining + !! sufficient resolution for visualisation. + !! + !! **Key Differences from Checkpoints:** + !! + !! - **Purpose**: Visualisation/analysis vs exact restart + !! - **Resolution**: Can be strided (e.g., every 2nd point) vs full resolution + !! - **Frequency**: Typically more frequent than checkpoints + !! - **File Management**: Single persistent file with multiple timesteps vs + !! separate files per checkpoint + !! + !! **Features:** + !! + !! - Configurable spatial striding to reduce output size + !! - Persistent file handle (stays open across multiple writes) + !! - Generates VTK-compatible XML for ParaView visualisation + !! - Writes velocity fields at each snapshot interval + !! + !! **Configuration:** + !! + !! Controlled via `checkpoint_config_t` read from input namelist: + !! + !! - `snapshot_freq`: write interval (iterations) + !! 
- `snapshot_prefix`: filename prefix + !! - `output_stride`: spatial stride factors [`sx`, `sy`, `sz`] use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank use m_common, only: dp, i8, DIR_C, VERT, get_argument use m_field, only: field_t @@ -24,34 +46,43 @@ module m_snapshot_manager public :: snapshot_manager_t type :: snapshot_manager_t - type(checkpoint_config_t) :: config - integer, dimension(3) :: output_stride = [1, 1, 1] - type(field_buffer_map_t), allocatable :: field_buffers(:) - integer(i8), dimension(3) :: last_shape_dims = 0 - integer, dimension(3) :: last_stride_factors = 0 - integer(i8), dimension(3) :: last_output_shape = 0 - character(len=4096) :: vtk_xml = "" - logical :: is_snapshot_file_open = .false. - type(writer_session_t) :: snapshot_writer - logical :: convert_to_sp = .false. !! Flag for single precision snapshots + !! Manager for snapshot file operations (periodic visualisation output). + !! + !! Handles periodic writing of visualisation data with optional striding. + !! Maintains a persistent file handle that stays open across multiple + !! snapshot writes for efficient I/O. + type(checkpoint_config_t) :: config !! Configuration settings + integer, dimension(3) :: output_stride = [1, 1, 1] !! Spatial stride factors [sx, sy, sz] + type(field_buffer_map_t), allocatable :: field_buffers(:) !! Buffers for field data I/O + integer(i8), dimension(3) :: last_shape_dims = 0 !! Shape dimensions from last write + integer, dimension(3) :: last_stride_factors = 0 !! Stride factors from last write + integer(i8), dimension(3) :: last_output_shape = 0 !! Output shape from last write + character(len=4096) :: vtk_xml = "" !! VTK XML metadata for ParaView + logical :: is_snapshot_file_open = .false. !! File handle state flag + type(writer_session_t) :: snapshot_writer !! I/O session writer + logical :: convert_to_sp = .false. !! 
Flag for single precision snapshots contains - procedure :: init - procedure :: handle_snapshot_step - procedure :: finalise - procedure, private :: write_snapshot - procedure, private :: write_fields - procedure, private :: cleanup_output_buffers - procedure, private :: generate_vtk_xml - procedure, private :: open_snapshot_file - procedure, private :: close_snapshot_file + procedure :: init !! Initialise snapshot manager + procedure :: handle_snapshot_step !! Write snapshot if needed at timestep + procedure :: finalise !! Clean up and finalise + procedure, private :: write_snapshot !! Write snapshot file (internal) + procedure, private :: write_fields !! Write field data to file (internal) + procedure, private :: cleanup_output_buffers !! Free output buffers (internal) + procedure, private :: generate_vtk_xml !! Generate VTK XML metadata (internal) + procedure, private :: open_snapshot_file !! Open snapshot file (internal) + procedure, private :: close_snapshot_file !! Close snapshot file (internal) end type snapshot_manager_t contains subroutine init(self, comm) - !! Initialise snapshot manager - class(snapshot_manager_t), intent(inout) :: self - integer, intent(in) :: comm + !! Initialise snapshot manager from configuration. + !! + !! Reads snapshot settings from input namelist and configures + !! output if snapshot frequency is positive. Prints snapshot + !! settings including stride factors on root process. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer, intent(in) :: comm !! MPI communicator self%config = checkpoint_config_t() call self%config%read(nml_file=get_argument(1)) @@ -62,10 +93,13 @@ subroutine init(self, comm) end subroutine init subroutine configure_output(self, comm) - !! Configure snapshot output settings + !! Configure and print snapshot output settings. + !! + !! Displays snapshot configuration on root process including + !! frequency, file prefix, and output stride factors. 
use m_io_backend, only: get_default_backend, IO_BACKEND_DUMMY - class(snapshot_manager_t), intent(inout) :: self - integer, intent(in) :: comm + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer, intent(in) :: comm !! MPI communicator integer :: myrank, ierr @@ -84,11 +118,15 @@ subroutine configure_output(self, comm) end subroutine configure_output subroutine handle_snapshot_step(self, solver, timestep, comm) - !! Handle snapshot writing at a given timestep - class(snapshot_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in), optional :: comm + !! Write snapshot if frequency condition is met. + !! + !! Checks if current timestep is a snapshot interval (divisible by + !! snapshot_freq) and writes snapshot if so. Called each timestep + !! from main simulation loop. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + class(solver_t), intent(in) :: solver !! Solver containing current state + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in), optional :: comm !! MPI communicator (optional) integer :: comm_to_use @@ -99,13 +137,15 @@ subroutine handle_snapshot_step(self, solver, timestep, comm) end subroutine handle_snapshot_step subroutine write_snapshot(self, solver, timestep, comm) - !! Write a snapshot file for visualisation - !! Uses a persistent file that stays open across multiple snapshots - !! Each snapshot is written as a separate timestep in the file - class(snapshot_manager_t), intent(inout) :: self - class(solver_t), intent(in) :: solver - integer, intent(in) :: timestep - integer, intent(in) :: comm + !! Write a snapshot file for visualisation. + !! + !! Uses a persistent file that stays open across multiple snapshots. + !! Each snapshot is written as a separate timestep within the file. + !! Data can be strided according to output_stride configuration. 
+ class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + class(solver_t), intent(in) :: solver !! Solver containing field data + integer, intent(in) :: timestep !! Current timestep number + integer, intent(in) :: comm !! MPI communicator character(len=*), parameter :: field_names(*) = ["u", "v", "w"] integer :: myrank, ierr @@ -179,11 +219,27 @@ end subroutine write_snapshot subroutine generate_vtk_xml(self, dims, fields, origin, spacing) - !! Generate VTK XML string for ImageData format for ParaView's ADIOS2VTXReader - class(snapshot_manager_t), intent(inout) :: self - integer(i8), dimension(3), intent(in) :: dims - character(len=*), dimension(:), intent(in) :: fields - real(dp), dimension(3), intent(in) :: origin, spacing + !! Generate VTK XML metadata for ParaView visualisation (internal). + !! + !! Creates VTK ImageData XML string that describes the structured grid + !! for ParaView's ADIOS2VTXReader. This enables direct visualisation of + !! ADIOS2 files in ParaView without conversion. + !! + !! **VTK ImageData Format:** + !! + !! - Defines structured rectilinear grid with uniform spacing + !! - Extent: grid dimensions from 0 to N-1 in (z,y,x) order + !! - Origin: physical coordinates of first grid point + !! - Spacing: grid resolution (dx, dy, dz) + !! - Point data: velocity fields (u, v, w) stored at grid points + !! + !! **Note:** VTK uses (x,y,z) order while X3D2 uses (z,y,x) internally, + !! requiring dimension reordering in the extent string. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + integer(i8), dimension(3), intent(in) :: dims !! Grid dimensions [nx, ny, nz] + character(len=*), dimension(:), intent(in) :: fields !! Field names ["u", "v", "w"] + real(dp), dimension(3), intent(in) :: origin !! Grid origin [x0, y0, z0] + real(dp), dimension(3), intent(in) :: spacing !! 
Grid spacing [dx, dy, dz] character(len=4096) :: xml character(len=96) :: extent_str, origin_str, spacing_str @@ -223,13 +279,28 @@ end subroutine generate_vtk_xml subroutine write_fields( & self, field_names, host_fields, solver, writer_session, data_loc & ) - !! Write field data with striding for snapshots - class(snapshot_manager_t), intent(inout) :: self - character(len=*), dimension(:), intent(in) :: field_names - class(field_ptr_t), dimension(:), target, intent(in) :: host_fields - class(solver_t), intent(in) :: solver - type(writer_session_t), intent(inout) :: writer_session - integer, intent(in) :: data_loc + !! Write field data with optional striding for snapshots (internal). + !! + !! Writes field data with spatial striding to reduce file size while + !! maintaining sufficient resolution for visualisation. The procedure: + !! 1. Prepare field buffers with configured stride factors + !! 2. Calculate strided output dimensions and hyperslab selection + !! 3. For each field (u, v, w): + !! - Copy strided field data to output buffer + !! - Write buffer to file with proper hyperslab parameters + !! + !! **Striding:** Unlike checkpoints (full resolution), snapshots can + !! subsample data. For example, stride [2,2,2] writes every 2nd point + !! in each direction, reducing file size by a factor of 8. + !! + !! **Parallel I/O:** Each MPI rank writes its strided local subdomain + !! using hyperslab selection to assemble the strided global field. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + character(len=*), dimension(:), intent(in) :: field_names !! Field names ["u", "v", "w"] + class(field_ptr_t), dimension(:), target, intent(in) :: host_fields !! Field pointers + class(solver_t), intent(in) :: solver !! Solver containing mesh info + type(writer_session_t), intent(inout) :: writer_session !! I/O writer session + integer, intent(in) :: data_loc !! 
Data location (VERT or CELL) integer :: i_field integer(i8), dimension(3) :: output_start, output_count @@ -272,26 +343,49 @@ end subroutine write_fields subroutine cleanup_output_buffers(self) - !! Clean up dynamic field buffers - class(snapshot_manager_t), intent(inout) :: self + !! Clean up dynamically allocated field buffers (internal). + !! + !! Frees memory allocated for field I/O buffers. Called during + !! finalisation to prevent memory leaks. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance call cleanup_field_buffers(self%field_buffers) end subroutine cleanup_output_buffers subroutine finalise(self) - !! Clean up snapshot manager - class(snapshot_manager_t), intent(inout) :: self + !! Finalise snapshot manager and free resources. + !! + !! Cleans up all dynamically allocated buffers and closes the + !! persistent snapshot file. Should be called at the end of + !! the simulation or when snapshot manager is no longer needed. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance call self%cleanup_output_buffers() call self%close_snapshot_file() end subroutine finalise subroutine open_snapshot_file(self, filename, comm) - !! Open a persistent snapshot file - !! ADIOS2 handles both creating new files and appending to existing ones - class(snapshot_manager_t), intent(inout) :: self - character(len=*), intent(in) :: filename - integer, intent(in) :: comm + !! Open persistent snapshot file for appending timesteps (internal). + !! + !! Opens or creates a snapshot file that remains open across multiple + !! snapshot writes. Each snapshot is written as a new timestep within + !! the same file, enabling efficient time-series visualisation. + !! + !! **Persistent File Strategy:** + !! + !! - File opened once at first snapshot + !! - Remains open for subsequent snapshots (append mode) + !! - Each write adds a new timestep to the file + !! - Closed only during finalisation + !! + !! 
**Benefits:** Reduces file open/close overhead and keeps all snapshots + !! in a single file for easy ParaView animation. + !! + !! **ADIOS2 Behaviour:** Automatically handles both creating new files + !! and appending to existing ones based on file existence. + class(snapshot_manager_t), intent(inout) :: self !! Snapshot manager instance + character(len=*), intent(in) :: filename !! Snapshot file path + integer, intent(in) :: comm !! MPI communicator logical :: file_exists integer :: myrank, ierr diff --git a/src/mesh.f90 b/src/mesh.f90 index bb0dac1c7..a9b625654 100644 --- a/src/mesh.f90 +++ b/src/mesh.f90 @@ -1,4 +1,11 @@ module m_mesh + !! Mesh module providing high-level mesh management and query functions. + !! + !! This module defines the `mesh_t` type which aggregates geometry, grid, and + !! parallel decomposition information. It provides methods to query mesh + !! dimensions, coordinates, and other mesh properties for both global and + !! local (per MPI rank) domains. + use iso_fortran_env, only: stderr => error_unit use mpi @@ -11,21 +18,28 @@ module m_mesh implicit none - ! The mesh class stores all the information about the global and local (due to domain decomposition) mesh - ! It also includes getter functions to access some of its parameters type :: mesh_t - type(geo_t), allocatable :: geo ! object containing geometry information - class(grid_t), allocatable :: grid ! object containing grid information - class(par_t), allocatable :: par ! object containing parallel domain decomposition information + !! Mesh type containing all mesh information for the simulation. + !! + !! This type aggregates three main components: + !! - geo: Geometry information (coordinates, stretching) + !! - grid: Grid dimensions and boundary conditions + !! - par: Parallel domain decomposition information + !! + !! The mesh is initialised once and should be treated as read-only + !! during the simulation. + type(geo_t), allocatable :: geo !! 
Geometry information + class(grid_t), allocatable :: grid !! Grid dimensions and boundary conditions + class(par_t), allocatable :: par !! Parallel decomposition information contains - procedure :: get_dims - procedure :: get_global_dims + procedure :: get_dims !! Get local dimensions for a data location + procedure :: get_global_dims !! Get global dimensions for a data location - procedure :: get_n_dir - procedure :: get_n_phi - generic :: get_n => get_n_dir, get_n_phi + procedure :: get_n_dir !! Get number of grid points in a direction + procedure :: get_n_phi !! Get number of grid points for a field + generic :: get_n => get_n_dir, get_n_phi !! Generic interface for get_n - procedure :: get_coordinates + procedure :: get_coordinates !! Get coordinate array for a direction end type mesh_t interface mesh_t @@ -36,18 +50,23 @@ module m_mesh function mesh_init(dims_global, nproc_dir, L_global, BC_x, BC_y, BC_z, & stretching, beta, use_2decomp) result(mesh) + !! Initialise the mesh object with global domain parameters. + !! + !! Creates and fully initialises a mesh object containing geometry, grid, and + !! parallel decomposition information. The mesh should be treated as read-only + !! after initialisation. Supports both uniform and stretched meshes, and can + !! use either 2decomp or generic domain decomposition. use m_decomp, only: is_avail_2decomp, decomposition_2decomp - !! Completely initialise the mesh object. - !! Upon initialisation the mesh object can be read-only and shouldn't be edited - !! Takes as argument global information about the mesh like its length, number of cells and decomposition in each direction - integer, dimension(3), intent(in) :: dims_global - integer, dimension(3), intent(in) :: nproc_dir ! 
Number of proc in each direction - real(dp), dimension(3), intent(in) :: L_global - character(len=*), dimension(2), intent(in) :: BC_x, BC_y, BC_z - character(len=*), dimension(3), optional, intent(in) :: stretching - real(dp), dimension(3), optional, intent(in) :: beta - logical, optional, intent(in) :: use_2decomp - class(mesh_t), allocatable :: mesh + integer, dimension(3), intent(in) :: dims_global !! Global grid dimensions [nx, ny, nz] + integer, dimension(3), intent(in) :: nproc_dir !! Number of processors in each direction + real(dp), dimension(3), intent(in) :: L_global !! Physical domain lengths [Lx, Ly, Lz] + character(len=*), dimension(2), intent(in) :: BC_x !! Boundary conditions in x (lower, upper) + character(len=*), dimension(2), intent(in) :: BC_y !! Boundary conditions in y (lower, upper) + character(len=*), dimension(2), intent(in) :: BC_z !! Boundary conditions in z (lower, upper) + character(len=*), dimension(3), optional, intent(in) :: stretching !! Mesh stretching type per direction + real(dp), dimension(3), optional, intent(in) :: beta !! Stretching parameters per direction + logical, optional, intent(in) :: use_2decomp !! Flag to use 2decomp library + class(mesh_t), allocatable :: mesh !! Initialised mesh object character(len=20), dimension(3, 2) :: BC_all logical :: is_first_domain, is_last_domain @@ -194,19 +213,25 @@ subroutine decomposition_generic(grid, par) end subroutine pure function get_dims(self, data_loc) result(dims) - !! Getter for local domain dimensions - class(mesh_t), intent(in) :: self - integer, intent(in) :: data_loc - integer, dimension(3) :: dims + !! Get local domain dimensions for a specific data location. + !! + !! Returns the dimensions of the local subdomain (on this MPI rank) for + !! the specified data location (VERT, CELL, X_FACE, etc.). + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: data_loc !! Data location flag (VERT, CELL, etc.) + integer, dimension(3) :: dims !! 
Local dimensions [nx, ny, nz] dims = get_dims_dataloc(data_loc, self%grid%vert_dims, self%grid%cell_dims) end function pure function get_global_dims(self, data_loc) result(dims) - !! Getter for local domain dimensions - class(mesh_t), intent(in) :: self - integer, intent(in) :: data_loc - integer, dimension(3) :: dims + !! Get global domain dimensions for a specific data location. + !! + !! Returns the dimensions of the entire global domain for the specified + !! data location (VERT, CELL, X_FACE, etc.). + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: data_loc !! Data location flag (VERT, CELL, etc.) + integer, dimension(3) :: dims !! Global dimensions [nx, ny, nz] dims = get_dims_dataloc(data_loc, self%grid%global_vert_dims, & self%grid%global_cell_dims) @@ -249,21 +274,30 @@ pure function get_dims_dataloc(data_loc, vert_dims, cell_dims) result(dims) end function get_dims_dataloc pure function get_n_phi(self, phi) result(n) - !! Getter for the main dimension of field phi - class(mesh_t), intent(in) :: self - class(field_t), intent(in) :: phi - integer :: n + !! Get the main dimension (pencil length) for a field. + !! + !! Returns the number of grid points along the primary direction for the + !! given field, accounting for both the field's orientation (dir) and + !! data location on the staggered grid. + class(mesh_t), intent(in) :: self !! Mesh object + class(field_t), intent(in) :: phi !! Field to query + integer :: n !! Number of grid points in main direction n = self%get_n(phi%dir, phi%data_loc) end function pure function get_n_dir(self, dir, data_loc) result(n) - !! Getter for the main dimension a field oriented along `dir` with data on `data_loc` - class(mesh_t), intent(in) :: self - integer, intent(in) :: dir - integer, intent(in) :: data_loc - integer :: n, n_cell, n_vert + !! Get the main dimension for a field with given direction and data location. + !! + !! 
Returns the number of grid points along a specified direction for a field + !! located at the given position on the staggered grid. Handles the different + !! grid dimensions for vertex-centered vs cell-centered data. + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: dir !! Primary direction (DIR_X, DIR_Y, DIR_Z) + integer, intent(in) :: data_loc !! Data location (VERT, CELL, X_FACE, etc.) + integer :: n !! Number of grid points in direction + integer :: n_cell, n_vert n_cell = self%grid%cell_dims(dir) n_vert = self%grid%vert_dims(dir) @@ -306,13 +340,17 @@ pure function get_n_dir(self, dir, data_loc) result(n) end function get_n_dir pure function get_coordinates(self, i, j, k, data_loc_op) result(coords) - !! Get the coordinates of a vertex with i, j, k local cartesian indices - !! Avoid calling this in hot loops - class(mesh_t), intent(in) :: self - integer, intent(in) :: i, j, k - integer, optional, intent(in) :: data_loc_op + !! Get physical coordinates for a grid point with given indices. + !! + !! Returns the physical (x, y, z) coordinates for a grid point specified by + !! local Cartesian indices (i, j, k) at the given data location. Default + !! location is vertex-centered (VERT). Note: Avoid calling this function in + !! hot loops due to performance overhead. + class(mesh_t), intent(in) :: self !! Mesh object + integer, intent(in) :: i, j, k !! Local Cartesian indices + integer, optional, intent(in) :: data_loc_op !! Data location (default: VERT) integer :: data_loc - real(dp), dimension(3) :: coords + real(dp), dimension(3) :: coords !! Physical coordinates [x, y, z] if (present(data_loc_op)) then data_loc = data_loc_op diff --git a/src/mesh_content.f90 b/src/mesh_content.f90 index 322870225..43012f705 100644 --- a/src/mesh_content.f90 +++ b/src/mesh_content.f90 @@ -1,80 +1,96 @@ module m_mesh_content + !! Module containing mesh content types for geometry, grid, and parallel decomposition. + !! + !! 
This module defines three main types: + !! + !! - `geo_t`: Geometry information including coordinates and mesh stretching + !! - `grid_t`: Grid dimensions and boundary conditions + !! - `par_t`: Parallel domain decomposition information use m_common, only: dp, pi implicit none type :: geo_t - !! Stores geometry information - !> Origin: coordinates of vertex (1, 1, 1) - real(dp) :: origin(3) - !> size of a cell in each direction for a uniform mesh - real(dp) :: d(3) - !> Global dimensions of the domain in each direction - real(dp) :: L(3) - !> Global coordinates at vertices - real(dp), allocatable, dimension(:, :) :: vert_coords - !> Global coordinates at midpoints - real(dp), allocatable, dimension(:, :) :: midp_coords - !> Stretching type - character(len=20), dimension(3) :: stretching - !> Stretching - logical :: stretched(3) - !> Stretching parameters - real(dp) :: alpha(3), beta(3) - !> Stretching factors at vertices + !! Geometry information type for domain coordinates and mesh stretching. + !! + !! This type stores physical domain dimensions, coordinates at grid points, + !! and mesh stretching parameters. Coordinates and stretching factors are + !! stored for both vertex-centered and cell-centered locations. + real(dp) :: origin(3) !! Coordinates of vertex (1, 1, 1) + real(dp) :: d(3) !! Cell size in each direction for uniform mesh + real(dp) :: L(3) !! Global domain dimensions in each direction + real(dp), allocatable, dimension(:, :) :: vert_coords !! Global coordinates at vertices + real(dp), allocatable, dimension(:, :) :: midp_coords !! Global coordinates at cell midpoints + character(len=20), dimension(3) :: stretching !! Stretching type in each direction + logical :: stretched(3) !! Whether each direction has stretching applied + real(dp) :: alpha(3) !! Stretching parameter \(\alpha\) in each direction + real(dp) :: beta(3) !! 
Stretching parameter \(\beta\) in each direction + !> Stretching factors at vertices: \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), \(\frac{d^2\xi}{ds^2}\) real(dp), allocatable, dimension(:, :) :: vert_ds, vert_ds2, vert_d2s - !> Stretching factors at midpoints + !> Stretching factors at midpoints: \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), \(\frac{d^2\xi}{ds^2}\) real(dp), allocatable, dimension(:, :) :: midp_ds, midp_ds2, midp_d2s contains - procedure :: obtain_coordinates + procedure :: obtain_coordinates !! Compute coordinates and stretching factors end type type :: grid_t - !! Stores grid information - integer, dimension(3) :: global_vert_dims ! global number of vertices in each direction without padding (cartesian structure) - integer, dimension(3) :: global_cell_dims ! global number of cells in each direction without padding (cartesian structure) - - integer, dimension(3) :: vert_dims ! local number of vertices in each direction without padding (cartesian structure) - integer, dimension(3) :: cell_dims ! local number of cells in each direction without padding (cartesian structure) - logical, dimension(3) :: periodic_BC ! Whether or not a direction has a periodic BC - integer, dimension(3, 2) :: BCs_global - integer, dimension(3, 2) :: BCs + !! Grid information type for mesh dimensions and boundary conditions. + !! + !! This type stores both global and local (per MPI rank) grid dimensions, + !! accounting for both vertex-centered and cell-centered data. It also + !! manages boundary condition information. + integer, dimension(3) :: global_vert_dims !! Global number of vertices in each direction + integer, dimension(3) :: global_cell_dims !! Global number of cells in each direction + integer, dimension(3) :: vert_dims !! Local number of vertices in each direction + integer, dimension(3) :: cell_dims !! Local number of cells in each direction + logical, dimension(3) :: periodic_BC !! 
Whether each direction has periodic BC + integer, dimension(3, 2) :: BCs_global !! Global boundary conditions (lower, upper) in each direction + integer, dimension(3, 2) :: BCs !! Local subdomain boundary conditions (lower, upper) contains - procedure :: copy_cell2vert_dims ! Copies cell_dims to vert_dims taking periodicity into account - procedure :: copy_vert2cell_dims ! Copies vert_dims to cell_dims taking periodicity into account + procedure :: copy_cell2vert_dims !! Copy cell_dims to vert_dims accounting for periodicity + procedure :: copy_vert2cell_dims !! Copy vert_dims to cell_dims accounting for periodicity end type type :: par_t - !! Stores parallel domain related information - integer :: nrank ! local rank ID - integer :: nproc ! total number of ranks/proc participating in the domain decomposition - integer, dimension(3) :: nrank_dir ! local rank ID in each direction - integer, dimension(3) :: nproc_dir ! total number of proc in each direction - integer, dimension(3) :: n_offset ! number of cells offset in each direction due to domain decomposition - integer, dimension(3) :: pnext ! rank ID of the previous rank in each direction - integer, dimension(3) :: pprev ! rank ID of the next rank in each direction + !! Parallel domain decomposition information type. + !! + !! This type stores all information related to MPI domain decomposition, + !! including rank IDs, processor grid layout, and neighbor communication + !! information for halo exchanges. + integer :: nrank !! Local MPI rank ID (0-based) + integer :: nproc !! Total number of MPI ranks + integer, dimension(3) :: nrank_dir !! Local rank ID in each direction (0-based) + integer, dimension(3) :: nproc_dir !! Number of processors in each direction + integer, dimension(3) :: n_offset !! Cell offset in each direction due to decomposition + integer, dimension(3) :: pnext !! Rank ID of next neighbor in each direction + integer, dimension(3) :: pprev !! 
Rank ID of previous neighbor in each direction contains - procedure :: is_root ! returns if the current rank is the root rank - procedure :: compute_rank_pos_from_global ! fills in pnext, pprev and nrank_dir from global ranks map + procedure :: is_root !! Check if current rank is root (rank 0) + procedure :: compute_rank_pos_from_global !! Compute rank position and neighbors from global map end type contains pure function is_root(self) result(is_root_rank) - !! Returns wether or not the current rank is the root rank - class(par_t), intent(in) :: self - logical :: is_root_rank + !! Check whether the current MPI rank is the root rank. + !! + !! The root rank is defined as rank 0 in the MPI communicator. + class(par_t), intent(in) :: self !! Parallel decomposition object + logical :: is_root_rank !! True if this is rank 0 is_root_rank = (self%nrank == 0) end function pure subroutine compute_rank_pos_from_global(self, global_ranks) - !! From the global rank maps, fills in the rank position as well - !! as the previous and next rank in the `par` structure - - class(par_t), intent(inout) :: self - integer, dimension(:, :, :), intent(in) :: global_ranks + !! Compute rank position and neighbor ranks from global rank map. + !! + !! From the 3D global rank map, this subroutine determines the position + !! of the current rank in the processor grid and identifies the previous + !! and next neighboring ranks in each direction for halo communication. + !! Periodic wrapping is applied for neighbor identification. + class(par_t), intent(inout) :: self !! Parallel decomposition object to update + integer, dimension(:, :, :), intent(in) :: global_ranks !! 3D map of MPI ranks integer, dimension(3) :: subd_pos, subd_pos_prev, subd_pos_next integer :: dir, nproc @@ -102,10 +118,13 @@ pure subroutine compute_rank_pos_from_global(self, global_ranks) end subroutine pure subroutine copy_vert2cell_dims(self, par) - !! Copies vert_dims information to cell_dims taking - !! 
periodicity into account - class(grid_t), intent(inout) :: self - type(par_t), intent(in) :: par + !! Copy vertex dimensions to cell dimensions accounting for periodicity. + !! + !! For periodic boundaries, vertex and cell dimensions are equal. For + !! non-periodic boundaries on the last domain, cell dimensions are one + !! less than vertex dimensions. + class(grid_t), intent(inout) :: self !! Grid object to update + type(par_t), intent(in) :: par !! Parallel decomposition info integer :: dir logical :: is_last_domain @@ -121,10 +140,13 @@ pure subroutine copy_vert2cell_dims(self, par) end subroutine pure subroutine copy_cell2vert_dims(self, par) - !! Copies cell_dims information to vert_dims taking - !! periodicity into account - class(grid_t), intent(inout) :: self - type(par_t), intent(in) :: par + !! Copy cell dimensions to vertex dimensions accounting for periodicity. + !! + !! For periodic boundaries, vertex and cell dimensions are equal. For + !! non-periodic boundaries on the last domain, vertex dimensions are one + !! more than cell dimensions. + class(grid_t), intent(inout) :: self !! Grid object to update + type(par_t), intent(in) :: par !! Parallel decomposition info integer :: dir logical :: is_last_domain @@ -140,10 +162,17 @@ pure subroutine copy_cell2vert_dims(self, par) end subroutine subroutine obtain_coordinates(self, vert_dims, cell_dims, n_offset) - !! Obtains global coordinates for all the vertices and midpoints + !! Compute global coordinates and stretching factors for grid points. + !! + !! This subroutine calculates coordinates at both vertex-centered and + !! cell-centered locations, supporting both uniform and stretched meshes. + !! For stretched meshes, it also computes the stretching factors + !! \(\frac{ds}{d\xi}\), \(\frac{d^2s}{d\xi^2}\), and \(\frac{d^2\xi}{ds^2}\). implicit none - class(geo_t) :: self - integer, intent(in) :: vert_dims(3), cell_dims(3), n_offset(3) + class(geo_t) :: self !! 
Geometry object to populate + integer, intent(in) :: vert_dims(3) !! Local vertex dimensions + integer, intent(in) :: cell_dims(3) !! Local cell dimensions + integer, intent(in) :: n_offset(3) !! Cell offset due to domain decomposition integer :: dir, i, i_glob real(dp) :: L_inf, alpha, beta, r, const, s, yeta_vt, yeta_mp, coord diff --git a/src/module/ibm.f90 b/src/module/ibm.f90 index c24445625..8f6d019b7 100644 --- a/src/module/ibm.f90 +++ b/src/module/ibm.f90 @@ -1,10 +1,10 @@ module m_ibm !! This module implements the IBM capabilities. !! -!! When iibm = 0, the IBM object is never used. +!! When `iibm = 0`, the IBM object is never used. !! -!! When iibm = 1, the basic IBM capability is used. -!! It only requires ep1, a 3D field, as input. +!! When `iibm = 1`, the basic IBM capability is used. +!! It only requires `ep1`, a 3D field, as input. !! This field should be one (zero) in the fluid (solid) !! domain. use iso_fortran_env, only: stderr => error_unit @@ -25,6 +25,45 @@ module m_ibm integer, parameter :: iibm_basic = 1 type :: ibm_t + !! Immersed Boundary Method (IBM) for simulating flow around solid bodies. + !! + !! The IBM approach enables simulation of flows with complex solid geometries + !! without requiring body-fitted meshes. Instead, the solid geometry is + !! represented by a masking field (`ep1`) on a Cartesian grid. + !! + !! **Current Implementation (iibm = 1):** + !! + !! The basic IBM enforces zero velocity inside solid regions by multiplying + !! velocity components with the mask field `ep1`: + !! + !! - `ep1 = 1` in fluid regions → velocity unchanged + !! - `ep1 = 0` in solid regions → velocity set to zero + !! + !! This is applied before the pressure solve to ensure the divergence-free + !! constraint is satisfied only in the fluid domain. + !! + !! **Mask Field (ep1):** + !! + !! The `ep1` field defines the fluid/solid interface: + !! + !! - Values of 1.0 indicate fluid cells (no modification) + !! 
- Values of 0.0 indicate solid cells (velocity zeroed) + !! - Intermediate values (0 < ep1 < 1) represent interface cells + !! + !! **Future Extensions:** + !! + !! The current implementation sets velocity to zero in solid regions. + !! A more accurate IBM would set velocity to \(\Delta t \nabla p^n\) + !! before the pressure solve, then subtract \(\Delta t \nabla p^{n+1}\) + !! after reconstruction to properly enforce boundary conditions. + !! + !! **Components:** + !! + !! - `backend`: Computational backend for field operations + !! - `mesh`: Grid information + !! - `host_allocator`: Memory allocator for field storage + !! - `iibm`: IBM mode (0 = disabled, 1 = basic IBM) + !! - `ep1`: Mask field (1 in fluid, 0 in solid) class(base_backend_t), pointer :: backend => null() class(mesh_t), pointer :: mesh => null() type(allocator_t), pointer :: host_allocator => null() diff --git a/src/ordering.f90 b/src/ordering.f90 index 19be0a583..7c1d45c87 100644 --- a/src/ordering.f90 +++ b/src/ordering.f90 @@ -1,14 +1,16 @@ module m_ordering + !! Module for index conversion between application storage and Cartesian layouts. + !! + !! This module provides functions to convert between directional "application storage" + !! indices (optimised for cache locality) and Cartesian \( (i,j,k) \) indices. The application + !! storage layout arranges data in blocks oriented along a specific direction ( \( X, Y \), or \( Z \) ) + !! to improve memory access patterns during computations. use m_common, only: dp, get_dirs_from_rdr, DIR_X, DIR_Y, DIR_Z, DIR_C implicit none contains - !! - !! "Application storage" stores spatial data with a directionality for better cache locality - !! This set of functions converts indices from this application storage (_dir) to cartesian indices (_ijk) - !! 
pure subroutine get_index_ijk(i, j, k, dir_i, dir_j, dir_k, dir, & SZ, nx_padded, ny_padded, nz_padded) diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index 3efe96b65..937f42ff6 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -1,4 +1,36 @@ module m_poisson_fft + !! FFT-based spectral Poisson solver for incompressible flow. + !! + !! This module implements fast Fourier transform (FFT) based solvers for + !! the Poisson equation: + !! \[ \nabla^2 \phi = f \] + !! + !! **Solution Strategy:** + !! + !! 1. **Forward FFT**: Transform RHS from physical to spectral space + !! 2. **Spectral division**: Solve algebraically using wave numbers: + !! \( \hat{\phi} = \hat{f} / k^2 \) + !! 3. **Backward FFT**: Transform solution back to physical space + !! + !! **Boundary Condition Support:** + !! + !! - **Periodic (000)**: Fully periodic in all directions (standard FFT) + !! - **Mixed (010)**: Periodic in \( X/Z \), non-periodic in \( Y \) (requires special handling) + !! + !! **Grid Stretching:** + !! + !! - Uniform grids in \( X \) and \( Z \) (required for FFT) + !! - \( Y \)-direction stretching supported for `010` BCs via transformation matrices + !! - Stretching handled through spectral equivalence constants + !! + !! **Parallel Implementation:** + !! + !! - Pencil decomposition in \( Y \) and \( Z \) directions (\( X \) must be undivided) + !! - Spectral space operations on permuted/transposed data layouts + !! - Backend-specific FFT implementations (CPU/GPU) + !! + !! The module is abstract; concrete implementations provide FFT routines + !! via deferred procedures (`fft_forward`, `fft_backward`, `fft_postprocess`). use m_common, only: dp, pi, CELL use m_field, only: field_t use m_mesh, only: mesh_t, geo_t @@ -7,111 +39,151 @@ module m_poisson_fft implicit none type, abstract :: poisson_fft_t - !! FFT based Poisson solver - !> Global dimensions + !! Abstract base type for FFT-based Poisson solvers. + !! + !! 
Concrete backend implementations (OMP, CUDA) extend this type + !! and provide FFT library integration (FFTW, cuFFT, etc.). + !> Global dimensions (full domain) integer :: nx_glob, ny_glob, nz_glob - !> Local dimensions + !> Local dimensions (subdomain on this rank) integer :: nx_loc, ny_loc, nz_loc - !> Local dimensions in the permuted slabs + !> Local dimensions in the permuted slabs (after transpose for FFT) integer :: nx_perm, ny_perm, nz_perm - !> Local dimensions in the permuted slabs in spectral space + !> Local dimensions in the permuted slabs in spectral space (complex) integer :: nx_spec, ny_spec, nz_spec - !> Offset in y and z directions in the permuted slabs in spectral space + !> Offset in x, y, z directions in the spectral space pencil integer :: x_sp_st, y_sp_st, z_sp_st - !> Local domain sized array storing the spectral equivalence constants + !> Local spectral equivalence constants (modified wave numbers) complex(dp), allocatable, dimension(:, :, :) :: waves - !> Wave numbers in x, y, and z + !> Tridiagonal coefficients for wave number computation (real part) real(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz - !> Wave numbers in x, y, and z + !> Complex wave numbers and their squares for each direction complex(dp), allocatable, dimension(:) :: kx, ky, kz, exs, eys, ezs, & k2x, k2y, k2z - !> Staggared grid transformation + !> Staggered grid transformation coefficients (real and imaginary parts) real(dp), allocatable, dimension(:) :: trans_x_re, trans_x_im, & trans_y_re, trans_y_im, & trans_z_re, trans_z_im - !> Periodicity in x, y, and z + !> Periodicity flags for each direction logical :: periodic_x, periodic_y, periodic_z, & - stretched_y = .false., stretched_y_sym - !> Stretching operator matrices + stretched_y = .false., stretched_y_sym !! 
Y-direction stretching + !> Stretching transformation matrices (odd/even modes, real/imaginary) real(dp), allocatable, dimension(:, :, :, :) :: a_odd_re, a_odd_im, & a_even_re, a_even_im, & a_re, a_im - !> lowmem option, only used in CUDA backend + !> Low memory mode flag (used for GPU backends to reduce memory usage) logical :: lowmem = .false. - !> Procedure pointer to BC specific poisson solvers + !> Procedure pointer to BC-specific Poisson solver implementation procedure(poisson_xxx), pointer :: poisson => null() contains - procedure(fft_forward), deferred :: fft_forward - procedure(fft_backward), deferred :: fft_backward - procedure(fft_postprocess), deferred :: fft_postprocess_000 - procedure(fft_postprocess), deferred :: fft_postprocess_010 - procedure(field_process), deferred :: enforce_periodicity_y - procedure(field_process), deferred :: undo_periodicity_y - procedure :: base_init - procedure :: solve_poisson - procedure :: stretching_matrix - procedure :: waves_set - procedure :: get_km - procedure :: get_km_re - procedure :: get_km_im + procedure(fft_forward), deferred :: fft_forward !! Forward FFT (deferred) + procedure(fft_backward), deferred :: fft_backward !! Backward FFT (deferred) + procedure(fft_postprocess), deferred :: fft_postprocess_000 !! Postprocess for 000 BCs + procedure(fft_postprocess), deferred :: fft_postprocess_010 !! Postprocess for 010 BCs + procedure(field_process), deferred :: enforce_periodicity_y !! Enforce Y periodicity + procedure(field_process), deferred :: undo_periodicity_y !! Undo Y periodicity + procedure :: base_init !! Initialise Poisson solver + procedure :: solve_poisson !! Main interface to solve Poisson equation + procedure :: stretching_matrix !! Compute stretching transformation matrices + procedure :: waves_set !! Compute spectral equivalence constants + procedure :: get_km !! Get complex wave number + procedure :: get_km_re !! Get real part of wave number + procedure :: get_km_im !! 
Get imaginary part of wave number end type poisson_fft_t abstract interface subroutine fft_forward(self, f_in) + !! Abstract interface for forward FFT transform. + !! + !! Transforms field from physical space to spectral space. + !! Implementation is backend-specific (FFTW, cuFFT, etc.). import :: poisson_fft_t import :: field_t implicit none - class(poisson_fft_t) :: self - class(field_t), intent(in) :: f_in + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(in) :: f_in !! Input field in physical space end subroutine fft_forward subroutine fft_backward(self, f_out) + !! Abstract interface for backward (inverse) FFT transform. + !! + !! Transforms field from spectral space back to physical space. + !! Implementation is backend-specific (FFTW, cuFFT, etc.). import :: poisson_fft_t import :: field_t implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f_out !! Output field in physical space end subroutine fft_backward subroutine fft_postprocess(self) + !! Abstract interface for spectral space postprocessing. + !! + !! Applies spectral division and any BC-specific operations + !! in Fourier space. Different implementations for different + !! boundary condition combinations (000, 010, etc.). import :: poisson_fft_t implicit none - class(poisson_fft_t) :: self + class(poisson_fft_t) :: self !! Poisson solver instance end subroutine fft_postprocess end interface abstract interface subroutine poisson_xxx(self, f, temp) + !! Abstract interface for complete Poisson solve. + !! + !! Orchestrates forward FFT, postprocessing, and backward FFT. + !! Different implementations for different BC combinations. import :: poisson_fft_t import :: field_t - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! 
Field and temporary storage end subroutine poisson_xxx subroutine field_process(self, f_out, f_in) + !! Abstract interface for field processing operations. + !! + !! Used for enforcing or undoing periodicity in non-periodic + !! directions (e.g., Y direction for 010 BCs). import :: poisson_fft_t import :: field_t - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out - class(field_t), intent(in) :: f_in + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f_out !! Output field + class(field_t), intent(in) :: f_in !! Input field end subroutine field_process end interface contains subroutine base_init(self, mesh, xdirps, ydirps, zdirps, n_spec, n_sp_st) + !! Initialise FFT-based Poisson solver with mesh and decomposition info. + !! + !! Sets up: + !! - Domain dimensions (global and local) + !! - Periodicity flags from boundary conditions + !! - Spectral space dimensions and offsets + !! - Wave number arrays and spectral equivalence constants + !! - Stretching matrices (if Y-direction is stretched) + !! - Function pointer to appropriate BC-specific solver + !! + !! **Restrictions:** + !! - X-direction must not be decomposed (nproc_dir(1) must be 1) + !! - Only Y-direction stretching is supported + !! - Currently supports 000 (fully periodic) and 010 (Y non-periodic) BCs + !! + !! **Note:** 010 BCs with multiple MPI ranks not yet supported. implicit none - class(poisson_fft_t) :: self - type(mesh_t), intent(in) :: mesh - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - integer, dimension(3), intent(in) :: n_spec ! Size of the spectral pencil - integer, dimension(3), intent(in) :: n_sp_st ! Offset of the spectral pencil + class(poisson_fft_t) :: self !! Poisson solver instance + type(mesh_t), intent(in) :: mesh !! Mesh object with grid and decomposition + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps !! Directional operators + integer, dimension(3), intent(in) :: n_spec !! 
Size of the spectral pencil [nx, ny, nz] + integer, dimension(3), intent(in) :: n_sp_st !! Offset of the spectral pencil [x, y, z] integer :: dims(3) @@ -180,20 +252,33 @@ subroutine base_init(self, mesh, xdirps, ydirps, zdirps, n_spec, n_sp_st) end subroutine base_init subroutine solve_poisson(self, f, temp) + !! Main interface to solve Poisson equation. + !! + !! Delegates to the BC-specific solver function pointed to by + !! self%poisson (either poisson_000 or poisson_010). This provides + !! a uniform interface regardless of boundary conditions. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field to solve (RHS in, solution out), temporary call self%poisson(f, temp) end subroutine solve_poisson subroutine poisson_000(self, f, temp) + !! Solve Poisson equation with fully periodic (000) boundary conditions. + !! + !! For periodic BCs in all directions, the solution procedure is: + !! 1. Forward FFT: f to f_hat + !! 2. Spectral division: \( \hat{f} / k^2 \) gives solution_hat + !! 3. Backward FFT: solution_hat to solution + !! + !! This is the simplest case requiring no special handling for BCs. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field (RHS in, solution out), temporary (unused) call self%fft_forward(f) call self%fft_postprocess_000 @@ -202,10 +287,21 @@ subroutine poisson_000(self, f, temp) end subroutine poisson_000 subroutine poisson_010(self, f, temp) + !! Solve Poisson equation with mixed (010) boundary conditions. + !! + !! For periodic in X/Z, non-periodic in Y, the solution procedure is: + !! 1. Enforce artificial periodicity in Y using symmetry extension + !! 2. Forward FFT: f to f_hat + !! 3. 
Spectral division with stretching corrections (if grid is stretched) + !! 4. Backward FFT: solution_hat to solution + !! 5. Undo artificial periodicity to recover physical solution + !! + !! The symmetry extension doubles the domain size in Y to handle + !! non-periodic BCs via FFT. implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f, temp + class(poisson_fft_t) :: self !! Poisson solver instance + class(field_t), intent(inout) :: f, temp !! Field (RHS in, solution out), temporary call self%enforce_periodicity_y(temp, f) diff --git a/src/solver.f90 b/src/solver.f90 index 700f4e912..dd5b5b8f2 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -1,4 +1,19 @@ module m_solver + !! Main solver module implementing the Incompact3D numerical algorithm. + !! + !! This module provides the high-level solver infrastructure for solving + !! incompressible Navier-Stokes equations using compact finite differences. + !! The solver orchestrates the transport equation (`transeq`), divergence, + !! Poisson solver, and gradient operations required for the fractional-step + !! projection method. + !! + !! The implementation supports: + !! + !! - Multiple backend executors (CPU/GPU) + !! - Distributed and Thomas algorithm for derivatives + !! - Immersed boundary method (IBM) + !! - Multi-species transport + !! - Various time integration schemes use iso_fortran_env, only: stderr => error_unit use mpi @@ -47,53 +62,65 @@ module m_solver !! method of the allocator can be used to make this field available !! for later use. 
- real(dp) :: dt, nu - real(dp), dimension(:), allocatable :: nu_species - integer :: n_iters, n_output - integer :: current_iter = 0 - integer :: ngrid - integer :: nvars = 3 - integer :: nspecies = 0 - - class(field_t), pointer :: u, v, w - type(flist_t), dimension(:), pointer :: species => null() - - class(base_backend_t), pointer :: backend - type(mesh_t), pointer :: mesh - type(time_intg_t) :: time_integrator - type(allocator_t), pointer :: host_allocator - type(dirps_t), pointer :: xdirps, ydirps, zdirps - type(vector_calculus_t) :: vector_calculus - type(ibm_t) :: ibm - logical :: ibm_on - procedure(poisson_solver), pointer :: poisson => null() - procedure(transport_equation), pointer :: transeq => null() + real(dp) :: dt !! Time step size + real(dp) :: nu !! Kinematic viscosity + real(dp), dimension(:), allocatable :: nu_species !! Viscosities for multiple species + integer :: n_iters !! Total number of time iterations + integer :: n_output !! Output frequency (every nth iteration) + integer :: current_iter = 0 !! Current iteration number + integer :: ngrid !! Total number of grid points + integer :: nvars = 3 !! Number of velocity variables (u,v,w) + integer :: nspecies = 0 !! Number of scalar species to transport + + class(field_t), pointer :: u, v, w !! Velocity field components + type(flist_t), dimension(:), pointer :: species => null() !! Array of scalar species fields + + class(base_backend_t), pointer :: backend !! Backend executor (CPU/GPU) + type(mesh_t), pointer :: mesh !! Computational mesh + type(time_intg_t) :: time_integrator !! Time integration scheme + type(allocator_t), pointer :: host_allocator !! Memory allocator for host arrays + type(dirps_t), pointer :: xdirps, ydirps, zdirps !! Tridiagonal operators in each direction + type(vector_calculus_t) :: vector_calculus !! Vector calculus operations + type(ibm_t) :: ibm !! Immersed boundary method handler + logical :: ibm_on !! 
Flag to enable/disable IBM + procedure(poisson_solver), pointer :: poisson => null() !! Poisson solver procedure pointer + procedure(transport_equation), pointer :: transeq => null() !! Transport equation solver pointer contains - procedure :: transeq_species - procedure :: pressure_correction - procedure :: divergence_v2p - procedure :: gradient_p2v - procedure :: curl + procedure :: transeq_species !! Compute transport equation for scalar species + procedure :: pressure_correction !! Apply pressure correction to enforce incompressibility + procedure :: divergence_v2p !! Compute divergence of velocity field + procedure :: gradient_p2v !! Compute pressure gradient + procedure :: curl !! Compute curl (vorticity) of velocity field end type solver_t abstract interface subroutine poisson_solver(self, pressure, div_u) + !! Interface for Poisson solver implementations. + !! + !! Solves the Poisson equation \( \nabla^2 p = f \) where f is the + !! divergence of the intermediate velocity field. import :: solver_t import :: field_t implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) end subroutine poisson_solver subroutine transport_equation(self, rhs, variables) + !! Interface for transport equation implementations. + !! + !! Computes the right-hand side of the transport equation including + !! convection, diffusion, and any source terms. The momentum equations are: + !! \[ \frac{\partial \mathbf{u}}{\partial t} + (\mathbf{u} \cdot \nabla)\mathbf{u} = -\nabla p + \nu \nabla^2 \mathbf{u} \] import :: solver_t import :: flist_t implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:), variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side terms (output) + type(flist_t), intent(inout) :: variables(:) !! 
Field variables (velocity components) end subroutine transport_equation end interface @@ -104,12 +131,25 @@ end subroutine transport_equation contains function init(backend, mesh, host_allocator) result(solver) + !! Initialise the solver with backend, mesh, and configuration. + !! + !! This function sets up the complete solver infrastructure including: + !! - Velocity field allocation (u, v, w) + !! - Tridiagonal operators for each direction (xdirps, ydirps, zdirps) + !! - Time integrator + !! - Poisson solver (FFT or CG) + !! - Transport equation solver (default or low-memory variant) + !! - Optional scalar species transport + !! - Optional immersed boundary method (IBM) + !! + !! All configuration is read from the namelist file specified as the first + !! command-line argument. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(mesh_t), target, intent(inout) :: mesh - type(allocator_t), target, intent(inout) :: host_allocator - type(solver_t) :: solver + class(base_backend_t), target, intent(inout) :: backend !! Backend executor (CPU/GPU) + type(mesh_t), target, intent(inout) :: mesh !! Computational mesh + type(allocator_t), target, intent(inout) :: host_allocator !! Host memory allocator + type(solver_t) :: solver !! Initialised solver object type(solver_config_t) :: solver_cfg integer :: i @@ -208,11 +248,22 @@ end function init subroutine allocate_tdsops(dirps, backend, mesh, der1st_scheme, & der2nd_scheme, interpl_scheme, stagder_scheme) - type(dirps_t), intent(inout) :: dirps - class(base_backend_t), intent(in) :: backend - type(mesh_t), intent(in) :: mesh - character(*), intent(in) :: der1st_scheme, der2nd_scheme, & - interpl_scheme, stagder_scheme + !! Allocate and initialise tridiagonal operators for a given direction. + !! + !! This subroutine creates the compact finite difference operators needed for: + !! - First derivatives (der1st) + !! - Second derivatives (der2nd) + !! - Interpolation (interpl) + !! 
- Staggered derivatives (stagder) + !! + !! Boundary conditions are determined from the mesh periodicity flags. + type(dirps_t), intent(inout) :: dirps !! Direction-specific operator set + class(base_backend_t), intent(in) :: backend !! Backend executor + type(mesh_t), intent(in) :: mesh !! Computational mesh + character(*), intent(in) :: der1st_scheme !! First derivative scheme name + character(*), intent(in) :: der2nd_scheme !! Second derivative scheme name + character(*), intent(in) :: interpl_scheme !! Interpolation scheme name + character(*), intent(in) :: stagder_scheme !! Staggered derivative scheme name integer :: dir, bc_start, bc_end, bc_mp_start, bc_mp_end, n_vert, n_cell, i real(dp) :: d @@ -282,15 +333,23 @@ subroutine allocate_tdsops(dirps, backend, mesh, der1st_scheme, & end subroutine subroutine transeq_default(self, rhs, variables) - !! Skew-symmetric form of convection-diffusion terms in the - !! incompressible Navier-Stokes momemtum equations, excluding - !! pressure terms. - !! Inputs from velocity grid and outputs to velocity grid. + !! Compute transport equation RHS using default (high-memory) algorithm. + !! + !! Evaluates the skew-symmetric form of convection-diffusion terms in the + !! incompressible Navier-Stokes momentum equations, excluding pressure: + !! \[ RHS = -(\mathbf{u} \cdot \nabla)\mathbf{u} + \nu \nabla^2 \mathbf{u} \] + !! + !! Uses skew-symmetric formulation for numerical stability: + !! \[ (\mathbf{u} \cdot \nabla)\mathbf{u} = \frac{1}{2}[(\mathbf{u} \cdot \nabla)\mathbf{u} + \nabla \cdot (\mathbf{u}\mathbf{u})] \] + !! + !! This version stores intermediate results for all velocity components, + !! providing better performance at the cost of higher memory usage. + !! Both inputs and outputs are on the velocity (vertex) grid. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(inout) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! 
Right-hand side output (du/dt, dv/dt, dw/dt) + type(flist_t), intent(inout) :: variables(:) !! Velocity components (u, v, w) class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, & du_y, dv_y, dw_y, du_z, dv_z, dw_z, & @@ -382,12 +441,20 @@ subroutine transeq_default(self, rhs, variables) end subroutine transeq_default subroutine transeq_lowmem(self, rhs, variables) - !! low memory version of the transport equation, roughly %2 slower overall + !! Compute transport equation RHS using low-memory algorithm. + !! + !! Evaluates the same skew-symmetric form as transeq_default but with + !! reduced memory footprint by reusing field storage. This approach is + !! approximately 2% slower but uses significantly less memory, which can + !! be important for large simulations or GPU implementations with limited + !! memory. + !! + !! See transeq_default for the mathematical formulation. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(inout) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side output (du/dt, dv/dt, dw/dt) + type(flist_t), intent(inout) :: variables(:) !! Velocity components (u, v, w) class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, & du_y, dv_y, dw_y, du_z, dv_z, dw_z, du, dv, dw, u, v, w @@ -498,14 +565,20 @@ subroutine transeq_lowmem(self, rhs, variables) end subroutine transeq_lowmem subroutine transeq_species(self, rhs, variables) - !! Skew-symmetric form of convection-diffusion terms in the - !! species equation. - !! Inputs from velocity grid and outputs to velocity grid. + !! Compute transport equation for passive scalar species. + !! + !! Evaluates the convection-diffusion equation for transported scalars: + !! \[ \frac{\partial \phi}{\partial t} + (\mathbf{u} \cdot \nabla)\phi = \nu_\phi \nabla^2 \phi \] + !! + !! where \( \phi \) represents each scalar species, \( \nu_\phi \) is the + !! species diffusivity. Uses skew-symmetric form similar to momentum equations. 
+ !! Velocity field must be available in self%u, self%v, self%w. + !! Both inputs and outputs are on the velocity (vertex) grid. implicit none class(solver_t) :: self - type(flist_t), intent(inout) :: rhs(:) - type(flist_t), intent(in) :: variables(:) + type(flist_t), intent(inout) :: rhs(:) !! Right-hand side for species equations + type(flist_t), intent(in) :: variables(:) !! Scalar species fields integer :: i class(field_t), pointer :: u, v, w, & @@ -594,12 +667,18 @@ subroutine transeq_species(self, rhs, variables) end subroutine transeq_species subroutine divergence_v2p(self, div_u, u, v, w) - !! Wrapper for divergence_v2p + !! Compute divergence of velocity field from vertex to cell centers. + !! + !! Calculates \( \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + \frac{\partial v}{\partial y} + \frac{\partial w}{\partial z} \) + !! using staggered derivatives and interpolation operators. The input velocity + !! components are on the vertex grid and the output divergence is on the cell-centered grid. + !! + !! For incompressible flow, this should be zero (up to numerical errors). implicit none class(solver_t) :: self - class(field_t), intent(inout) :: div_u - class(field_t), intent(in) :: u, v, w + class(field_t), intent(inout) :: div_u !! Velocity divergence (output, cell-centered) + class(field_t), intent(in) :: u, v, w !! Velocity components (input, vertex-centered) call self%vector_calculus%divergence_v2c( & div_u, u, v, w, & @@ -611,12 +690,19 @@ subroutine divergence_v2p(self, div_u, u, v, w) end subroutine divergence_v2p subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) - !! Wrapper for gradient_p2v + !! Compute pressure gradient from cell centers to vertices. + !! + !! Calculates the pressure gradient components: + !! \[ \nabla p = \left( \frac{\partial p}{\partial x}, \frac{\partial p}{\partial y}, \frac{\partial p}{\partial z} \right) \] + !! + !! using staggered derivatives and interpolation operators. The input pressure + !! 
is on the cell-centered grid and the output gradient components are on the vertex grid. + !! This is used in the pressure correction step of the fractional-step method. implicit none class(solver_t) :: self - class(field_t), intent(inout) :: dpdx, dpdy, dpdz - class(field_t), intent(in) :: pressure + class(field_t), intent(inout) :: dpdx, dpdy, dpdz !! Pressure gradient components (vertex-centered) + class(field_t), intent(in) :: pressure !! Pressure field (cell-centered) call self%vector_calculus%gradient_c2v( & dpdx, dpdy, dpdz, pressure, & @@ -628,7 +714,13 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) end subroutine gradient_p2v subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) - !! Wrapper for curl + !! Compute curl (vorticity) of the velocity field. + !! + !! Calculates the curl of velocity: + !! \[ \boldsymbol{\omega} = \nabla \times \mathbf{u} = \left( \frac{\partial w}{\partial y} - \frac{\partial v}{\partial z}, \frac{\partial u}{\partial z} - \frac{\partial w}{\partial x}, \frac{\partial v}{\partial x} - \frac{\partial u}{\partial y} \right) \] + !! + !! All fields are on the vertex grid. This is primarily used for + !! post-processing and visualisation of vorticity. implicit none class(solver_t) :: self @@ -644,11 +736,23 @@ subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) end subroutine curl subroutine poisson_fft(self, pressure, div_u) + !! Solve Poisson equation using Fast Fourier Transform method. + !! + !! Solves \( \nabla^2 p = f \) where f is the velocity divergence, + !! using FFT-based spectral method. This is very efficient for periodic + !! or Neumann boundary conditions and is the default/recommended solver. + !! + !! The solution process involves: + !! 1. Transform to 3D Cartesian data structure + !! 2. Apply FFT in periodic/Neumann directions + !! 3. Solve in spectral space + !! 4. Inverse FFT back to physical space + !! 5. 
Transform back to pencil decomposition implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) class(field_t), pointer :: p_temp, temp @@ -671,11 +775,17 @@ subroutine poisson_fft(self, pressure, div_u) end subroutine poisson_fft subroutine poisson_cg(self, pressure, div_u) + !! Solve Poisson equation using Conjugate Gradient method. + !! + !! This is a placeholder for iterative Poisson solver using CG method. + !! Currently sets pressure to zero for performance testing. + !! Will be fully implemented for cases where FFT is not suitable + !! (e.g., complex geometries or Dirichlet boundary conditions). implicit none class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(field_t), intent(inout) :: pressure !! Pressure field (solution) + class(field_t), intent(in) :: div_u !! Velocity divergence (RHS) ! set the pressure field to 0 so that we can do performance tests easily ! this will be removed once the CG solver is implemented of course @@ -684,10 +794,19 @@ subroutine poisson_cg(self, pressure, div_u) end subroutine poisson_cg subroutine pressure_correction(self, u, v, w) + !! Apply pressure correction to enforce incompressibility constraint. + !! + !! Implements the projection step of the fractional-step method: + !! 1. Compute divergence of intermediate velocity: \( \nabla \cdot \mathbf{u}^* \) + !! 2. Solve Poisson equation: \( \nabla^2 p = \frac{1}{\Delta t} \nabla \cdot \mathbf{u}^* \) + !! 3. Correct velocity: \( \mathbf{u}^{n+1} = \mathbf{u}^* - \Delta t \nabla p \) + !! + !! After correction, the velocity field is divergence-free (incompressible). + !! If IBM is active, IBM forcing is applied after pressure correction. 
implicit none class(solver_t) :: self - class(field_t), intent(inout) :: u, v, w + class(field_t), intent(inout) :: u, v, w !! Velocity components (corrected in-place) class(field_t), pointer :: div_u, pressure, dpdx, dpdy, dpdz diff --git a/src/tdsops.f90 b/src/tdsops.f90 index 40623711a..09e68aa66 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -1,4 +1,28 @@ module m_tdsops + !! Tridiagonal solver operators for compact finite differences. + !! + !! This module provides preprocessed tridiagonal operator arrays for + !! solving compact finite difference schemes. It supports both distributed + !! and Thomas algorithm implementations for computing: + !! + !! - First and second derivatives + !! - Interpolation between vertex and cell-centre grids + !! - Staggered derivatives + !! + !! The operators are preprocessed based on: + !! + !! - Grid spacing and optional stretching + !! - Boundary conditions (periodic, Neumann, Dirichlet) + !! - Numerical scheme (compact schemes of various orders) + !! - Symmetry properties for free-slip boundaries + !! + !! The distributed algorithm is designed for parallel execution and consists of: + !! + !! 1. Forward/backward elimination phase (`dist_fw`, `dist_bw`) + !! 2. Back-substitution phase (`dist_sa`, `dist_sc`) + !! + !! The Thomas algorithm (`thom_*`) is used for serial execution or + !! when the distributed approach is not suitable. use iso_fortran_env, only: stderr => error_unit use m_common, only: dp, pi, VERT, CELL, & @@ -24,21 +48,35 @@ module m_tdsops !! This class does not know about the current rank or its relative !! location among other ranks. All the operator arrays here are used when !! executing a distributed tridiagonal solver phase one or two. - real(dp), allocatable, dimension(:) :: dist_fw, dist_bw, & !! fw/bw phase - dist_sa, dist_sc, & !! back subs. - dist_af !! 
the auxiliary factors - real(dp), allocatable, dimension(:) :: thom_f, thom_s, thom_w, thom_p - real(dp), allocatable :: stretch(:), stretch_correct(:) - real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :) - real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp !! Compact scheme coeffs - logical :: periodic - integer :: n_tds !! Tridiagonal system size - integer :: n_rhs !! Right-hand-side builder size - integer :: move = 0 !! move between vertices and cell centres - integer :: n_halo !! number of halo points + real(dp), allocatable, dimension(:) :: dist_fw !! Forward elimination coefficients (distributed) + real(dp), allocatable, dimension(:) :: dist_bw !! Backward elimination coefficients (distributed) + real(dp), allocatable, dimension(:) :: dist_sa !! Back-substitution coefficients A (distributed) + real(dp), allocatable, dimension(:) :: dist_sc !! Back-substitution coefficients C (distributed) + real(dp), allocatable, dimension(:) :: dist_af !! Auxiliary factors (distributed) + real(dp), allocatable, dimension(:) :: thom_f !! Forward elimination factors (Thomas) + real(dp), allocatable, dimension(:) :: thom_s !! Scaling factors (Thomas) + real(dp), allocatable, dimension(:) :: thom_w !! Work array (Thomas) + real(dp), allocatable, dimension(:) :: thom_p !! Precomputed products (Thomas) + real(dp), allocatable :: stretch(:) !! Grid stretching coefficients + real(dp), allocatable :: stretch_correct(:) !! Stretch correction for 2nd derivatives + real(dp), allocatable :: coeffs(:) !! RHS builder coefficients (interior) + real(dp), allocatable :: coeffs_s(:, :) !! RHS builder coefficients (start boundary) + real(dp), allocatable :: coeffs_e(:, :) !! RHS builder coefficients (end boundary) + real(dp) :: alpha !! Compact scheme coefficient (LHS) + real(dp) :: a, b !! Compact scheme coefficients (RHS) + real(dp) :: c = 0._dp, d = 0._dp !! Extended compact scheme coefficients + logical :: periodic !! Periodic boundary condition flag + integer :: n_tds !! 
Tridiagonal system size + integer :: n_rhs !! Right-hand-side builder size + integer :: move = 0 !! Offset for vertex/cell-centre conversion + integer :: n_halo !! Number of halo points contains - procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st - procedure :: preprocess_dist, preprocess_thom + procedure :: deriv_1st !! Set up first derivative operator + procedure :: deriv_2nd !! Set up second derivative operator + procedure :: interpl_mid !! Set up interpolation operator + procedure :: stagder_1st !! Set up staggered derivative operator + procedure :: preprocess_dist !! Preprocess for distributed algorithm + procedure :: preprocess_thom !! Preprocess for Thomas algorithm end type tdsops_t interface tdsops_t @@ -49,10 +87,21 @@ module m_tdsops !! Directional tridiagonal solver container. !! !! This class contains the preprocessed tridiagonal solvers for operating - !! in each coordinate direction. - class(tdsops_t), allocatable :: der1st, der1st_sym, der2nd, der2nd_sym, & - stagder_v2p, stagder_p2v, interpl_v2p, interpl_p2v - integer :: dir + !! in a specific coordinate direction (x, y, or z). Each direction requires + !! different operators for: + !! - Regular and symmetric first derivatives + !! - Regular and symmetric second derivatives + !! - Staggered derivatives (vertex-to-cell and cell-to-vertex) + !! - Interpolation (vertex-to-cell and cell-to-vertex) + class(tdsops_t), allocatable :: der1st !! First derivative operator + class(tdsops_t), allocatable :: der1st_sym !! Symmetric first derivative operator + class(tdsops_t), allocatable :: der2nd !! Second derivative operator + class(tdsops_t), allocatable :: der2nd_sym !! Symmetric second derivative operator + class(tdsops_t), allocatable :: stagder_v2p !! Staggered derivative (vertex to cell) + class(tdsops_t), allocatable :: stagder_p2v !! Staggered derivative (cell to vertex) + class(tdsops_t), allocatable :: interpl_v2p !! 
Interpolation (vertex to cell) + class(tdsops_t), allocatable :: interpl_p2v !! Interpolation (cell to vertex) + integer :: dir !! Direction index (DIR_X, DIR_Y, DIR_Z) end type dirps_t contains @@ -61,44 +110,57 @@ function tdsops_init( & n_tds, delta, operation, scheme, bc_start, bc_end, & stretch, stretch_correct, n_halo, from_to, sym, c_nu, nu0_nu & ) result(tdsops) - !! Constructor function for the tdsops_t class. + !! Initialise and construct a tridiagonal operator. !! - !! 'n_tds', 'delta', 'operation', 'scheme', 'bc_start', and 'bc_end' are - !! necessary arguments. The remaining arguments are optional. + !! This function creates a preprocessed tridiagonal operator for compact + !! finite difference operations. Required arguments are 'n_tds', 'delta', + !! 'operation', 'scheme', 'bc_start', and 'bc_end'. Optional arguments + !! enable stretched grids, staggered operations, and boundary condition tuning. !! - !! 'stretch' is for obtaining the correct derivations in a stretched mesh - !! 'stretch_correct' is for correcting the second derivative with the first + !! **Operation types:** + !! - 'first-deriv': First derivative \( \frac{\partial f}{\partial x} \) + !! - 'second-deriv': Second derivative \( \frac{\partial^2 f}{\partial x^2} \) + !! - 'interpolate': Interpolation between grids + !! - 'stag-deriv': Staggered derivative (vertex ↔ cell) !! - !! 'from_to' is necessary for interpolation and staggared derivative, and - !! it can be 'v2p' or 'p2v'. - !! If the specific region the instance is operating is not a boundary - !! region, then 'bc_start' and 'bc_end' are BC_HALO. + !! **Boundary conditions:** + !! - BC_PERIODIC: Periodic boundaries + !! - BC_NEUMANN: Neumann (zero gradient) boundaries + !! - BC_DIRICHLET: Dirichlet (fixed value) boundaries !! - !! 'sym' is relevant when the BC is free-slip. If sym is .true. then it - !! means the field we operate on is assumed to be an even function - !! (symmetric, cos type) accross the boundary. 
If it is .false. it means - !! the field is assumed to be an odd function (anti-symmetric, sin type). + !! **Optional stretched grid support:** + !! 'stretch' provides stretching coefficients for non-uniform grids. + !! 'stretch_correct' applies correction for second derivatives on stretched grids. !! - !! 'c_nu', 'nu0_nu' are relevant when operation is second order - !! derivative and scheme is compact6-hyperviscous. + !! **Staggered operations:** + !! 'from_to' specifies direction: 'v2p' (vertex-to-cell) or 'p2v' (cell-to-vertex) + !! + !! **Symmetry for free-slip boundaries:** + !! 'sym' determines field symmetry at Neumann boundaries: + !! - .true. = symmetric (cos-type, even function) + !! - .false. = anti-symmetric (sin-type, odd function) + !! + !! **Hyperviscosity parameters:** + !! 'c_nu' and 'nu0_nu' are used for compact6-hyperviscous second derivatives implicit none - type(tdsops_t) :: tdsops !! return value of the function - - integer, intent(in) :: n_tds !! Tridiagonal system size - real(dp), intent(in) :: delta !! Grid spacing - character(*), intent(in) :: operation, scheme - integer, intent(in) :: bc_start, bc_end !! Boundary Cond. - real(dp), optional, intent(in) :: stretch(:) !! Stretching coefficients - real(dp), optional, intent(in) :: stretch_correct(:) !! Stretch correction - integer, optional, intent(in) :: n_halo !! Number of halo cells - character(*), optional, intent(in) :: from_to !! 'v2p' or 'p2v' - logical, optional, intent(in) :: sym !! (==npaire), only for Neumann BCs - real(dp), optional, intent(in) :: c_nu, nu0_nu !! params for hypervisc. + type(tdsops_t) :: tdsops !! Constructed tridiagonal operator + + integer, intent(in) :: n_tds !! Tridiagonal system size + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: operation !! Operation type + character(*), intent(in) :: scheme !! Numerical scheme name + integer, intent(in) :: bc_start, bc_end !! 
Boundary conditions + real(dp), optional, intent(in) :: stretch(:) !! Grid stretching coefficients + real(dp), optional, intent(in) :: stretch_correct(:) !! Stretch correction + integer, optional, intent(in) :: n_halo !! Number of halo cells + character(*), optional, intent(in) :: from_to !! Staggering: 'v2p' or 'p2v' + logical, optional, intent(in) :: sym !! Symmetry for Neumann BCs + real(dp), optional, intent(in) :: c_nu, nu0_nu !! Hyperviscosity parameters #ifdef SINGLE_PREC - real(dp) :: tol = 1e-12 + real(dp) :: tol = 1e-12 !! Tolerance for checking small coefficients in single precision #else - real(dp) :: tol = 1e-16 + real(dp) :: tol = 1e-16 !! Tolerance for checking small coefficients in double precision #endif integer :: n, n_stencil @@ -197,13 +259,28 @@ function tdsops_init( & end function tdsops_init subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) + !! Set up first derivative operator. + !! + !! Configures the compact finite difference operator for computing first + !! derivatives \( \frac{\partial f}{\partial x} \). Supports various compact + !! schemes with different orders of accuracy: + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order accuracy + !! - 'compact6-exp': 6th-order with exponential profile + !! - 'compact6-hyp': 6th-order with hyperbolic profile + !! + !! The operator is built for the tridiagonal system: + !! \[ \alpha f'_{i-1} + f'_i + \alpha f'_{i+1} = a \frac{f_{i+1} - f_{i-1}}{2\Delta x} + b \frac{f_{i+2} - f_{i-2}}{4\Delta x} \] + !! + !! Boundary conditions modify the stencil near domain boundaries. implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! 
Scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, afi, bfi @@ -344,14 +421,30 @@ end subroutine deriv_1st subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & c_nu, nu0_nu) + !! Set up second derivative operator. + !! + !! Configures the compact finite difference operator for computing second + !! derivatives \( \frac{\partial^2 f}{\partial x^2} \). Supports various compact + !! schemes with different orders of accuracy and optional hyperviscosity. + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order accuracy + !! - 'compact6-hyperviscous': 6th-order with selective hyperviscosity + !! + !! The operator is built for the tridiagonal system: + !! \[ \alpha f''_{i-1} + f''_i + \alpha f''_{i+1} = a \frac{f_{i+1} - 2f_i + f_{i-1}}{\Delta x^2} + b \frac{f_{i+2} - 2f_i + f_{i-2}}{4\Delta x^2} \] + !! + !! **Hyperviscosity:** Optional 'c_nu' and 'nu0_nu' parameters enable selective + !! damping of high-frequency modes for numerical stability. implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! Scheme name + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs + real(dp), optional, intent(in) :: c_nu !! Hyperviscosity coefficient + real(dp), optional, intent(in) :: nu0_nu !! 
Hyperviscosity parameter real(dp), allocatable :: dist_b(:) real(dp) :: alpha, asi, bsi, csi, dsi @@ -556,12 +649,29 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & end subroutine deriv_2nd subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) + !! Set up interpolation operator between vertex and cell grids. + !! + !! Configures the compact interpolation operator for transferring data + !! between staggered grids (vertex-centred ↔ cell-centred). Uses compact + !! schemes for high-order accuracy. + !! + !! **Supported schemes:** + !! - 'compact6': 6th-order interpolation + !! - 'classic': Classical 2nd-order interpolation + !! + !! **Direction:** + !! - 'v2p': Vertex to cell-centre (pressure point) + !! - 'p2v': Cell-centre to vertex + !! + !! The interpolation is critical for maintaining consistency between + !! velocity and pressure grids in staggered arrangements. implicit none - class(tdsops_t), intent(inout) :: self - character(*), intent(in) :: scheme, from_to - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + character(*), intent(in) :: scheme !! Interpolation scheme name + character(*), intent(in) :: from_to !! Direction: 'v2p' or 'p2v' + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, aici, bici, cici, dici @@ -702,13 +812,32 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) end subroutine interpl_mid subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) + !! Set up staggered first derivative operator. + !! + !! Configures the compact operator for computing first derivatives on + !! staggered grids, where the derivative is computed at a different grid + !! location than the input data. + !! + !! **Supported schemes:** + !! 
- 'compact6': 6th-order staggered derivative + !! - 'classic': Classical 2nd-order staggered derivative + !! + !! **Direction:** + !! - 'v2p': Derivative from vertex grid to cell-centre grid + !! - 'p2v': Derivative from cell-centre grid to vertex grid + !! + !! Staggered derivatives are essential for: + !! - Computing divergence and gradient on staggered grids + !! - Maintaining numerical stability in pressure-velocity coupling + !! - Accurate representation of boundary conditions implicit none - class(tdsops_t), intent(inout) :: self - real(dp), intent(in) :: delta - character(*), intent(in) :: scheme, from_to - integer, intent(in) :: bc_start, bc_end - logical, optional, intent(in) :: sym + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), intent(in) :: delta !! Grid spacing + character(*), intent(in) :: scheme !! Scheme name + character(*), intent(in) :: from_to !! Direction: 'v2p' or 'p2v' + integer, intent(in) :: bc_start, bc_end !! Boundary conditions + logical, optional, intent(in) :: sym !! Symmetry flag for Neumann BCs real(dp), allocatable :: dist_b(:) real(dp) :: alpha, aci, bci @@ -810,11 +939,23 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) end subroutine stagder_1st subroutine preprocess_dist(self, dist_b) + !! Preprocess tridiagonal system for distributed algorithm. + !! + !! This subroutine preprocesses the tridiagonal matrix coefficients for + !! use in the distributed (parallel) tridiagonal solver algorithm. The + !! preprocessing follows Algorithm 3 from: + !! Reference: DOI: 10.1109/MCSE.2021.3130544 + !! + !! The distributed algorithm consists of two phases: + !! 1. **Forward/backward elimination**: Reduces the system in parallel subdomains + !! 2. **Back-substitution**: Applies corrections from neighbouring ranks + !! + !! This preprocessing computes the coefficients (dist_fw, dist_bw, dist_sa, + !! 
dist_sc, dist_af) needed for both phases, enabling efficient parallel execution. implicit none - class(tdsops_t), intent(inout) :: self - - real(dp), dimension(:), intent(in) :: dist_b + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), dimension(:), intent(in) :: dist_b !! Diagonal coefficients of tridiagonal system integer :: i @@ -869,10 +1010,24 @@ subroutine preprocess_dist(self, dist_b) end subroutine preprocess_dist subroutine preprocess_thom(self, b) + !! Preprocess tridiagonal system for Thomas algorithm. + !! + !! This subroutine preprocesses the tridiagonal matrix coefficients for + !! use in the Thomas algorithm (serial tridiagonal solver). The Thomas + !! algorithm is a simplified form of Gaussian elimination optimised for + !! tridiagonal systems. + !! + !! The preprocessing performs forward elimination on the coefficients: + !! \( c'_i = c_i / (b_i - a_i \cdot c'_{i-1}) \) + !! \( d'_i = (d_i - a_i \cdot d'_{i-1}) / (b_i - a_i \cdot c'_{i-1}) \) + !! + !! This enables efficient back-substitution during the solve phase. This + !! algorithm is used within individual MPI ranks when the distributed + !! algorithm is employed, or for the entire domain in serial execution. implicit none - class(tdsops_t), intent(inout) :: self - real(dp), dimension(:), intent(in) :: b + class(tdsops_t), intent(inout) :: self !! Tridiagonal operator (modified in-place) + real(dp), dimension(:), intent(in) :: b !! Diagonal coefficients of tridiagonal system integer :: i, n diff --git a/src/time_integrator.f90 b/src/time_integrator.f90 index 0ac159246..1bcf2bdf5 100644 --- a/src/time_integrator.f90 +++ b/src/time_integrator.f90 @@ -1,4 +1,26 @@ module m_time_integrator + !! Time integration schemes for temporal advancement. + !! + !! This module provides explicit time integration methods for advancing + !! solutions in time. It supports two families of schemes: + !! + !! **1. Runge-Kutta (RK) Methods** + !! 
Multi-stage schemes that achieve high-order accuracy within a single + !! timestep. Supported orders: RK1 (Euler), RK2, RK3, RK4. Each stage + !! requires an evaluation of the right-hand side (derivative). + !! + !! **2. Adams-Bashforth (AB) Methods** + !! Multi-step schemes that use derivative information from previous + !! timesteps to achieve high-order accuracy. Supported orders: AB1, AB2, + !! AB3, AB4. These methods are more memory-efficient than RK schemes + !! for the same order of accuracy. + !! + !! The `time_intg_t` type encapsulates all integration state and provides + !! a unified interface through the step procedure pointer, which routes + !! to either runge_kutta() or adams_bashforth() based on the selected method. + !! + !! Old timestep/stage data is stored in the `olds` array and managed + !! automatically through rotation mechanisms for AB methods. use m_allocator, only: allocator_t use m_base_backend, only: base_backend_t use m_common, only: dp, DIR_X @@ -9,19 +31,65 @@ module m_time_integrator private adams_bashforth, runge_kutta type :: time_intg_t - integer :: method, istep, istage, order, nstep, nstage, nvars, nolds - real(dp) :: coeffs(4, 4) - real(dp) :: rk_b(4, 4) - real(dp) :: rk_a(3, 3, 4) - character(len=3) :: sname - type(flist_t), allocatable :: olds(:, :) - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - procedure(stepper_func), pointer :: step => null() + !! Time integrator for explicit multi-step and multi-stage methods. + !! + !! This type encapsulates all data and methods needed for time integration + !! of ordinary differential equations (ODEs) arising from spatial discretization + !! of the Navier-Stokes equations: + !! + !! \[ + !! \frac{d\mathbf{u}}{dt} = \mathbf{F}(\mathbf{u}, t) + !! \] + !! + !! where \(\mathbf{F}\) represents the spatial operators (advection, diffusion, + !! pressure gradient, etc.). + !! + !! **Supported Methods:** + !! + !! 
- **Adams-Bashforth (AB1-AB4)**: Explicit multi-step methods using + !! previous timestep derivatives. Efficient (single evaluation per step) + !! but requires startup procedure for higher orders. + !! - **Runge-Kutta (RK1-RK4)**: Explicit multi-stage methods using + !! intermediate stages within a timestep. Self-starting but requires + !! multiple evaluations per step. + !! + !! **Method Selection:** + !! + !! The `step` procedure pointer is bound at initialization to either + !! `runge_kutta()` or `adams_bashforth()` based on the method name + !! (e.g., "AB3" or "RK4"), enabling polymorphic time stepping. + !! + !! **Data Management:** + !! + !! - **AB methods**: Store previous timestep derivatives in `olds` array, + !! rotated each timestep to maintain history + !! - **RK methods**: Store intermediate stage solutions in `olds` array, + !! overwritten within each timestep + !! + !! **Startup Procedure (AB only):** + !! + !! Higher-order AB methods (AB2-AB4) ramp up from first-order during initial + !! timesteps until sufficient derivative history is available. + integer :: method !! Integration method identifier (unused, kept for compatibility) + integer :: istep !! Current timestep number (for AB startup ramping) + integer :: istage !! Current stage number within timestep (RK only) + integer :: order !! Order of accuracy of the scheme (1-4) + integer :: nstep !! Number of timesteps needed (AB: order, RK: 1) + integer :: nstage !! Number of stages per timestep (AB: 1, RK: order) + integer :: nvars !! Number of variables being integrated + integer :: nolds !! Number of old derivatives/solutions to store + real(dp) :: coeffs(4, 4) !! Adams-Bashforth coefficients [stage, order] + real(dp) :: rk_b(4, 4) !! Runge-Kutta final weights [stage, order] + real(dp) :: rk_a(3, 3, 4) !! Runge-Kutta stage weights [from_stage, to_stage, order] + character(len=3) :: sname !! Scheme name (e.g., 'AB3', 'RK4') + type(flist_t), allocatable :: olds(:, :) !! 
Old derivatives/solutions [nvars, nolds] + class(base_backend_t), pointer :: backend !! Computational backend for operations + class(allocator_t), pointer :: allocator !! Memory allocator for field storage + procedure(stepper_func), pointer :: step => null() !! Function pointer to integration method contains - procedure :: finalize - procedure :: runge_kutta - procedure :: adams_bashforth + procedure :: finalize !! Clean up and release allocated memory + procedure :: runge_kutta !! Runge-Kutta time integration implementation + procedure :: adams_bashforth !! Adams-Bashforth time integration implementation end type time_intg_t interface time_intg_t @@ -30,25 +98,34 @@ module m_time_integrator abstract interface subroutine stepper_func(self, curr, deriv, dt) + !! Abstract interface for time stepping functions. + !! + !! Defines the signature for integration methods (RK or AB). + !! Each method takes the current solution, its derivative, and + !! the timestep size, and updates the solution accordingly. import :: time_intg_t import :: dp import :: flist_t implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution variables [nvars] + type(flist_t), intent(in) :: deriv(:) !! Time derivatives of variables [nvars] + real(dp), intent(in) :: dt !! Timestep size end subroutine stepper_func end interface contains subroutine finalize(self) + !! Finalise time integrator and release allocated resources. + !! + !! Releases all field storage blocks used for storing old derivatives + !! or stage solutions, and deallocates the olds array. implicit none !type(time_intg_t), intent(inout) :: self - class(time_intg_t), intent(inout) :: self + class(time_intg_t), intent(inout) :: self !! 
Time integrator to finalise integer :: i, j @@ -67,13 +144,32 @@ subroutine finalize(self) end subroutine finalize function init(backend, allocator, method, nvars) + !! Initialise time integrator with specified method and coefficients. + !! + !! This constructor configures the time integration scheme based on the + !! method string (e.g., 'AB3' or 'RK4'). It initialises all Runge-Kutta + !! and Adams-Bashforth coefficients for orders 1-4, then selects the + !! appropriate method and allocates storage for old derivatives or stages. + !! + !! **Supported Methods:** + !! - AB1, AB2, AB3, AB4: Adams-Bashforth (explicit multi-step) + !! - RK1, RK2, RK3, RK4: Runge-Kutta (explicit multi-stage) + !! + !! **RK Coefficients (Butcher tableau):** + !! - RK1: Forward Euler + !! - RK2: Midpoint method + !! - RK3: Strong Stability Preserving RK3 (SSP-RK3) + !! - RK4: Classical fourth-order Runge-Kutta + !! + !! **AB Coefficients:** + !! Derived from polynomial extrapolation of previous derivatives. implicit none - type(time_intg_t) :: init - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - character(3), intent(in) :: method - integer, intent(in) :: nvars + type(time_intg_t) :: init !! Initialised time integrator + class(base_backend_t), pointer :: backend !! Computational backend + class(allocator_t), pointer :: allocator !! Memory allocator + character(3), intent(in) :: method !! Integration method ('AB3', 'RK4', etc.) + integer, intent(in) :: nvars !! Number of variables to integrate integer :: i, j, stat @@ -160,12 +256,27 @@ function init(backend, allocator, method, nvars) end function init subroutine runge_kutta(self, curr, deriv, dt) + !! Advance solution using Runge-Kutta method. + !! + !! Implements explicit Runge-Kutta schemes of orders 1-4. The general + !! form for an s-stage RK method is: + !! + !! \[ k_i = f(t_n + c_i \Delta t, u_n + \Delta t \sum_{j=1}^{i-1} a_{ij} k_j) \] + !! 
\[ u_{n+1} = u_n + \Delta t \sum_{i=1}^{s} b_i k_i \] + !! + !! Where \( k_i \) are stage derivatives, \( a_{ij} \) are stage weights, + !! and \( b_i \) are final combination weights. This implementation stores + !! stage derivatives in `olds(:, 2:nstage+1)` and the initial solution in + !! `olds(:, 1)`. + !! + !! The subroutine is called once per stage. When `istage == nstage`, it + !! computes the final solution and resets the stage counter. implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution (updated) + type(flist_t), intent(in) :: deriv(:) !! Stage derivative + real(dp), intent(in) :: dt !! Timestep size integer :: i, j @@ -219,12 +330,27 @@ subroutine runge_kutta(self, curr, deriv, dt) end subroutine runge_kutta subroutine adams_bashforth(self, curr, deriv, dt) + !! Advance solution using Adams-Bashforth method. + !! + !! Implements explicit Adams-Bashforth schemes of orders 1-4. These + !! multi-step methods use derivatives from previous timesteps: + !! + !! \[ u_{n+1} = u_n + \Delta t \sum_{i=0}^{s-1} b_i f_{n-i} \] + !! + !! Where \( f_{n-i} \) are stored derivatives from previous steps and + !! \( b_i \) are the Adams-Bashforth coefficients. The method has an + !! automatic startup phase: for the first `order` steps, it uses a + !! lower-order scheme (e.g., AB2 uses AB1 on step 1, then AB2 on step 2+). + !! + !! Old derivatives are stored in `olds(:, 1:nstep-1)` and rotated after + !! each step. The current derivative is used directly and then stored + !! in `olds(:, 1)` for the next timestep. implicit none - class(time_intg_t), intent(inout) :: self - type(flist_t), intent(inout) :: curr(:) - type(flist_t), intent(in) :: deriv(:) - real(dp), intent(in) :: dt + class(time_intg_t), intent(inout) :: self !! 
Time integrator state + type(flist_t), intent(inout) :: curr(:) !! Current solution (updated) + type(flist_t), intent(in) :: deriv(:) !! Current time derivative + real(dp), intent(in) :: dt !! Timestep size integer :: i, j integer :: nstep @@ -266,10 +392,19 @@ subroutine adams_bashforth(self, curr, deriv, dt) end subroutine adams_bashforth subroutine rotate(sol, n) + !! Rotate pointer array for Adams-Bashforth old derivatives. + !! + !! Shifts pointers in the array to make room for a new derivative: + !! sol(i) <- sol(i-1) for i from n down to 2, and sol(1) gets the + !! old sol(n). This implements a circular buffer for old derivatives + !! without copying data - only pointers are reassigned. + !! + !! Example for n=3: [new, old1, old2] becomes [?, new, old1] + !! (where ? will be filled with the newest derivative) implicit none - type(flist_t), intent(inout) :: sol(:) - integer, intent(in) :: n + type(flist_t), intent(inout) :: sol(:) !! Array of field list pointers to rotate + integer, intent(in) :: n !! Number of elements to rotate integer :: i class(field_t), pointer :: ptr diff --git a/src/vector_calculus.f90 b/src/vector_calculus.f90 index cf1a1da4d..411332b7f 100644 --- a/src/vector_calculus.f90 +++ b/src/vector_calculus.f90 @@ -1,4 +1,31 @@ module m_vector_calculus + !! Vector calculus operators for finite-difference. + !! + !! This module provides implementations of fundamental differential operators + !! (divergence, gradient, curl, Laplacian) on staggered and collocated grids. + !! All operators are built using high-order compact finite-difference schemes + !! from the tdsops module. + !! + !! **Key Features:** + !! + !! - **Staggered grid support**: Operators handle transitions between cell centres + !! (`CELL`) and vertices (`VERT`) through staggered derivatives and interpolation + !! - **Data reordering**: Automatically manages pencil decomposition, reordering + !! fields between \( X, Y, Z \) orientations as needed for derivatives + !! 
- **Memory efficiency**: Uses allocator blocks for temporary fields with + !! careful release management to minimise memory footprint + !! + !! **Grid Conventions:** + !! + !! - `CELL` (`data_loc=CELL`): Variables stored at cell centres (e.g., pressure) + !! - `VERT` (`data_loc=VERT`): Variables stored at cell vertices (e.g., velocity) + !! - Staggered operators (`v2c`, `c2v`) transition between these locations + !! + !! **Data Layouts:** + !! + !! - `DIR_X`: Pencil decomposed in \( X \) direction (default for most operations) + !! - `DIR_Y`: Pencil decomposed in \( Y \) direction (for Y derivatives) + !! - `DIR_Z`: Pencil decomposed in \( Z \) direction (for Z derivatives) use iso_fortran_env, only: stderr => error_unit use m_allocator, only: allocator_t @@ -11,13 +38,16 @@ module m_vector_calculus implicit none type :: vector_calculus_t - !! Defines vector calculus operators - class(base_backend_t), pointer :: backend + !! Container for vector calculus operators. + !! + !! Provides methods for computing curl, divergence, gradient, and Laplacian. + !! All operations are delegated to the backend for computational flexibility. + class(base_backend_t), pointer :: backend !! Computational backend (CPU/GPU) contains - procedure :: curl - procedure :: divergence_v2c - procedure :: gradient_c2v - procedure :: laplacian + procedure :: curl !! Compute curl (vorticity) of vector field + procedure :: divergence_v2c !! Compute divergence from vertices to cell centres + procedure :: gradient_c2v !! Compute gradient from cell centres to vertices + procedure :: laplacian !! Compute Laplacian of scalar field end type vector_calculus_t interface vector_calculus_t @@ -27,10 +57,15 @@ module m_vector_calculus contains function init(backend) result(vector_calculus) + !! Initialise vector calculus module with computational backend. + !! + !! Simply stores a pointer to the backend, which provides access to + !! the allocator, reordering routines, and tridiagonal solvers needed + !! 
for computing derivatives. implicit none - class(base_backend_t), target, intent(inout) :: backend - type(vector_calculus_t) :: vector_calculus + class(base_backend_t), target, intent(inout) :: backend !! Computational backend + type(vector_calculus_t) :: vector_calculus !! Initialised vector calculus object vector_calculus%backend => backend @@ -142,21 +177,33 @@ subroutine divergence_v2c(self, div_u, u, v, w, & x_stagder_v2c, x_interpl_v2c, & y_stagder_v2c, y_interpl_v2c, & z_stagder_v2c, z_interpl_v2c) - !! Divergence of a vector field (u, v, w). + !! Compute divergence of a vector field from vertices to cell centres. + !! + !! Computes: + !! \[ \nabla \cdot \mathbf{u} = \frac{\partial u}{\partial x} + + !! \frac{\partial v}{\partial y} + \frac{\partial w}{\partial z} \] + !! + !! Input velocity components (u, v, w) are at vertices (VERT), and + !! divergence is evaluated at cell centres (CELL). This requires: + !! - **Staggered derivatives** in the aligned direction (e.g., du/dx uses x_stagder_v2c) + !! - **Interpolation** for cross terms (e.g., v and w interpolated in x direction) !! - !! Evaluated at the cell centers (data_loc=CELL) - !! Input fields are at vertices (data_loc=VERT) + !! The algorithm proceeds dimension by dimension: + !! 1. Compute du/dx (staggered), interpolate dv/dx, dw/dx in DIR_X + !! 2. Reorder to DIR_Y, compute dv/dy (staggered), interpolate du/dy, dw/dy + !! 3. Reorder to DIR_Z, compute dw/dz (staggered), interpolate du/dz + !! 4. Sum all components: div = du/dx + dv/dy + dw/dz !! - !! Input fields are in DIR_X data layout. - !! Output field is in DIR_Z data layout. + !! **Input:** All fields in DIR_X layout + !! 
**Output:** div_u in DIR_Z layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: div_u - class(field_t), intent(in) :: u, v, w - class(tdsops_t), intent(in) :: x_stagder_v2c, x_interpl_v2c, & - y_stagder_v2c, y_interpl_v2c, & - z_stagder_v2c, z_interpl_v2c + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: div_u !! Divergence output (CELL, DIR_Z) + class(field_t), intent(in) :: u, v, w !! Velocity components (VERT, DIR_X) + class(tdsops_t), intent(in) :: x_stagder_v2c, x_interpl_v2c, & !! X operators + y_stagder_v2c, y_interpl_v2c, & !! Y operators + z_stagder_v2c, z_interpl_v2c !! Z operators class(field_t), pointer :: du_x, dv_x, dw_x, & u_y, v_y, w_y, du_y, dv_y, dw_y, & @@ -248,21 +295,34 @@ subroutine gradient_c2v(self, dpdx, dpdy, dpdz, p, & x_stagder_c2v, x_interpl_c2v, & y_stagder_c2v, y_interpl_c2v, & z_stagder_c2v, z_interpl_c2v) - !! Gradient of a scalar field 'p'. + !! Compute gradient of a scalar field from cell centres to vertices. + !! + !! Computes: + !! \[ \nabla p = \left( \frac{\partial p}{\partial x}, + !! \frac{\partial p}{\partial y}, \frac{\partial p}{\partial z} \right) \] + !! + !! Input pressure p is at cell centres (CELL), and gradient components + !! are evaluated at vertices (VERT). This is the inverse operation of + !! divergence_v2c and is used in projection methods for incompressible flow. !! - !! Evaluated at the vertices (data_loc=VERT) - !! Input field is at cell centers (data_loc=CELL) + !! The algorithm proceeds in reverse order (Z to Y to X): + !! 1. Compute dp/dz (staggered), interpolate p in Z direction (DIR_Z) + !! 2. Reorder to DIR_Y, compute dp/dy (staggered), interpolate p and dpdz + !! 3. Reorder to DIR_X, compute dp/dx (staggered), interpolate dpdy and dpdz !! - !! Input field is in DIR_Z data layout. - !! Output fields (dpdx, dpdy, dpdz) are in DIR_X data layout. + !! 
This reverse ordering optimises memory usage by minimising temporary + !! field allocations. + !! + !! **Input:** p in DIR_Z layout + !! **Output:** dpdx, dpdy, dpdz in DIR_X layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: dpdx, dpdy, dpdz - class(field_t), intent(in) :: p - class(tdsops_t), intent(in) :: x_stagder_c2v, x_interpl_c2v, & - y_stagder_c2v, y_interpl_c2v, & - z_stagder_c2v, z_interpl_c2v + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: dpdx, dpdy, dpdz !! Gradient components (VERT, DIR_X) + class(field_t), intent(in) :: p !! Scalar field (CELL, DIR_Z) + class(tdsops_t), intent(in) :: x_stagder_c2v, x_interpl_c2v, & !! X operators + y_stagder_c2v, y_interpl_c2v, & !! Y operators + z_stagder_c2v, z_interpl_c2v !! Z operators class(field_t), pointer :: p_sxy_z, dpdz_sxy_z, & p_sxy_y, dpdz_sxy_y, & @@ -331,18 +391,31 @@ subroutine gradient_c2v(self, dpdx, dpdy, dpdz, p, & end subroutine gradient_c2v subroutine laplacian(self, lapl_u, u, x_der2nd, y_der2nd, z_der2nd) - !! Laplacian of a scalar field 'u'. + !! Compute Laplacian of a scalar field. + !! + !! Computes: + !! \[ \nabla^2 u = \frac{\partial^2 u}{\partial x^2} + + !! \frac{\partial^2 u}{\partial y^2} + \frac{\partial^2 u}{\partial z^2} \] + !! + !! The Laplacian is evaluated at the same grid location (CELL or VERT) + !! as the input field. This operator is used in diffusion terms and + !! Poisson equations. !! - !! Evaluated at the data_loc defined by the input u field + !! The algorithm computes second derivatives in each direction: + !! 1. Compute \( d^2u/dx^2 \) directly in DIR_X + !! 2. Reorder to DIR_Y, compute \( d^2u/dy^2 \), sum into result via sum_yintox + !! 3. Reorder to DIR_Z, compute \( d^2u/dz^2 \), sum into result via sum_zintox !! - !! Input and output fields are in DIR_X layout. + !! The sum_yintox and sum_zintox operations add directional derivatives + !! 
directly into the DIR_X result field without additional reordering. + !! + !! **Input/Output:** All fields in DIR_X layout implicit none - class(vector_calculus_t) :: self - class(field_t), intent(inout) :: lapl_u - class(field_t), intent(in) :: u - - class(tdsops_t), intent(in) :: x_der2nd, y_der2nd, z_der2nd + class(vector_calculus_t) :: self !! Vector calculus object + class(field_t), intent(inout) :: lapl_u !! Laplacian output (same data_loc as u, DIR_X) + class(field_t), intent(in) :: u !! Scalar field (DIR_X) + class(tdsops_t), intent(in) :: x_der2nd, y_der2nd, z_der2nd !! Second derivative operators class(field_t), pointer :: u_y, d2u_y, u_z, d2u_z diff --git a/src/xcompact.f90 b/src/xcompact.f90 index dcfed3fd8..663d75f14 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -1,4 +1,44 @@ program xcompact + !! Main program for X3D2 CFD solver. + !! + !! X3D2 is a high-order finite-difference incompressible Navier-Stokes + !! solver based on Xcompact3D/Incompact3D. It solves the incompressible + !! Navier-Stokes equations using: + !! + !! - **Compact finite differences** for spatial derivatives (4th-6th order) + !! - **Fractional-step method** for pressure-velocity coupling + !! - **FFT-based or iterative Poisson solvers** for pressure + !! - **Explicit time integration** (Runge-Kutta or Adams-Bashforth) + !! + !! **Program Flow:** + !! + !! 1. Initialise MPI and determine rank/size + !! 2. Select computational backend (CUDA GPU or OpenMP CPU) + !! 3. Read configuration from input file (domain and solver parameters) + !! 4. Create mesh with domain decomposition (pencil decomposition) + !! 5. Instantiate allocator and backend for the selected platform + !! 6. Select and instantiate flow case (channel, TGV, generic, etc.) + !! 7. Run simulation via flow_case%run() + !! 8. Report timing and finalise MPI + !! + !! **Backend Options:** + !! + !! - **CUDA**: GPU acceleration via NVIDIA CUDA (compile with -DCUDA) + !! 
- **OMP**: CPU parallelism via OpenMP threading + !! + !! **Input:** Namelist file specified as command-line argument (e.g., input.x3d) + !! + !! **Domain Decomposition:** + !! + !! X3D2 supports two decomposition strategies: + !! + !! - **2DECOMP&FFT**: External library used when FFT Poisson solver + OMP backend. + !! Provides optimised pencil decomposition and FFT transforms. Cannot decompose + !! in X-direction (`nproc_dir(1)` must be 1). + !! - **Generic**: Built-in X3D2 decomposition used for CUDA backend or when + !! 2DECOMP&FFT is unavailable. Can decompose in any direction (X, Y, Z). + !! + !! The decomposition is selected automatically based on backend and solver type. use mpi use m_allocator @@ -22,30 +62,31 @@ program xcompact implicit none - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - type(allocator_t), pointer :: host_allocator - type(mesh_t), target :: mesh - class(base_case_t), allocatable :: flow_case + class(base_backend_t), pointer :: backend !! Active computational backend (CUDA or OMP) + class(allocator_t), pointer :: allocator !! Memory allocator for device/host + type(allocator_t), pointer :: host_allocator !! Host memory allocator (for I/O, etc.) + type(mesh_t), target :: mesh !! Computational mesh with decomposition + class(base_case_t), allocatable :: flow_case !! Flow case instance (polymorphic) #ifdef CUDA - type(cuda_backend_t), target :: cuda_backend - type(cuda_allocator_t), target :: cuda_allocator - integer :: ndevs, devnum + type(cuda_backend_t), target :: cuda_backend !! CUDA backend implementation + type(cuda_allocator_t), target :: cuda_allocator !! CUDA device memory allocator + integer :: ndevs, devnum !! Number of GPUs, assigned device number #else - type(omp_backend_t), target :: omp_backend + type(omp_backend_t), target :: omp_backend !! OpenMP backend implementation #endif - type(allocator_t), target :: omp_allocator + type(allocator_t), target :: omp_allocator !! 
Host/CPU memory allocator - real(dp) :: t_start, t_end + real(dp) :: t_start, t_end !! CPU timing for performance measurement - type(domain_config_t) :: domain_cfg - type(solver_config_t) :: solver_cfg - character(32) :: backend_name - integer :: dims(3), nrank, nproc, ierr - logical :: use_2decomp + type(domain_config_t) :: domain_cfg !! Domain configuration from input file + type(solver_config_t) :: solver_cfg !! Solver configuration from input file + character(32) :: backend_name !! Backend name string ("CUDA" or "OMP") + integer :: dims(3), nrank, nproc, ierr !! Dimensions, MPI rank/size, error code + logical :: use_2decomp !! Whether to use 2DECOMP&FFT library + ! Initialise MPI call MPI_Init(ierr) call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) @@ -74,7 +115,9 @@ program xcompact domain_cfg%nproc_dir = [1, 1, nproc] end if - ! Decide whether 2decomp is used or not + ! Select decomposition strategy: + ! - 2DECOMP&FFT: Used for FFT Poisson solver with OMP backend (optimised) + ! - Generic: Used for CUDA backend or non-FFT solvers (more flexible) use_2decomp = solver_cfg%poisson_solver_type == 'FFT' & .and. trim(backend_name) == 'OMP'