
Comparing changes

base repository: omlins/ParallelStencil.jl, base ref: v0.13.5
head repository: omlins/ParallelStencil.jl, head ref: main

Commits on Sep 30, 2024

  1. add PK Fields (omlins committed Sep 30, 2024; cd214a5)
  2. add PK Fields (omlins committed Sep 30, 2024; cd68b6c)
  3. add PK Fields (omlins committed Sep 30, 2024; 85eb821)

Commits on Oct 1, 2024

  1. add PK Fields (omlins committed Oct 1, 2024; ed077a2)
  2. add PK Fields (omlins committed Oct 1, 2024; 38434cc)
  3. (a28ef8e)
  4. (7d48f0b)
  5. (9698d5e)
  6. add TData creation (omlins committed Oct 1, 2024; 8602448)
  7. add TData creation (omlins committed Oct 1, 2024; 980e35d)
  8. introduce device module (omlins committed Oct 1, 2024; 2ac7ac2)
  9. introduce device module (omlins committed Oct 1, 2024; cb2771a)

Commits on Oct 2, 2024

  1. add Fields.Device module (omlins committed Oct 2, 2024; 2164851)
  2. (9ed75ed)
  3. (6b0bfc1)
  4. simplify T modules (omlins committed Oct 2, 2024; 4aba033)
  5. unify module if else (omlins committed Oct 2, 2024; 2156512)
  6. (17086a0)
  7. update module creation calls (omlins committed Oct 2, 2024; 515e747)
  8. unify formatting (omlins committed Oct 2, 2024; 220b737)

Commits on Oct 4, 2024

  1. bump Enzyme (aelligp authored Oct 4, 2024; a2f0c72)

Commits on Oct 8, 2024

  1. Add DOI (luraess committed Oct 8, 2024; 4530a2d)
  2. (eafb0dc)
  3. (33e46e8)
  4. fix AD test (aelligp committed Oct 8, 2024; 226a4e2)
  5. (aa08f08)
  6. Add defaults (GiackAloZ committed Oct 8, 2024; 1fdcc74)
  7. Add shared functions (GiackAloZ committed Oct 8, 2024; d98ab6f)
  8. Fix shared functions (GiackAloZ committed Oct 8, 2024; 649dad4)
  9. Define Metal constants (GiackAloZ committed Oct 8, 2024; f633876)
  10. Add Metal kernel int type (GiackAloZ committed Oct 8, 2024; 91ab97b)
  11. (3059741)
  12. Add more Metal allocators (GiackAloZ committed Oct 8, 2024; 7490133)
  13. (8917bd8)
  14. (0b716e2)
  15. (b69ad03)
  16. (0d3fdc9)
  17. Add parallel kernel calls (GiackAloZ committed Oct 8, 2024; c6c07d0)
  18. fix missing Const (aelligp committed Oct 8, 2024; 408977d)

Commits on Oct 9, 2024

  1. Add Metal to shared (GiackAloZ committed Oct 9, 2024; f00838e)
  2. Add Metal to PS and tests (GiackAloZ committed Oct 9, 2024; 740ced7)
  3. wrap args to Const (aelligp and vchuravy committed Oct 9, 2024; Co-authored-by: Valentin Churavy <v.churavy@gmail.com>; 01d4f7b)
  4. Trigger CI (luraess committed Oct 9, 2024; e2edf78)
  5. Fixup (luraess committed Oct 9, 2024; c45cb9c)
  6. Merge pull request #169 from aelligp/main: Enzyme `>= 0.12` compatibility (omlins authored Oct 9, 2024; 489252e)
  7. (453e64e)
  8. (0a5858c)
  9. (16105a5)
  10. Add some documentation (GiackAloZ committed Oct 9, 2024; b2a4196)
  11. Add more docs (GiackAloZ committed Oct 9, 2024; 1dfaf38)
Showing with 4,350 additions and 1,823 deletions.
  1. +3 −3 .github/workflows/ci.yml
  2. +8 −5 Project.toml
  3. +7 −5 README.md
  4. +4 −0 ext/ParallelStencil_MetalExt.jl
  5. +2 −5 src/AD.jl
  6. +65 −0 src/FieldAllocators.jl
  7. +95 −93 src/FiniteDifferences.jl
  8. +1 −1 src/ParallelKernel/CUDAExt/allocators.jl
  9. +576 −203 src/ParallelKernel/Data.jl
  10. +2 −5 src/ParallelKernel/EnzymeExt/AD.jl
  11. +22 −3 src/ParallelKernel/EnzymeExt/autodiff_gpu.jl
  12. +694 −0 src/ParallelKernel/FieldAllocators.jl
  13. +29 −0 src/ParallelKernel/MetalExt/allocators.jl
  14. +18 −0 src/ParallelKernel/MetalExt/defaults.jl
  15. +30 −0 src/ParallelKernel/MetalExt/shared.jl
  16. +12 −4 src/ParallelKernel/ParallelKernel.jl
  17. +16 −2 src/ParallelKernel/allocators.jl
  18. +4 −0 src/ParallelKernel/hide_communication.jl
  19. +74 −28 src/ParallelKernel/init_parallel_kernel.jl
  20. +43 −6 src/ParallelKernel/kernel_language.jl
  21. +177 −13 src/ParallelKernel/parallel.jl
  22. +10 −4 src/ParallelKernel/reset_parallel_kernel.jl
  23. +158 −41 src/ParallelKernel/shared.jl
  24. +5 −2 src/ParallelStencil.jl
  25. +74 −33 src/init_parallel_stencil.jl
  26. +33 −34 src/kernel_language.jl
  27. +68 −27 src/parallel.jl
  28. +6 −4 src/reset_parallel_stencil.jl
  29. +41 −31 src/shared.jl
  30. +386 −4 test/ParallelKernel/test_allocators.jl
  31. +22 −5 test/ParallelKernel/test_hide_communication.jl
  32. +91 −28 test/ParallelKernel/test_init_parallel_kernel.jl
  33. +31 −9 test/ParallelKernel/test_kernel_language.jl
  34. +146 −202 test/ParallelKernel/test_parallel.jl
  35. +15 −2 test/ParallelKernel/test_reset_parallel_kernel.jl
  36. +7 −2 test/runtests.jl
  37. +80 −57 test/test_FiniteDifferences1D.jl
  38. +121 −98 test/test_FiniteDifferences2D.jl
  39. +175 −152 test/test_FiniteDifferences3D.jl
  40. +13 −1 test/test_extensions.jl
  41. +10 −1 test/test_incremental_compilation.jl
  42. +61 −32 test/test_init_parallel_stencil.jl
  43. +819 −677 test/test_parallel.jl
  44. +10 −0 test/test_projects/Diffusion3D_Revise/Project.toml
  45. +6 −0 test/test_projects/Diffusion3D_Revise/src/Diffusion3D_Revise.jl
  46. +39 −0 test/test_projects/Diffusion3D_Revise/src/diffusion3D_tmp.jl
  47. +8 −0 test/test_projects/Diffusion3D_minimal/test/localtest_diffusion_Metal.jl
  48. +13 −1 test/test_reset_parallel_stencil.jl
  49. +20 −0 test/test_revise.jl
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -12,8 +12,8 @@ jobs:
fail-fast: false
matrix:
version:
- '1.9' # Minimum version supporting extensions
- '1' # Latest stable 1.x release of Julia
- '1.10' # Minimum version supporting Data module creation
- '1' # Latest stable 1.x release of Julia
#- 'nightly'
os:
- ubuntu-latest
@@ -27,7 +27,7 @@ jobs:
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v1
- uses: actions/cache@v4
env:
cache-name: cache-artifacts
with:
13 changes: 8 additions & 5 deletions Project.toml
@@ -1,7 +1,7 @@
name = "ParallelStencil"
uuid = "94395366-693c-11ea-3b26-d9b7aac5d958"
authors = ["Samuel Omlin", "Ludovic Räss"]
version = "0.13.5"
version = "0.14.2"

[deps]
CellArrays = "d35fcfd7-7af4-4c67-b1aa-d78070614af4"
@@ -13,26 +13,29 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"

[extensions]
ParallelStencil_AMDGPUExt = "AMDGPU"
ParallelStencil_CUDAExt = "CUDA"
ParallelStencil_EnzymeExt = "Enzyme"
ParallelStencil_MetalExt = "Metal"

[compat]
AMDGPU = "0.6, 0.7, 0.8, 0.9, 1"
CUDA = "3.12, 4, 5"
CellArrays = "0.2.1"
Enzyme = "0.11"
CellArrays = "0.3.2"
Enzyme = "0.12, 0.13"
MacroTools = "0.5"
Metal = "1.2"
Polyester = "0.7"
StaticArrays = "1"
julia = "1.9" # Minimum version supporting extensions
julia = "1.10" # Minimum version supporting Data module creation

[extras]
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "TOML", "AMDGPU", "CUDA", "Enzyme", "Polyester"]
test = ["Test", "TOML", "AMDGPU", "CUDA", "Metal", "Enzyme", "Polyester"]
12 changes: 7 additions & 5 deletions README.md
@@ -1,12 +1,13 @@
<h1> <img src="docs/logo/logo_ParallelStencil.png" alt="ParallelStencil.jl" width="50"> ParallelStencil.jl </h1>

[![Build Status](https://github.com/omlins/ParallelStencil.jl/workflows/CI/badge.svg)](https://github.com/omlins/ParallelStencil.jl/actions)
[![DOI](https://proceedings.juliacon.org/papers/10.21105/jcon.00138/status.svg)](https://doi.org/10.21105/jcon.00138)

ParallelStencil empowers domain scientists to write architecture-agnostic high-level code for parallel high-performance stencil computations on GPUs and CPUs. Performance similar to CUDA C / HIP can be achieved, which is typically a large improvement over the performance reached when using only [CUDA.jl] or [AMDGPU.jl] [GPU Array programming]. For example, a 2-D shallow ice solver presented at JuliaCon 2020 \[[1][JuliaCon20a]\] achieved a nearly 20 times better performance than a corresponding [GPU Array programming] implementation; in absolute terms, it reached 70% of the theoretical upper performance bound of the used Nvidia P100 GPU, as defined by the effective throughput metric, `T_eff` (note that `T_eff` is very different from common throughput metrics, see section [Performance metric](#performance-metric)). The GPU performance of the solver is reported in green, the CPU performance in blue:

<a id="fig_teff">![Performance ParallelStencil Teff](docs/images/perf_ps2.png)</a>

ParallelStencil relies on the native kernel programming capabilities of [CUDA.jl] and [AMDGPU.jl] and on [Base.Threads] for high-performance computations on GPUs and CPUs, respectively. It is seamlessly interoperable with [ImplicitGlobalGrid.jl], which renders the distributed parallelization of stencil-based GPU and CPU applications on a regular staggered grid almost trivial and enables close to ideal weak scaling of real-world applications on thousands of GPUs \[[1][JuliaCon20a], [2][JuliaCon20b], [3][JuliaCon19], [4][PASC19]\]. Moreover, ParallelStencil enables hiding communication behind computation with a simple macro call and without any particular restrictions on the package used for communication. ParallelStencil has been designed in conjunction with [ImplicitGlobalGrid.jl] for simplest possible usage by domain-scientists, rendering fast and interactive development of massively scalable high performance multi-GPU applications readily accessible to them. Furthermore, we have developed a self-contained approach for "Solving Nonlinear Multi-Physics on GPU Supercomputers with Julia" relying on ParallelStencil and [ImplicitGlobalGrid.jl] \[[1][JuliaCon20a]\]. ParallelStencil's feature to hide communication behind computation was showcased when a close to ideal weak scaling was demonstrated for a 3-D poro-hydro-mechanical real-world application on up to 1024 GPUs on the Piz Daint Supercomputer \[[1][JuliaCon20a]\]:
ParallelStencil relies on the native kernel programming capabilities of [CUDA.jl], [AMDGPU.jl], [Metal.jl] and on [Base.Threads] for high-performance computations on GPUs and CPUs, respectively. It is seamlessly interoperable with [ImplicitGlobalGrid.jl], which renders the distributed parallelization of stencil-based GPU and CPU applications on a regular staggered grid almost trivial and enables close to ideal weak scaling of real-world applications on thousands of GPUs \[[1][JuliaCon20a], [2][JuliaCon20b], [3][JuliaCon19], [4][PASC19]\]. Moreover, ParallelStencil enables hiding communication behind computation with a simple macro call and without any particular restrictions on the package used for communication. ParallelStencil has been designed in conjunction with [ImplicitGlobalGrid.jl] for simplest possible usage by domain-scientists, rendering fast and interactive development of massively scalable high performance multi-GPU applications readily accessible to them. Furthermore, we have developed a self-contained approach for "Solving Nonlinear Multi-Physics on GPU Supercomputers with Julia" relying on ParallelStencil and [ImplicitGlobalGrid.jl] \[[1][JuliaCon20a]\]. ParallelStencil's feature to hide communication behind computation was showcased when a close to ideal weak scaling was demonstrated for a 3-D poro-hydro-mechanical real-world application on up to 1024 GPUs on the Piz Daint Supercomputer \[[1][JuliaCon20a]\]:

![Parallel efficiency of ParallelStencil with CUDA C backend](docs/images/par_eff_c_julia2.png)

@@ -32,7 +33,7 @@ Beyond traditional high-performance computing, ParallelStencil supports automati
* [References](#references)

## Parallelization and optimization with one macro call
A simple call to `@parallel` is enough to parallelize and optimize a function and to launch it. The package used underneath for parallelization is defined in a call to `@init_parallel_stencil` beforehand. Supported are [CUDA.jl] and [AMDGPU.jl] for running on GPU and [Base.Threads] for CPU. The following example outlines how to run parallel computations on a GPU using the native kernel programming capabilities of [CUDA.jl] underneath (omitted lines are represented with `#(...)`, omitted arguments with `...`):
A simple call to `@parallel` is enough to parallelize and optimize a function and to launch it. The package used underneath for parallelization is defined in a call to `@init_parallel_stencil` beforehand. Supported are [CUDA.jl], [AMDGPU.jl] and [Metal.jl] for running on GPU and [Base.Threads] for CPU. The following example outlines how to run parallel computations on a GPU using the native kernel programming capabilities of [CUDA.jl] underneath (omitted lines are represented with `#(...)`, omitted arguments with `...`):
```julia
#(...)
@init_parallel_stencil(CUDA,...)
@@ -518,11 +519,11 @@ julia>]
```

## Questions, comments and discussions
To discuss technical issues, please post on Julia Discourse in the [GPU topic] or the [Julia at Scale topic] or in the `#gpu` or `#distributed` channels on the [Julia Slack] (to join, visit https://julialang.org/slack/).
To discuss technical issues, please post on Julia Discourse in the [GPU topic] or the [Julia at Scale topic] or in the `#gpu` or `#hpc` channels on the [Julia Slack] (to join, visit https://julialang.org/slack/).
To discuss numerical/domain-science issues, please post on Julia Discourse in the [Numerics topic] or the [Modelling & Simulations topic] or whichever other topic fits best your issue.

## Your contributions
This project welcomes your contribution! Have you developed an application with ParallelStencil that could be featured as a mini-app? Please contribute it to share it with the world! Would you like to use other methods than finite differences with math-close notation in ParallelStencil kernels? Then check out the tiny `ParallelStencil.FiniteDifferences1D` submodule as an example for enabling math-close notation for a method and contribute your own submodule! Are you missing a great feature in the core of ParallelStencil? Maybe you can contribute yourself!
Please open an issue to discuss your idea for a contribution beforehand. Furthermore, note that a pull request should always address a significant issue in its completeness. Moreover, pull requests should blend nicely into the existing project; common sense is the primary guide in this regard (community guideline documents, e.g. [ColPrac](https://github.com/SciML/ColPrac), can be consulted in addition for inspiration). We are looking forward to your contribution!

## References
@@ -545,14 +546,15 @@ Please open an issue to discuss your idea for a contribution beforehand. Further
[JuliaCon20a]: https://www.youtube.com/watch?v=vPsfZUqI4_0
[JuliaCon20b]: https://www.youtube.com/watch?v=1t1AKnnGRqA
[JuliaCon19]: https://www.youtube.com/watch?v=b90qqbYJ58Q
[PASC19]: https://pasc19.pasc-conference.org/program/schedule/presentation/?id=msa218&sess=sess144
[PASC19]: https://pasc19.pasc-conference.org/program/schedule/index.html%3Fpost_type=page&p=10&id=msa218&sess=sess144.html
[Base.Threads]: https://docs.julialang.org/en/v1/base/multi-threading/
[ImplicitGlobalGrid.jl]: https://github.com/eth-cscs/ImplicitGlobalGrid.jl
[JULIA_NUM_THREADS]:https://docs.julialang.org/en/v1.0.0/manual/environment-variables/#JULIA_NUM_THREADS-1
[MPI.jl]: https://github.com/JuliaParallel/MPI.jl
[CellArrays.jl]: https://github.com/omlins/CellArrays.jl
[CUDA.jl]: https://github.com/JuliaGPU/CUDA.jl
[AMDGPU.jl]: https://github.com/JuliaGPU/AMDGPU.jl
[Metal.jl]: https://github.com/JuliaGPU/Metal.jl
[Enzyme.jl]: https://github.com/EnzymeAD/Enzyme.jl
[MacroTools.jl]: https://github.com/FluxML/MacroTools.jl
[StaticArrays.jl]: https://github.com/JuliaArrays/StaticArrays.jl
4 changes: 4 additions & 0 deletions ext/ParallelStencil_MetalExt.jl
@@ -0,0 +1,4 @@
module ParallelStencil_MetalExt
include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "shared.jl"))
include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "allocators.jl"))
end
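The new extension file registers the Metal backend with ParallelStencil. A minimal, hypothetical initialization sketch (following the `@init_parallel_stencil` pattern shown in the README; `Float32` is chosen because Metal GPUs do not support `Float64`, and the array sizes are illustrative only):

```julia
using Metal                                 # loading Metal triggers ParallelStencil_MetalExt
using ParallelStencil
using ParallelStencil.FiniteDifferences3D
@init_parallel_stencil(Metal, Float32, 3)   # Metal backend, Float32, 3-D

@parallel function copy3D!(A, B)
    @all(A) = @all(B)                       # trivial stencil: copy B into A
    return
end

A = @zeros(64, 64, 64)                      # allocated on the Metal device
B = @rand(64, 64, 64)
@parallel copy3D!(A, B)
```

This mirrors the CUDA example in the README with the backend argument swapped; it is a sketch of the intended usage, not an excerpt from the changeset.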
7 changes: 2 additions & 5 deletions src/AD.jl
@@ -7,8 +7,8 @@ Provides GPU-compatible wrappers for automatic differentiation functions of the
import ParallelStencil.AD
# Functions
- `autodiff_deferred!`: wraps function `autodiff_deferred`.
- `autodiff_deferred_thunk!`: wraps function `autodiff_deferred_thunk`.
- `autodiff_deferred!`: wraps function `autodiff_deferred`, promoting all arguments that are not Enzyme.Annotations to Enzyme.Const.
- `autodiff_deferred_thunk!`: wraps function `autodiff_deferred_thunk`, promoting all arguments that are not Enzyme.Annotations to Enzyme.Const.
# Examples
const USE_GPU = true
@@ -43,9 +43,6 @@ Provides GPU-compatible wrappers for automatic differentiation functions of the
main()
!!! note "Enzyme runtime activity default"
If ParallelStencil is initialized with Threads, then `Enzyme.API.runtimeActivity!(true)` is called to ensure correct behavior of Enzyme. If you want to disable this behavior, then call `Enzyme.API.runtimeActivity!(false)` after loading ParallelStencil.
To see a description of a function type `?<functionname>`.
"""
module AD
65 changes: 65 additions & 0 deletions src/FieldAllocators.jl
@@ -0,0 +1,65 @@
"""
Module FieldAllocators
Provides macros for the allocation of different kind of fields on a grid of size `gridsize`.
# Usage
using ParallelStencil.FieldAllocators
# Macros
###### Multiple fields at once
- [`@allocate`](@ref)
###### Scalar fields
- [`@Field`](@ref)
- `{X|Y|Z}Field`, e.g. [`@XField`](@ref)
- `B{X|Y|Z}Field`, e.g. [`@BXField`](@ref)
- `{XX|YY|ZZ|XY|XZ|YZ}Field`, e.g. [`@XXField`](@ref)
###### Vector fields
- [`@VectorField`](@ref)
- [`@BVectorField`](@ref)
###### Tensor fields
- [`@TensorField`](@ref)
To see a description of a macro type `?<macroname>` (including the `@`).
"""
module FieldAllocators
import ..ParallelKernel
import ..ParallelStencil: check_initialized
@doc replace(ParallelKernel.FieldAllocators.ALLOCATE_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro allocate(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@allocate($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.FIELD_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro Field(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@Field($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.VECTORFIELD_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro VectorField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@VectorField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.BVECTORFIELD_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro BVectorField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@BVectorField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.TENSORFIELD_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro TensorField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@TensorField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.VECTORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro XField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.BVECTORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro BXField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@BXField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.VECTORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro YField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@YField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.BVECTORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro BYField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@BYField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.VECTORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro ZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@ZField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.BVECTORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro BZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@BZField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.TENSORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro XXField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XXField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.TENSORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro YYField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@YYField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.TENSORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro ZZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@ZZField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.TENSORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro XYField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XYField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.TENSORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro XZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XZField($(args...)))); end
@doc replace(ParallelKernel.FieldAllocators.TENSORFIELD_COMP_DOC, "@init_parallel_kernel" => "@init_parallel_stencil") macro YZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@YZField($(args...)))); end

macro IField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@IField($(args...)))); end
macro XXYField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XXYField($(args...)))); end
macro XYYField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XYYField($(args...)))); end
macro XYZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XYZField($(args...)))); end
macro XXYZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XXYZField($(args...)))); end
macro XYYZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XYYZField($(args...)))); end
macro XYZZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XYZZField($(args...)))); end
macro XXYYField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XXYYField($(args...)))); end
macro XXZZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XXZZField($(args...)))); end
macro YYZZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@YYZZField($(args...)))); end
macro XXYYZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XXYYZField($(args...)))); end
macro XYYZZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XYYZZField($(args...)))); end
macro XXYZZField(args...) check_initialized(__module__); esc(:(ParallelStencil.ParallelKernel.FieldAllocators.@XXYZZField($(args...)))); end

export @allocate, @Field, @VectorField, @BVectorField, @TensorField, @XField, @BXField, @YField, @BYField, @ZField, @BZField, @XXField, @YYField, @ZZField, @XYField, @XZField, @YZField
end
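In use, the allocators documented above can be sketched as follows (a minimal sketch assuming a Threads-initialized session, with a hypothetical grid size; the macros take the grid size as argument, per the module docstring):

```julia
using ParallelStencil
using ParallelStencil.FieldAllocators
@init_parallel_stencil(Threads, Float64, 2)

gridsize = (64, 64)          # hypothetical grid
P = @Field(gridsize)         # scalar field on the grid
V = @VectorField(gridsize)   # one staggered component per dimension
S = @TensorField(gridsize)   # one component per tensor entry
```

The choice of backend in `@init_parallel_stencil` determines where the fields are allocated (CPU arrays for Threads, device arrays for CUDA, AMDGPU, or Metal).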