diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index cf17c35f5ab..7ced4ca1873 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -904,10 +904,21 @@ steps: agents: slurm_gpus: 1 + - label: "GPU: Analytic Schar Mountain Test (2D, High Resolution)" + command: > + julia --color=yes --project=examples examples/hybrid/driver.jl + --config_file $CONFIG_PATH/plane_analytic_schar_mountain_high_res_test.yml + --job_id gpu_plane_analytic_schar_mountain_high_res_test + artifact_paths: "gpu_plane_analytic_schar_mountain_high_res_test/output_active/*" + env: + CLIMACOMMS_DEVICE: "CUDA" + agents: + slurm_gpus: 1 + - label: "GPU: Analytic Cosine Mountain Test (2D)" command: > julia --color=yes --project=examples examples/hybrid/driver.jl - --config_file $CONFIG_PATH/gpu_plane_analytic_cosine_mountain_test.yml + --config_file $CONFIG_PATH/plane_analytic_cosine_mountain_test.yml --job_id gpu_plane_analytic_cosine_mountain_test artifact_paths: "gpu_plane_analytic_cosine_mountain_test/output_active/*" env: @@ -915,6 +926,17 @@ steps: agents: slurm_gpus: 1 + - label: "GPU: Analytic Cosine Mountain Test (Extruded 2D)" + command: > + julia --color=yes --project=examples examples/hybrid/driver.jl + --config_file $CONFIG_PATH/extruded_plane_analytic_cosine_mountain_test.yml + --job_id gpu_extruded_plane_analytic_cosine_mountain_test + artifact_paths: "gpu_extruded_plane_analytic_cosine_mountain_test/output_active/*" + env: + CLIMACOMMS_DEVICE: "CUDA" + agents: + slurm_gpus: 1 + - label: "GPU: Analytic Cosine Mountain Test (3D)" command: > julia --color=yes --project=examples examples/hybrid/driver.jl diff --git a/config/model_configs/gpu_plane_analytic_cosine_mountain_test.yml b/config/model_configs/extruded_plane_analytic_cosine_mountain_test.yml similarity index 100% rename from config/model_configs/gpu_plane_analytic_cosine_mountain_test.yml rename to config/model_configs/extruded_plane_analytic_cosine_mountain_test.yml diff --git a/config/model_configs/plane_analytic_schar_mountain_high_res_test.yml b/config/model_configs/plane_analytic_schar_mountain_high_res_test.yml new file mode 100644 index 00000000000..046dfcee939 --- /dev/null +++ b/config/model_configs/plane_analytic_schar_mountain_high_res_test.yml @@ -0,0 +1,18 @@ +config: "plane" +FLOAT_TYPE: "Float64" +initial_condition: "ConstantBuoyancyFrequencyProfile" +topography: "Schar" +x_max: 100e3 +z_max: 21e3 +x_elem: 250 +z_elem: 100 +z_stretch: false +dt: "0.2secs" +t_end: "24hours" +rayleigh_sponge: true +toml: [toml/analytic_mountain_test.toml] +analytic_check: true +output_default_diagnostics: false +diagnostics: + - short_name: [orog, ua, wa, uapredicted, wapredicted, uaerror, waerror] + period: 1hours diff --git a/config/model_configs/plane_analytic_schar_mountain_test.yml b/config/model_configs/plane_analytic_schar_mountain_test.yml index 5f02eba6e17..25af4991add 100644 --- a/config/model_configs/plane_analytic_schar_mountain_test.yml +++ b/config/model_configs/plane_analytic_schar_mountain_test.yml @@ -1,16 +1,14 @@ -config: "box" +config: "plane" FLOAT_TYPE: "Float64" initial_condition: "ConstantBuoyancyFrequencyProfile" topography: "Schar" x_max: 100e3 -y_max: 2e3 z_max: 21e3 x_elem: 100 -y_elem: 2 z_elem: 100 z_stretch: false dt: "0.5secs" -t_end: "18hours" +t_end: "24hours" rayleigh_sponge: true toml: [toml/analytic_mountain_test.toml] analytic_check: true diff --git a/examples/hybrid/fix_1d_spectral_space_on_gpu.jl b/examples/hybrid/fix_1d_spectral_space_on_gpu.jl index d75e1f2d66c..583ef54bd6f 100644 --- a/examples/hybrid/fix_1d_spectral_space_on_gpu.jl +++ b/examples/hybrid/fix_1d_spectral_space_on_gpu.jl @@ -1,7 +1,9 @@ import ClimaCore: ClimaCore, + RecursiveApply, DataLayouts, Geometry, + Meshes, Topologies, Quadratures, Grids, @@ -9,14 +11,20 @@ import ClimaCore: Fields, Operators, Hypsography, + Remapping, slab +import ClimaCore.RecursiveApply: ⊞, ⊟, ⊠ import ClimaComms import CUDA +import Adapt import StaticArrays: SMatrix ClimaCoreCUDAExt = Base.get_extension(ClimaCore, :ClimaCoreCUDAExt) -# TODO: Move this to ClimaCore.jl/ext/ClimaCoreCUDAExt/data_layouts.jl +########## +## ClimaCore.jl/ext/cuda/data_layouts.jl +########## + function Base.copyto!( dest::DataLayouts.IFH{S, Ni}, bc::Union{ @@ -36,6 +44,7 @@ function Base.copyto!( end return dest end + function Base.copyto!( dest::DataLayouts.VIFH{S, Nv, Ni}, bc::Union{ @@ -51,14 +60,277 @@ function Base.copyto!( ClimaCoreCUDAExt.knl_copyto!, (dest, bc), dest; - threads_s = (Ni, Nv_per_block), + threads_s = (Ni, 1, Nv_per_block), blocks_s = (Nh, Nv_blocks), ) end return dest end -# TODO: Move this to ClimaCore.jl/ext/ClimaCoreCUDAExt/operators_sem_shmem.jl +########## +## ClimaCore.jl/ext/cuda/fields.jl +########## + +function ClimaCoreCUDAExt.mapreduce_cuda( + f, + op, + field::Fields.Field{V}; + weighting = false, + opargs..., +) where { + S, + V <: Union{DataLayouts.VF{S}, DataLayouts.IFH{S}, DataLayouts.IJFH{S}, DataLayouts.VIFH{S}, DataLayouts.VIJFH{S}}, +} + data = Fields.field_values(field) + pdata = parent(data) + T = eltype(pdata) + (Ni, Nj, Nk, Nv, Nh) = size(data) + Nf = div(length(pdata), prod(size(data))) # length of field dimension + wt = Spaces.weighted_jacobian(axes(field)) + pwt = parent(wt) + + nitems = Nv * Ni * Nj * Nk * Nh + max_threads = 256# 512 1024 + nthreads = min(max_threads, nitems) + # perform n ops during loading to shmem (this is a tunable parameter) + n_ops_on_load = cld(nitems, nthreads) == 1 ? 0 : 7 + effective_blksize = nthreads * (n_ops_on_load + 1) + nblocks = cld(nitems, effective_blksize) + + reduce_cuda = CUDA.CuArray{T}(undef, nblocks, Nf) + shmemsize = nthreads + # place each field on a different block + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (nblocks, Nf) ClimaCoreCUDAExt.mapreduce_cuda_kernel!( + reduce_cuda, + f, + op, + pdata, + pwt, + weighting, + n_ops_on_load, + Val(shmemsize), + nitems * Nf, + ClimaCoreCUDAExt._dataview, + ClimaCoreCUDAExt._get_gidx, + ClimaCoreCUDAExt._cuda_intrablock_reduce!, + ) + # reduce block data + if nblocks > 1 + nthreads = min(32, nblocks) + shmemsize = nthreads + CUDA.@cuda always_inline = true threads = (nthreads) blocks = (Nf) ClimaCoreCUDAExt.reduce_cuda_blocks_kernel!( + reduce_cuda, + op, + Val(shmemsize), + ) + end + return DataLayouts.DataF{S}(Array(Array(reduce_cuda)[1, :])) +end + +function ClimaCoreCUDAExt.mapreduce_cuda_kernel!( + reduce_cuda::AbstractArray{T, 2}, + f, + op, + pdata::AbstractArray{T, N}, + pwt::AbstractArray{T, N}, + weighting::Bool, + n_ops_on_load::Int, + ::Val{shmemsize}, + nitems, + _dataview, + _get_gidx, + _cuda_intrablock_reduce!, +) where {T, N, shmemsize} + blksize = CUDA.blockDim().x + nblk = CUDA.gridDim().x + tidx = CUDA.threadIdx().x + bidx = CUDA.blockIdx().x + fidx = CUDA.blockIdx().y + dataview = _dataview(pdata, fidx) + effective_blksize = blksize * (n_ops_on_load + 1) + gidx = _get_gidx(tidx, bidx, effective_blksize) + reduction = CUDA.CuStaticSharedArray(T, shmemsize) + reduction[tidx] = 0 + + # load shmem + if gidx ≤ nitems + if weighting + reduction[tidx] = f(dataview[gidx]) * pwt[gidx] + for n_ops in 1:n_ops_on_load + gidx2 = + _get_gidx(tidx + blksize * n_ops, bidx, effective_blksize) + if gidx2 ≤ nitems + reduction[tidx] = + op(reduction[tidx], f(dataview[gidx2]) * pwt[gidx2]) + end + end + else + reduction[tidx] = f(dataview[gidx]) + for n_ops in 1:n_ops_on_load + gidx2 = + _get_gidx(tidx + blksize * n_ops, bidx, effective_blksize) + if gidx2 ≤ nitems + reduction[tidx] = op(reduction[tidx], f(dataview[gidx2])) + end + end + end + end + CUDA.sync_threads() + _cuda_intrablock_reduce!(op, reduction, tidx, blksize) + + tidx == 1 && (reduce_cuda[bidx, fidx] = reduction[1]) + return nothing +end + +@inline ClimaCoreCUDAExt._dataview(pdata::AbstractArray{FT, 3}, fidx) where {FT} = + view(pdata, :, fidx:fidx, :) + +########## +## ClimaCore.jl/ext/cuda/remapping_distributed.jl +########## + +function ClimaCoreCUDAExt.set_interpolated_values_kernel!( + out::AbstractArray, + (I,)::NTuple{1}, + local_horiz_indices, + vert_interpolation_weights, + vert_bounding_indices, + field_values, +) + # TODO: Check the memory access pattern. This was not optimized and likely inefficient! + num_horiz = length(local_horiz_indices) + num_vert = length(vert_bounding_indices) + num_fields = length(field_values) + + hindex = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x + vindex = (CUDA.blockIdx().y - Int32(1)) * CUDA.blockDim().y + CUDA.threadIdx().y + findex = (CUDA.blockIdx().z - Int32(1)) * CUDA.blockDim().z + CUDA.threadIdx().z + + totalThreadsX = CUDA.gridDim().x * CUDA.blockDim().x + totalThreadsY = CUDA.gridDim().y * CUDA.blockDim().y + totalThreadsZ = CUDA.gridDim().z * CUDA.blockDim().z + + _, Nq = size(I) + + for i in hindex:totalThreadsX:num_horiz + h = local_horiz_indices[i] + for j in vindex:totalThreadsY:num_vert + v_lo, v_hi = vert_bounding_indices[j] + A, B = vert_interpolation_weights[j] + for k in findex:totalThreadsZ:num_fields + if i ≤ num_horiz && j ≤ num_vert && k ≤ num_fields + out[i, j, k] = 0 + for t in 1:Nq + out[i, j, k] += + I[i, t] * + ( + A * + field_values[k][t, nothing, nothing, v_lo, h] + + B * + field_values[k][t, nothing, nothing, v_hi, h] + ) + end + end + end + end + end + return nothing +end + +########## +## ClimaCore.jl/src/Remapping/distributed_remapping.jl +########## + +function Remapping._set_interpolated_values!( + out::AbstractArray, + fields::AbstractArray{<:Fields.Field}, + _scratch_field_values, + local_horiz_indices, + local_horiz_interpolation_weights, + ::Nothing, + ::Nothing, +) + CUDA.@allowscalar Remapping._set_interpolated_values_device!( + out, + fields, + _scratch_field_values, + local_horiz_indices, + local_horiz_interpolation_weights, + nothing, + nothing, + ClimaComms.CPUSingleThreaded(), + ) # TODO: Fix the kernel function above and remove the @allowscalar. +end + +########## +## ClimaCore.jl/ext/cuda/operators_sem_shmem.jl +########## + +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_shmem( + space, + ::Val{Nvt}, + op::Operators.Divergence{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + # allocate temp output + RT = Operators.operator_return_eltype(op, eltype(arg)) + Jv¹ = CUDA.CuStaticSharedArray(RT, (Nq, Nvt)) + return (Jv¹,) +end +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_fill_shmem!( + op::Operators.Divergence{(1,)}, + (Jv¹,), + space, + ij, + slabidx, + arg, +) + vt = CUDA.threadIdx().z + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + i, _ = ij.I + Jv¹[i, vt] = + local_geometry.J ⊠ RecursiveApply.rmap( + v -> Geometry.contravariant1(v, local_geometry), + arg, + ) +end + +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_shmem( + space, + ::Val{Nvt}, + op::Operators.WeakDivergence{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + # allocate temp output + RT = Operators.operator_return_eltype(op, eltype(arg)) + Nf = DataLayouts.typesize(FT, RT) + WJv¹ = CUDA.CuStaticSharedArray(RT, (Nq, Nvt)) + return (WJv¹,) +end +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_fill_shmem!( + op::Operators.WeakDivergence{(1,)}, + (WJv¹,), + space, + ij, + slabidx, + arg, +) + vt = CUDA.threadIdx().z + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + i, _ = ij.I + WJv¹[i, vt] = + local_geometry.WJ ⊠ RecursiveApply.rmap( + v -> Geometry.contravariant1(v, local_geometry), + arg, + ) +end + Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_shmem( space, ::Val{Nvt}, @@ -86,7 +358,222 @@ Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_fill_shmem!( input[i, vt] = arg end -# TODO: Move this to ClimaCore.jl/ext/ClimaCoreCUDAExt/operators_spectral_element.jl +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_shmem( + space, + ::Val{Nvt}, + op::Operators.WeakGradient{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + # allocate temp output + IT = eltype(arg) + Wf = CUDA.CuStaticSharedArray(IT, (Nq, Nvt)) + return (Wf,) +end +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_fill_shmem!( + op::Operators.WeakGradient{(1,)}, + (Wf,), + space, + ij, + slabidx, + arg, +) + vt = CUDA.threadIdx().z + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + W = local_geometry.WJ * local_geometry.invJ + i, _ = ij.I + Wf[i, vt] = W ⊠ arg +end + +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_shmem( + space, + ::Val{Nvt}, + op::Operators.Curl{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + IT = eltype(arg) + ET = eltype(IT) + RT = Operators.operator_return_eltype(op, IT) + # allocate temp output + if RT <: Geometry.Contravariant3Vector + v₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (nothing, v₂) + elseif RT <: Geometry.Contravariant2Vector + v₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (v₃,) + elseif RT <: Geometry.Contravariant23Vector + v₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + v₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (nothing, v₂, v₃) + else + error("invalid return type") + end +end +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_fill_shmem!( + op::Operators.Curl{(1,)}, + work, + space, + ij, + slabidx, + arg, +) + vt = CUDA.threadIdx().z + i, _ = ij.I + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + RT = Operators.operator_return_eltype(op, typeof(arg)) + if RT <: Geometry.Contravariant3Vector + _, v₂ = work + v₂[i, vt] = Geometry.covariant2(arg, local_geometry) + elseif RT <: Geometry.Contravariant2Vector + (v₃,) = work + v₃[i, vt] = Geometry.covariant3(arg, local_geometry) + else + _, v₂, v₃ = work + v₂[i, vt] = Geometry.covariant2(arg, local_geometry) + v₃[i, vt] = Geometry.covariant3(arg, local_geometry) + end +end + +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_shmem( + space, + ::Val{Nvt}, + op::Operators.WeakCurl{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + IT = eltype(arg) + ET = eltype(IT) + RT = Operators.operator_return_eltype(op, IT) + # allocate temp output + if RT <: Geometry.Contravariant3Vector + Wv₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (nothing, Wv₂) + elseif RT <: Geometry.Contravariant2Vector + Wv₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (Wv₃,) + elseif RT <: Geometry.Contravariant23Vector + Wv₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + Wv₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (nothing, Wv₂, Wv₃) + else + error("invalid return type") + end +end +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_fill_shmem!( + op::Operators.WeakCurl{(1,)}, + work, + space, + ij, + slabidx, + arg, +) + vt = CUDA.threadIdx().z + i, _ = ij.I + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + W = local_geometry.WJ * local_geometry.invJ + RT = Operators.operator_return_eltype(op, typeof(arg)) + if RT <: Geometry.Contravariant3Vector + _, Wv₂ = work + Wv₂[i, vt] = W ⊠ Geometry.covariant2(arg, local_geometry) + elseif RT <: Geometry.Contravariant2Vector + (Wv₃,) = work + Wv₃[i, vt] = W ⊠ Geometry.covariant3(arg, local_geometry) + else + _, Wv₂, Wv₃ = work + Wv₂[i, vt] = W ⊠ Geometry.covariant2(arg, local_geometry) + Wv₃[i, vt] = W ⊠ Geometry.covariant3(arg, local_geometry) + end +end + +########## +## ClimaCore.jl/ext/cuda/operators_spectral_element.jl +########## + +function Base.copyto!( + out::Fields.Field, + sbc::Union{ + Operators.SpectralBroadcasted{ClimaCoreCUDAExt.CUDASpectralStyle}, + Base.Broadcast.Broadcasted{ClimaCoreCUDAExt.CUDASpectralStyle}, + }, +) + space = axes(out) + (Ni, Nj, _, Nv, Nh) = size(Fields.field_values(out)) + max_threads = 256 + @assert Ni * Nj ≤ max_threads + Nvthreads = fld(max_threads, Ni * Nj) + Nvblocks = cld(Nv, Nvthreads) + # executed + args = ( + Operators.strip_space(out, space), + Operators.strip_space(sbc, space), + space, + Val(Nvthreads), + ) + ClimaCoreCUDAExt.auto_launch!( + ClimaCoreCUDAExt.copyto_spectral_kernel!, + args, + out; + threads_s = (Ni, Nj, Nvthreads), + blocks_s = (Nh, Nvblocks), + ) + return out +end + +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_evaluate( + op::Operators.Divergence{(1,)}, + (Jv¹,), + space, + ij, + slabidx, +) + vt = CUDA.threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + + DJv = D[i, 1] ⊠ Jv¹[1, vt] + for k in 2:Nq + DJv = DJv ⊞ D[i, k] ⊠ Jv¹[k, vt] + end + return RecursiveApply.rmul(DJv, local_geometry.invJ) +end + +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_evaluate( + op::Operators.WeakDivergence{(1,)}, + (WJv¹,), + space, + ij, + slabidx, +) + vt = CUDA.threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + + Dᵀ₁WJv¹ = D[1, i] ⊠ WJv¹[1, vt] + for k in 2:Nq + Dᵀ₁WJv¹ = Dᵀ₁WJv¹ ⊞ D[k, i] ⊠ WJv¹[k, vt] + end + return ⊟(RecursiveApply.rdiv(Dᵀ₁WJv¹, local_geometry.WJ)) +end + Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_evaluate( op::Operators.Gradient{(1,)}, (input,), @@ -109,7 +596,324 @@ Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_evaluate( return Geometry.Covariant1Vector(∂f∂ξ₁) end -# TODO: Move this to ClimaCore.jl/src/Grids/spectralelement.jl +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_evaluate( + op::Operators.WeakGradient{(1,)}, + (Wf,), + space, + ij, + slabidx, +) + vt = CUDA.threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + W = local_geometry.WJ * local_geometry.invJ + + Dᵀ₁Wf = D[1, i] ⊠ Wf[1, vt] + for k in 2:Nq + Dᵀ₁Wf = Dᵀ₁Wf ⊞ D[k, i] ⊠ Wf[k, vt] + end + return Geometry.Covariant1Vector(⊟(RecursiveApply.rdiv(Dᵀ₁Wf, W))) +end + +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_evaluate( + op::Operators.Curl{(1,)}, + work, + space, + ij, + slabidx, +) + vt = CUDA.threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + + if length(work) == 2 + _, v₂ = work + D₁v₂ = D[i, 1] ⊠ v₂[1, vt] + for k in 2:Nq + D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, vt] + end + return Geometry.Contravariant3Vector( + RecursiveApply.rmul(D₁v₂, local_geometry.invJ), + ) + elseif length(work) == 1 + (v₃,) = work + D₁v₃ = D[i, 1] ⊠ v₃[1, vt] + for k in 2:Nq + D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, vt] + end + return Geometry.Contravariant2Vector( + ⊟(RecursiveApply.rmul(D₁v₃, local_geometry.invJ)), + ) + else + _, v₂, v₃ = work + D₁v₂ = D[i, 1] ⊠ v₂[1, vt] + D₁v₃ = D[i, 1] ⊠ v₃[1, vt] + @simd for k in 2:Nq + D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, vt] + D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, vt] + end + return Geometry.Contravariant23Vector( + ⊟(RecursiveApply.rmul(D₁v₃, local_geometry.invJ)), + RecursiveApply.rmul(D₁v₂, local_geometry.invJ), + ) + end +end + +Base.@propagate_inbounds function ClimaCoreCUDAExt.operator_evaluate( + op::Operators.WeakCurl{(1,)}, + work, + space, + ij, + slabidx, +) + vt = CUDA.threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + local_geometry = Operators.get_local_geometry(space, ij, slabidx) + + if length(work) == 2 + _, Wv₂ = work + Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, vt] + for k in 2:Nq + Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, vt] + end + return Geometry.Contravariant3Vector( + RecursiveApply.rdiv(⊟(Dᵀ₁Wv₂), local_geometry.WJ), + ) + elseif length(work) == 1 + (Wv₃,) = work + Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, vt] + for k in 2:Nq + Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, vt] + end + return Geometry.Contravariant2Vector( + RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ), + ) + else + _, Wv₂, Wv₃ = work + Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, vt] + Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, vt] + @simd for k in 2:Nq + Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, vt] + Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, vt] + end + return Geometry.Contravariant23Vector( + RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ), + RecursiveApply.rdiv(⊟(Dᵀ₁Wv₂), local_geometry.WJ), + ) + end +end + +########## +## ClimaCore.jl/ext/cuda/operators_finite_difference.jl +########## + +function Base.copyto!( + out::Fields.Field, + bc::Union{ + Operators.StencilBroadcasted{ClimaCoreCUDAExt.CUDAColumnStencilStyle}, + Base.Broadcast.Broadcasted{ClimaCoreCUDAExt.CUDAColumnStencilStyle}, + }, +) + space = axes(out) + (Ni, Nj, _, Nv, Nh) = size(Fields.field_values(out)) + (li, lw, rw, ri) = bounds = Operators.window_bounds(space, bc) + @assert Nv == ri - li + 1 + max_threads = 256 + nitems = Nv * Ni * Nj * Nh # # of independent items + (nthreads, nblocks) = ClimaCoreCUDAExt._configure_threadblock(max_threads, nitems) + args = ( + Operators.strip_space(out, space), + Operators.strip_space(bc, space), + axes(out), + bounds, + Ni, + Nj, + Nh, + Nv, + ) + ClimaCoreCUDAExt.auto_launch!( + ClimaCoreCUDAExt.copyto_stencil_kernel!, + args, + out; + threads_s = (nthreads,), + blocks_s = (nblocks,), + ) + return out +end + +function ClimaCoreCUDAExt.copyto_stencil_kernel!(out, bc, space, bds, Ni, Nj, Nh, Nv) + gid = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + if gid ≤ Nv * Ni * Nj * Nh + (li, lw, rw, ri) = bds + (v, i, j, h) = Topologies._get_idx((Nv, Ni, Nj, Nh), gid) + hidx = (i, j, h) + idx = v - 1 + li + window = + idx < lw ? Operators.LeftBoundaryWindow{Spaces.left_boundary_name(space)}() : + ( + idx > rw ? + Operators.RightBoundaryWindow{Spaces.right_boundary_name(space)}() : + Operators.Interior() + ) + Operators.setidx!( + space, + out, + idx, + hidx, + Operators.getidx(space, bc, window, idx, hidx), + ) + end + return nothing +end + +########## +## ClimaCore.jl/ext/cuda/topologies_dss.jl +########## + +function Topologies.dss_1d!( + ::ClimaComms.CUDADevice, + data::Union{DataLayouts.VIFH, DataLayouts.IFH}, + topology::Topologies.IntervalTopology, + lg = nothing, + weight = nothing, +) + (_, _, _, Nv, Nh) = size(data) + Nfaces = Topologies.isperiodic(topology) ? Nh : Nh - 1 + if Nfaces > 0 + nthreads, nblocks = ClimaCoreCUDAExt._configure_threadblock(Nv * Nfaces) + ClimaCoreCUDAExt.auto_launch!( + dss_1d_kernel!, + (data, lg, weight, Nfaces), + data; + threads_s = nthreads, + blocks_s = nblocks, + ) + end + return nothing +end + +function dss_1d_kernel!(data, lg, weight, Nfaces) + T = eltype(data) + (Ni, _, _, Nv, Nh) = size(data) + gidx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + if gidx ≤ Nv * Nfaces + left_face_elem = cld(gidx, Nv) + level = gidx - (left_face_elem - 1) * Nv + right_face_elem = (left_face_elem % Nh) + 1 + left_idx = CartesianIndex(Ni, 1, 1, level, left_face_elem) + right_idx = CartesianIndex(1, 1, 1, level, right_face_elem) + val = + Topologies.dss_transform(data, lg, weight, left_idx) ⊞ + Topologies.dss_transform(data, lg, weight, right_idx) + data[left_idx] = Topologies.dss_untransform(T, val, lg, left_idx) + data[right_idx] = Topologies.dss_untransform(T, val, lg, right_idx) + end + return nothing +end + +########## +## ClimaCore.jl/src/Topologies/dss.jl +########## + +function Topologies.dss_1d!( + ::ClimaComms.AbstractCPUDevice, + data::Union{DataLayouts.VIFH, DataLayouts.IFH}, + topology::Topologies.IntervalTopology, + lg = nothing, + weight = nothing, +) + T = eltype(data) + (Ni, _, _, Nv, Nh) = size(data) + Nfaces = Topologies.isperiodic(topology) ? Nh : Nh - 1 + @inbounds for left_face_elem in 1:Nfaces, level in 1:Nv + right_face_elem = (left_face_elem % Nh) + 1 + left_idx = CartesianIndex(Ni, 1, 1, level, left_face_elem) + right_idx = CartesianIndex(1, 1, 1, level, right_face_elem) + val = + Topologies.dss_transform(data, lg, weight, left_idx) ⊞ + Topologies.dss_transform(data, lg, weight, right_idx) + data[left_idx] = Topologies.dss_untransform(T, val, lg, left_idx) + data[right_idx] = Topologies.dss_untransform(T, val, lg, right_idx) + end +end + +########## +## ClimaCore.jl/src/Spaces/dss.jl +########## + +function Spaces.weighted_dss_internal!( + data::Union{ + DataLayouts.IFH, + DataLayouts.VIFH, + DataLayouts.IJFH, + DataLayouts.VIJFH, + }, + space::Union{ + Spaces.AbstractSpectralElementSpace, + Spaces.ExtrudedFiniteDifferenceSpace, + }, + hspace::Spaces.AbstractSpectralElementSpace, + dss_buffer::Union{Topologies.DSSBuffer, Nothing}, +) + Spaces.assert_same_eltype(data, dss_buffer) + length(parent(data)) == 0 && return nothing + device = ClimaComms.device(Spaces.topology(hspace)) + if hspace isa Spaces.SpectralElementSpace1D + Topologies.dss_1d!( + device, + data, + Spaces.topology(hspace), + Spaces.local_geometry_data(space), + Spaces.local_dss_weights(space), + ) + else + Topologies.dss_transform!( + device, + dss_buffer, + data, + Spaces.local_geometry_data(space), + Spaces.local_dss_weights(space), + Spaces.perimeter(hspace), + dss_buffer.internal_elems, + ) + Topologies.dss_local!( + device, + dss_buffer.perimeter_data, + Spaces.perimeter(hspace), + Spaces.topology(hspace), + ) + Topologies.dss_untransform!( + device, + dss_buffer, + data, + Spaces.local_geometry_data(space), + Spaces.perimeter(hspace), + dss_buffer.internal_elems, + ) + end + return nothing +end + +########## +## ClimaCore.jl/src/Grids/spectralelement.jl +########## + function Grids._SpectralElementGrid1D( topology::Topologies.IntervalTopology, quadrature_style::Quadratures.QuadratureStyle, @@ -126,10 +930,10 @@ function Grids._SpectralElementGrid1D( Quadratures.quadrature_points(FT, quadrature_style) # Initialize the local_geometry with Array{FT} as the backing array type. - local_geometry = DataLayouts.IFH{LG, Nq}(Array{FT}, nelements) + local_geometry_cpu = DataLayouts.IFH{LG, Nq}(Array{FT}, nelements) for elem in 1:nelements - local_geometry_slab = slab(local_geometry, elem) + local_geometry_slab = slab(local_geometry_cpu, elem) for i in 1:Nq ξ = quad_points[i] vcoords = Topologies.vertex_coordinates(topology, elem) @@ -156,6 +960,9 @@ function Grids._SpectralElementGrid1D( end end + # If needed, move the local_geometry onto the GPU. + local_geometry = DataLayouts.rebuild(local_geometry_cpu, DA) + # TODO: Why was this code setting dss_weights = 1 / DSS(1)? That's just 1?? # dss_weights = copy(local_geometry.J) # dss_weights .= one(FT) @@ -163,20 +970,22 @@ function Grids._SpectralElementGrid1D( # dss_weights = one(FT) ./ dss_weights dss_weights = copy(local_geometry.J) - Topologies.dss_1d!(topology, dss_weights) + Topologies.dss_1d!(ClimaComms.device(topology), dss_weights, topology) dss_weights .= local_geometry.J ./ dss_weights - # If needed, move the local_geometry and dss_weights onto the GPU. return Grids.SpectralElementGrid1D( topology, quadrature_style, global_geometry, - DataLayouts.rebuild(local_geometry, DA), - DataLayouts.rebuild(dss_weights, DA), + local_geometry, + dss_weights, ) end -# TODO: Move this to ClimaCore.jl/src/Hypsography/Hypsography.jl +########## +## ClimaCore.jl/src/Hypsography/Hypsography.jl +########## + function Grids._ExtrudedFiniteDifferenceGrid( horizontal_grid::Grids.AbstractGrid, vertical_grid::Grids.FiniteDifferenceGrid, @@ -199,6 +1008,14 @@ function Grids._ExtrudedFiniteDifferenceGrid( Ref(z_top), Ref(adaption_parameters(adaption)), ) + face_z_cpu = + modified_ref_z_to_physical_z.( + Ref(typeof(adaption)), + Adapt.adapt(Array, face_z_ref), + Adapt.adapt(Array, z_surface), + Ref(z_top), + Ref(adaption_parameters(adaption)), + ) return Grids._ExtrudedFiniteDifferenceGrid( horizontal_grid, diff --git a/post_processing/ci_plots.jl b/post_processing/ci_plots.jl index 387ad014507..9724e1f7b8b 100644 --- a/post_processing/ci_plots.jl +++ b/post_processing/ci_plots.jl @@ -706,7 +706,9 @@ end const AnalyticMountainTest = Union{ Val{:plane_analytic_cosine_mountain_test}, Val{:gpu_plane_analytic_schar_mountain_test}, + Val{:gpu_plane_analytic_schar_mountain_high_res_test}, Val{:gpu_plane_analytic_cosine_mountain_test}, + Val{:gpu_extruded_plane_analytic_cosine_mountain_test}, Val{:gpu_box_analytic_cosine_mountain_test}, } @@ -715,7 +717,12 @@ function make_plots( output_paths::Vector{<:AbstractString}, ) simdirs = SimDir.(output_paths) - is_3d = val != Val(:plane_analytic_cosine_mountain_test) + is_3d = + val == Val(:gpu_extruded_plane_analytic_cosine_mountain_test) || + val == Val(:gpu_box_analytic_cosine_mountain_test) + is_schar_mountain = + val == Val(:gpu_plane_analytic_schar_mountain_test) || + val == Val(:gpu_plane_analytic_schar_mountain_high_res_test) short_names = ["ua", "wa", "uapredicted", "wapredicted", "uaerror", "waerror"] @@ -737,12 +744,12 @@ function make_plots( output_name = "slice_summary", ) - if val == Val(:gpu_plane_analytic_schar_mountain_test) + if is_schar_mountain zoomed_slice_vars = Iterators.flatmap(["wa", "wapredicted", "waerror"]) do short_name map(simdirs) do simdir data = get(simdir; short_name) - data = window(data, "x"; left = 80e3, right = 120e3) + data = window(data, "x"; left = 40e3, right = 60e3) data = is_3d ? slice(data; y = 0) : data data = window(data, "z_reference"; left = 0, right = 13e3) slice(data; time = 24 * 60 * 60) diff --git a/src/utils/common_spaces.jl b/src/utils/common_spaces.jl index f12e89dc3b4..7feb5f38059 100644 --- a/src/utils/common_spaces.jl +++ b/src/utils/common_spaces.jl @@ -120,14 +120,6 @@ function make_hybrid_spaces( hypsography = Hypsography.LinearAdaption(Geometry.ZPoint.(z_surface)) end - @show device - @show ClimaComms.device(Fields.coordinate_field(h_space)) - @show ClimaComms.device(z_grid) - @show ClimaComms.device(z_surface) - @show ClimaComms.device(Geometry.ZPoint.(z_surface)) - @show typeof(h_space) - @show typeof(z_grid) - @show typeof(z_surface) end grid = Grids.ExtrudedFiniteDifferenceGrid(h_grid, z_grid, hypsography; deep) # TODO: return the grid