
Commit 3336c7c

Optimize volume integral kernels for larger arrays (less common use) (#116)
* Complete 1D
* Complete 2D
* Complete 3D
1 parent e41092d commit 3336c7c

3 files changed, +73 -113 lines changed


src/solvers/dg_1d.jl (+10 -16)
@@ -203,8 +203,7 @@ function noncons_volume_flux_kernel!(symmetric_flux_arr, noncons_flux_arr, u, de

         for ii in axes(u, 1)
             @inbounds begin
-                symmetric_flux_arr[ii, j1, j2, k] = symmetric_flux_node[ii] *
-                                                    derivative_split[j1, j2]
+                symmetric_flux_arr[ii, j1, j2, k] = symmetric_flux_node[ii] * derivative_split[j1, j2]
                 noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii]
             end
         end
@@ -221,16 +220,12 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, nonco

     if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3))
         @inbounds du[i, j, k] = zero(eltype(du)) # fuse `reset_du!` here
-        integral_contribution = zero(eltype(du))

         for ii in axes(du, 2)
-            @inbounds begin
-                du[i, j, k] += symmetric_flux_arr[i, j, ii, k]
-                integral_contribution += derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]
-            end
+            @inbounds du[i, j, k] += symmetric_flux_arr[i, j, ii, k] +
+                                     0.5f0 *
+                                     derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]
         end
-
-        @inbounds du[i, j, k] += 0.5f0 * integral_contribution
     end

     return nothing
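
Note: the fused loop above folds the removed `integral_contribution` accumulator into the running `du` update, applying the `0.5f0` factor per term instead of once after the loop. A minimal CPU sketch (plain `Array`s, made-up sizes, hypothetical names, not the kernel itself) of why the two forms agree up to floating-point reordering:

    function fused_equivalence_demo()
        n_nodes = 4
        sym   = rand(Float32, 3, n_nodes, n_nodes, 2)   # stands in for symmetric_flux_arr
        nonc  = rand(Float32, 3, n_nodes, n_nodes, 2)   # stands in for noncons_flux_arr
        split = rand(Float32, n_nodes, n_nodes)         # stands in for derivative_split
        i, j, k = 1, 2, 1

        # Old pattern: separate accumulator, scaled once after the loop
        du_old = 0.0f0
        integral_contribution = 0.0f0
        for ii in 1:n_nodes
            du_old += sym[i, j, ii, k]
            integral_contribution += split[j, ii] * nonc[i, j, ii, k]
        end
        du_old += 0.5f0 * integral_contribution

        # New pattern: one fused update per loop iteration
        du_new = 0.0f0
        for ii in 1:n_nodes
            du_new += sym[i, j, ii, k] + 0.5f0 * split[j, ii] * nonc[i, j, ii, k]
        end

        return du_old ≈ du_new   # true up to rounding
    end

    fused_equivalence_demo()
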
@@ -292,10 +287,9 @@ function noncons_volume_flux_integral_kernel!(du, u, derivative_split, derivativ

         # TODO: Avoid potential bank conflicts
         for tx in axes(du, 1)
-            @inbounds begin
-                shmem_value[tx, ty] += symmetric_flux_node[tx] * shmem_szero[thread, ty] +
-                                       0.5f0 * noncons_flux_node[tx] * shmem_split[thread, ty]
-            end
+            @inbounds shmem_value[tx, ty] += symmetric_flux_node[tx] * shmem_szero[thread, ty] +
+                                             0.5f0 *
+                                             noncons_flux_node[tx] * shmem_split[thread, ty]
         end
     end

@@ -796,7 +790,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms,
     # TODO: More checks before the kernel launch
     thread_per_block = size(du, 1) * size(du, 2)
     if thread_per_block > MAX_THREADS_PER_BLOCK
-        # TODO: How to optimize when size is large
+        # How to optimize when size is large?
         flux_arr = similar(u)

         flux_kernel = @cuda launch=false flux_kernel!(flux_arr, u, equations, flux)
@@ -834,7 +828,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::

     thread_per_block = size(du, 2)
     if thread_per_block > MAX_THREADS_PER_BLOCK
-        # TODO: How to optimize when size is large
+        # How to optimize when size is large?
         volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))

         volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr, u, equations,
@@ -875,7 +869,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::

     thread_per_block = size(du, 2)
     if thread_per_block > MAX_THREADS_PER_BLOCK
-        # TODO: How to optimize when size is large
+        # How to optimize when size is large?
         symmetric_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
         noncons_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
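
Note: the `thread_per_block > MAX_THREADS_PER_BLOCK` branches above handle the case where one element would need more threads than a single block allows; they fall back to precomputed flux arrays plus separate integral kernels, presumably in place of the fused shared-memory kernels shown earlier. For context, a minimal sketch of one common CUDA.jl launch pattern behind `@cuda launch=false` plus an occupancy query; `my_kernel!` and the sizes are hypothetical, and this is not necessarily the package's own helper:

    using CUDA

    function my_kernel!(y, x)                    # hypothetical stand-in for flux_kernel! etc.
        i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
        if i <= length(y)
            @inbounds y[i] = 2f0 * x[i]
        end
        return nothing
    end

    x = CUDA.rand(Float32, 2^20)
    y = similar(x)

    kernel = @cuda launch=false my_kernel!(y, x)   # compile without launching
    config = launch_configuration(kernel.fun)      # occupancy-based suggestion
    threads = min(length(y), config.threads)       # cap threads per block
    blocks = cld(length(y), threads)               # enough blocks to cover the array
    kernel(y, x; threads = threads, blocks = blocks)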

src/solvers/dg_2d.jl (+26 -40)
@@ -42,10 +42,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2)
         @inbounds du[i, j1, j2, k] = zero(eltype(du)) # fuse `reset_du!` here

         for ii in axes(du, 2)
-            @inbounds begin
-                du[i, j1, j2, k] += derivative_dhat[j1, ii] * flux_arr1[i, ii, j2, k]
-                du[i, j1, j2, k] += derivative_dhat[j2, ii] * flux_arr2[i, j1, ii, k]
-            end
+            @inbounds du[i, j1, j2, k] += derivative_dhat[j1, ii] * flux_arr1[i, ii, j2, k] +
+                                          derivative_dhat[j2, ii] * flux_arr2[i, j1, ii, k]
         end
     end

@@ -94,10 +92,8 @@ function flux_weak_form_kernel!(du, u, derivative_dhat,
     # Loop within one block to get weak form
     # TODO: Avoid potential bank conflicts
     for thread in 1:tile_width
-        @inbounds begin
-            value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1] +
-                     shmem_dhat[thread, ty2] * shmem_flux[tx, ty1, thread, 2]
-        end
+        @inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1] +
+                           shmem_dhat[thread, ty2] * shmem_flux[tx, ty1, thread, 2]
     end

     # Synchronization is not needed here if we use only one tile
@@ -154,10 +150,8 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_
         @inbounds du[i, j1, j2, k] = zero(eltype(du)) # fuse `reset_du!` here

         for ii in axes(du, 2)
-            @inbounds begin
-                du[i, j1, j2, k] += derivative_split[j1, ii] * volume_flux_arr1[i, j1, ii, j2, k]
-                du[i, j1, j2, k] += derivative_split[j2, ii] * volume_flux_arr2[i, j1, j2, ii, k]
-            end
+            @inbounds du[i, j1, j2, k] += derivative_split[j1, ii] * volume_flux_arr1[i, j1, ii, j2, k] +
+                                          derivative_split[j2, ii] * volume_flux_arr2[i, j1, j2, ii, k]
         end
     end

@@ -212,10 +206,8 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
         # Try another way to parallelize (ty1, ty2) with threads to ty3, then
         # consolidate each computation back to (ty1, ty2)
         for tx in axes(du, 1)
-            @inbounds begin
-                shmem_value[tx, ty1, ty2] += shmem_split[thread, ty1] * volume_flux_node1[tx] +
-                                             shmem_split[thread, ty2] * volume_flux_node2[tx]
-            end
+            @inbounds shmem_value[tx, ty1, ty2] += shmem_split[thread, ty1] * volume_flux_node1[tx] +
+                                                   shmem_split[thread, ty2] * volume_flux_node2[tx]
         end
     end

@@ -257,10 +249,9 @@ function noncons_volume_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, n

         for ii in axes(u, 1)
             @inbounds begin
-                symmetric_flux_arr1[ii, j1, j3, j2, k] = derivative_split[j1, j3] *
-                                                         symmetric_flux_node1[ii]
-                symmetric_flux_arr2[ii, j1, j2, j3, k] = derivative_split[j2, j3] *
-                                                         symmetric_flux_node2[ii]
+                symmetric_flux_arr1[ii, j1, j3, j2, k] = derivative_split[j1, j3] * symmetric_flux_node1[ii]
+                symmetric_flux_arr2[ii, j1, j2, j3, k] = derivative_split[j2, j3] * symmetric_flux_node2[ii]
+
                 noncons_flux_arr1[ii, j1, j3, j2, k] = noncons_flux_node1[ii]
                 noncons_flux_arr2[ii, j1, j2, j3, k] = noncons_flux_node2[ii]
             end
@@ -282,20 +273,15 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, symm
         j2 = rem(j - 1, size(du, 2)) + 1

         @inbounds du[i, j1, j2, k] = zero(eltype(du)) # fuse `reset_du!` here
-        integral_contribution = zero(eltype(du))

         for ii in axes(du, 2)
-            @inbounds begin
-                du[i, j1, j2, k] += symmetric_flux_arr1[i, j1, ii, j2, k]
-                du[i, j1, j2, k] += symmetric_flux_arr2[i, j1, j2, ii, k]
-                integral_contribution += derivative_split[j1, ii] *
-                                         noncons_flux_arr1[i, j1, ii, j2, k]
-                integral_contribution += derivative_split[j2, ii] *
-                                         noncons_flux_arr2[i, j1, j2, ii, k]
-            end
+            @inbounds du[i, j1, j2, k] += symmetric_flux_arr1[i, j1, ii, j2, k] +
+                                          symmetric_flux_arr2[i, j1, j2, ii, k] +
+                                          0.5f0 *
+                                          derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] +
+                                          0.5f0 *
+                                          derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k]
         end
-
-        @inbounds du[i, j1, j2, k] += 0.5f0 * integral_contribution
     end

     return nothing
@@ -362,12 +348,12 @@ function noncons_volume_flux_integral_kernel!(du, u, derivative_split, derivativ

         # TODO: Avoid potential bank conflicts
         for tx in axes(du, 1)
-            @inbounds begin
-                shmem_value[tx, ty1, ty2] += symmetric_flux_node1[tx] * shmem_szero[thread, ty1] +
-                                             symmetric_flux_node2[tx] * shmem_szero[thread, ty2] +
-                                             0.5f0 * noncons_flux_node1[tx] * shmem_split[thread, ty1] +
-                                             0.5f0 * noncons_flux_node2[tx] * shmem_split[thread, ty2]
-            end
+            @inbounds shmem_value[tx, ty1, ty2] += symmetric_flux_node1[tx] * shmem_szero[thread, ty1] +
+                                                   symmetric_flux_node2[tx] * shmem_szero[thread, ty2] +
+                                                   0.5f0 *
+                                                   noncons_flux_node1[tx] * shmem_split[thread, ty1] +
+                                                   0.5f0 *
+                                                   noncons_flux_node2[tx] * shmem_split[thread, ty2]
         end
     end

@@ -1260,7 +1246,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms,
     # TODO: More checks before the kernel launch
     thread_per_block = size(du, 1) * size(du, 2)^2
     if thread_per_block > MAX_THREADS_PER_BLOCK
-        # TODO: How to optimize when size is large
+        # How to optimize when size is large?
         flux_arr1 = similar(u)
         flux_arr2 = similar(u)

@@ -1300,7 +1286,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::

     thread_per_block = size(du, 2)^2
     if thread_per_block > MAX_THREADS_PER_BLOCK
-        # TODO: How to optimize when size is large
+        # How to optimize when size is large?
         volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                           size(u, 4))
         volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
@@ -1345,7 +1331,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::

     thread_per_block = size(du, 2)^2
     if thread_per_block > MAX_THREADS_PER_BLOCK
-        # TODO: How to optimize when size is large
+        # How to optimize when size is large?
         symmetric_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
                                              size(u, 4))
         symmetric_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2),
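
Note: a rough sense of why this fallback targets the less common large-array case: each precomputed 2D flux array above holds size(u, 1) * size(u, 2)^3 * size(u, 4) entries, and the branch allocates more than one of them. A back-of-the-envelope sketch with assumed sizes and Float32 storage (the real values depend on the equations, polynomial degree, and mesh):

    nvars, nnodes, nelements = 4, 8, 10_000      # assumed example sizes
    entries = nvars * nnodes^3 * nelements       # matches the CuArray{RealT}(undef, ...) dims above
    mib = entries * sizeof(Float32) / 2^20
    println(mib, " MiB per array")               # ≈ 78 MiB here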
