@@ -42,10 +42,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2)
42
42
@inbounds du[i, j1, j2, k] = zero (eltype (du)) # fuse `reset_du!` here
43
43
44
44
for ii in axes (du, 2 )
45
- @inbounds begin
46
- du[i, j1, j2, k] += derivative_dhat[j1, ii] * flux_arr1[i, ii, j2, k]
47
- du[i, j1, j2, k] += derivative_dhat[j2, ii] * flux_arr2[i, j1, ii, k]
48
- end
45
+ @inbounds du[i, j1, j2, k] += derivative_dhat[j1, ii] * flux_arr1[i, ii, j2, k] +
46
+ derivative_dhat[j2, ii] * flux_arr2[i, j1, ii, k]
49
47
end
50
48
end
51
49
@@ -94,10 +92,8 @@ function flux_weak_form_kernel!(du, u, derivative_dhat,
94
92
# Loop within one block to get weak form
95
93
# TODO : Avoid potential bank conflicts
96
94
for thread in 1 : tile_width
97
- @inbounds begin
98
- value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1 ] +
99
- shmem_dhat[thread, ty2] * shmem_flux[tx, ty1, thread, 2 ]
100
- end
95
+ @inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1 ] +
96
+ shmem_dhat[thread, ty2] * shmem_flux[tx, ty1, thread, 2 ]
101
97
end
102
98
103
99
# Synchronization is not needed here if we use only one tile
@@ -154,10 +150,8 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_
154
150
@inbounds du[i, j1, j2, k] = zero (eltype (du)) # fuse `reset_du!` here
155
151
156
152
for ii in axes (du, 2 )
157
- @inbounds begin
158
- du[i, j1, j2, k] += derivative_split[j1, ii] * volume_flux_arr1[i, j1, ii, j2, k]
159
- du[i, j1, j2, k] += derivative_split[j2, ii] * volume_flux_arr2[i, j1, j2, ii, k]
160
- end
153
+ @inbounds du[i, j1, j2, k] += derivative_split[j1, ii] * volume_flux_arr1[i, j1, ii, j2, k] +
154
+ derivative_split[j2, ii] * volume_flux_arr2[i, j1, j2, ii, k]
161
155
end
162
156
end
163
157
@@ -212,10 +206,8 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
212
206
# Try another way to parallelize (ty1, ty2) with threads to ty3, then
213
207
# consolidate each computation back to (ty1, ty2)
214
208
for tx in axes (du, 1 )
215
- @inbounds begin
216
- shmem_value[tx, ty1, ty2] += shmem_split[thread, ty1] * volume_flux_node1[tx] +
217
- shmem_split[thread, ty2] * volume_flux_node2[tx]
218
- end
209
+ @inbounds shmem_value[tx, ty1, ty2] += shmem_split[thread, ty1] * volume_flux_node1[tx] +
210
+ shmem_split[thread, ty2] * volume_flux_node2[tx]
219
211
end
220
212
end
221
213
@@ -257,10 +249,9 @@ function noncons_volume_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, n
257
249
258
250
for ii in axes (u, 1 )
259
251
@inbounds begin
260
- symmetric_flux_arr1[ii, j1, j3, j2, k] = derivative_split[j1, j3] *
261
- symmetric_flux_node1[ii]
262
- symmetric_flux_arr2[ii, j1, j2, j3, k] = derivative_split[j2, j3] *
263
- symmetric_flux_node2[ii]
252
+ symmetric_flux_arr1[ii, j1, j3, j2, k] = derivative_split[j1, j3] * symmetric_flux_node1[ii]
253
+ symmetric_flux_arr2[ii, j1, j2, j3, k] = derivative_split[j2, j3] * symmetric_flux_node2[ii]
254
+
264
255
noncons_flux_arr1[ii, j1, j3, j2, k] = noncons_flux_node1[ii]
265
256
noncons_flux_arr2[ii, j1, j2, j3, k] = noncons_flux_node2[ii]
266
257
end
@@ -282,20 +273,15 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, symm
282
273
j2 = rem (j - 1 , size (du, 2 )) + 1
283
274
284
275
@inbounds du[i, j1, j2, k] = zero (eltype (du)) # fuse `reset_du!` here
285
- integral_contribution = zero (eltype (du))
286
276
287
277
for ii in axes (du, 2 )
288
- @inbounds begin
289
- du[i, j1, j2, k] += symmetric_flux_arr1[i, j1, ii, j2, k]
290
- du[i, j1, j2, k] += symmetric_flux_arr2[i, j1, j2, ii, k]
291
- integral_contribution += derivative_split[j1, ii] *
292
- noncons_flux_arr1[i, j1, ii, j2, k]
293
- integral_contribution += derivative_split[j2, ii] *
294
- noncons_flux_arr2[i, j1, j2, ii, k]
295
- end
278
+ @inbounds du[i, j1, j2, k] += symmetric_flux_arr1[i, j1, ii, j2, k] +
279
+ symmetric_flux_arr2[i, j1, j2, ii, k] +
280
+ 0.5f0 *
281
+ derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] +
282
+ 0.5f0 *
283
+ derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k]
296
284
end
297
-
298
- @inbounds du[i, j1, j2, k] += 0.5f0 * integral_contribution
299
285
end
300
286
301
287
return nothing
@@ -362,12 +348,12 @@ function noncons_volume_flux_integral_kernel!(du, u, derivative_split, derivativ
362
348
363
349
# TODO : Avoid potential bank conflicts
364
350
for tx in axes (du, 1 )
365
- @inbounds begin
366
- shmem_value[tx, ty1, ty2] += symmetric_flux_node1 [tx] * shmem_szero[thread, ty1 ] +
367
- symmetric_flux_node2[tx] * shmem_szero[thread, ty2] +
368
- 0.5f0 * noncons_flux_node1[tx] * shmem_split[thread, ty1] +
369
- 0.5f0 * noncons_flux_node2[tx] * shmem_split[thread, ty2]
370
- end
351
+ @inbounds shmem_value[tx, ty1, ty2] += symmetric_flux_node1[tx] * shmem_szero[thread, ty1] +
352
+ symmetric_flux_node2 [tx] * shmem_szero[thread, ty2 ] +
353
+ 0.5f0 *
354
+ noncons_flux_node1[tx] * shmem_split[thread, ty1] +
355
+ 0.5f0 *
356
+ noncons_flux_node2[tx] * shmem_split[thread, ty2]
371
357
end
372
358
end
373
359
@@ -1260,7 +1246,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms,
1260
1246
# TODO : More checks before the kernel launch
1261
1247
thread_per_block = size (du, 1 ) * size (du, 2 )^ 2
1262
1248
if thread_per_block > MAX_THREADS_PER_BLOCK
1263
- # TODO : How to optimize when size is large
1249
+ # How to optimize when size is large?
1264
1250
flux_arr1 = similar (u)
1265
1251
flux_arr2 = similar (u)
1266
1252
@@ -1300,7 +1286,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
1300
1286
1301
1287
thread_per_block = size (du, 2 )^ 2
1302
1288
if thread_per_block > MAX_THREADS_PER_BLOCK
1303
- # TODO : How to optimize when size is large
1289
+ # How to optimize when size is large?
1304
1290
volume_flux_arr1 = CuArray {RealT} (undef, size (u, 1 ), size (u, 2 ), size (u, 2 ), size (u, 2 ),
1305
1291
size (u, 4 ))
1306
1292
volume_flux_arr2 = CuArray {RealT} (undef, size (u, 1 ), size (u, 2 ), size (u, 2 ), size (u, 2 ),
@@ -1345,7 +1331,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
1345
1331
1346
1332
thread_per_block = size (du, 2 )^ 2
1347
1333
if thread_per_block > MAX_THREADS_PER_BLOCK
1348
- # TODO : How to optimize when size is large
1334
+ # How to optimize when size is large?
1349
1335
symmetric_flux_arr1 = CuArray {RealT} (undef, size (u, 1 ), size (u, 2 ), size (u, 2 ), size (u, 2 ),
1350
1336
size (u, 4 ))
1351
1337
symmetric_flux_arr2 = CuArray {RealT} (undef, size (u, 1 ), size (u, 2 ), size (u, 2 ), size (u, 2 ),
0 commit comments