Skip to content

Commit db9ff76

Browse files
committed
api: move interp coefficient inside most inner loop
1 parent d0d2fcb commit db9ff76

File tree

10 files changed

+47
-41
lines changed

10 files changed

+47
-41
lines changed

devito/operations/interpolators.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ def _inject(self, field, expr, implicit_dims=None):
287287
injection expression, but that should be honored when constructing
288288
the operator.
289289
"""
290-
implicit_dims = self._augment_implicit_dims(implicit_dims)
290+
implicit_dims = self._augment_implicit_dims(implicit_dims) + self._rdim
291291

292292
# Make iterable to support inject((u, v), expr=expr)
293293
# or inject((u, v), expr=(expr1, expr2))

devito/passes/iet/parpragma.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -285,10 +285,6 @@ def _select_candidates(self, candidates):
285285
if i.is_Vectorized:
286286
break
287287

288-
# Also, we do not want to collapse small atomic reductions
289-
if i.is_ParallelAtomic and i.dim.is_Custom:
290-
break
291-
292288
# Would there be enough work per parallel iteration?
293289
nested = candidates[n+1:]
294290
if nested:
@@ -299,7 +295,8 @@ def _select_candidates(self, candidates):
299295
except TypeError:
300296
pass
301297

302-
collapsable.append(i)
298+
if not i.is_ParallelAtomic or nested:
299+
collapsable.append(i)
303300

304301
# Give a score to this candidate, based on the number of fully-parallel
305302
# Iterations and their position (i.e. outermost to innermost) in the nest

devito/symbolics/printer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@ def _print_Mod(self, expr):
102102
args = ['(%s)' % self._print(a) for a in expr.args]
103103
return '%'.join(args)
104104

105+
def _print_Mul(self, expr):
106+
term = super()._print_Mul(expr)
107+
return term.replace("(-1)*", "-")
108+
105109
def _print_Min(self, expr):
106110
if has_integer_args(*expr.args) and len(expr.args) == 2:
107111
return "MIN(%s)" % self._print(expr.args)[1:-1]

tests/test_buffering.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def test_over_injection():
272272

273273
# Check generated code
274274
assert len(retrieve_iteration_tree(op1)) == \
275-
8 + bool(configuration['language'] != 'C')
275+
7 + 2*int(configuration['language'] != 'C')
276276
buffers = [i for i in FindSymbols().visit(op1) if i.is_Array]
277277
assert len(buffers) == 1
278278

tests/test_dle.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -187,14 +187,9 @@ def test_cache_blocking_structure_optrelax():
187187

188188
op = Operator(eqns, opt=('advanced', {'blockrelax': True}))
189189

190-
bns, _ = assert_blocking(op, {'p_src0_blk0', 'x0_blk0', 'p_src1_blk0'})
190+
bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0'})
191191

192192
iters = FindNodes(Iteration).visit(bns['p_src0_blk0'])
193-
assert len(iters) == 2
194-
assert iters[0].dim.is_Block
195-
assert iters[1].dim.is_Block
196-
197-
iters = FindNodes(Iteration).visit(bns['p_src1_blk0'])
198193
assert len(iters) == 5
199194
assert iters[0].dim.is_Block
200195
assert iters[1].dim.is_Block
@@ -291,7 +286,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
291286
'openmp': True,
292287
'par-collapse-ncores': 1}))
293288

294-
assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
289+
assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'],
295290
't,p_s0_blk0,p_s,rsx,rsy')
296291

297292

@@ -750,13 +745,14 @@ def test_array_sum_reduction(self, so, dim):
750745
iterations = FindNodes(Iteration).visit(op)
751746
parallelized = iterations[dim+1]
752747
assert parallelized.pragmas
753-
if parallelized is iterations[-1]:
748+
if parallelized.dim is iterations[-1]:
754749
# With the `f[z] += u[t0][x + 1][y + 1][z + 1] + 1` expr, the innermost
755750
# `z` Iteration gets parallelized, nothing is collapsed, hence no
756751
# reduction is required
757752
assert "reduction" not in parallelized.pragmas[0].value
758753
elif Ompizer._support_array_reduction(configuration['compiler']):
759-
assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value
754+
if "collapse" in parallelized.pragmas[0].value:
755+
assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value
760756
else:
761757
# E.g. old GCC's
762758
assert "atomic update" in str(iterations[-1])
@@ -817,8 +813,10 @@ def test_incs_no_atomic(self):
817813
# All loops get collapsed, but the `y` and `z` loops are PARALLEL_IF_ATOMIC,
818814
# hence an atomic pragma is expected
819815
op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True,
820-
'par-collapse-ncores': 1}))
821-
assert 'collapse(3)' in str(op0)
816+
'par-collapse-ncores': 1,
817+
'par-collapse-work': 0}))
818+
819+
assert 'collapse(2)' in str(op0)
822820
assert 'atomic' in str(op0)
823821

824822
# Now only `x` is parallelized
@@ -928,7 +926,6 @@ def test_simd_space_invariant(self):
928926
assert 'omp simd' in iterations[3].pragmas[0].value
929927

930928
op.apply()
931-
print(op._lib)
932929
assert np.isclose(np.linalg.norm(f.data), 37.1458, rtol=1e-5)
933930

934931
def test_parallel_prec_inject(self):
@@ -955,6 +952,14 @@ def test_parallel_prec_inject(self):
955952
assert not iterations[0].pragmas
956953
assert 'omp for' in iterations[1].pragmas[0].value
957954

955+
op0 = Operator(eqns, opt=('advanced', {'openmp': True,
956+
'par-collapse-ncores': 1,
957+
'par-collapse-work': 1}))
958+
iterations = FindNodes(Iteration).visit(op0)
959+
960+
assert not iterations[0].pragmas
961+
assert 'omp for collapse(2)' in iterations[1].pragmas[0].value
962+
958963

959964
class TestNestedParallelism(object):
960965

@@ -1007,6 +1012,7 @@ def test_collapsing(self):
10071012

10081013
# Does it produce the right result
10091014
op.apply(t_M=9)
1015+
10101016
assert np.all(u.data[0] == 10)
10111017

10121018
bns, _ = assert_blocking(op, {'x0_blk0'})

tests/test_dse.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ def test_scheduling_after_rewrite():
4242
trees = retrieve_iteration_tree(op)
4343

4444
# Check loop nest structure
45-
assert all(i.dim is j for i, j in zip(trees[1], grid.dimensions)) # time invariant
46-
assert trees[2].root.dim is grid.time_dim
47-
assert all(trees[2].root.dim is tree.root.dim for tree in trees[2:])
45+
assert all(i.dim is j for i, j in zip(trees[0], grid.dimensions)) # time invariant
46+
assert trees[1].root.dim is grid.time_dim
47+
assert all(trees[1].root.dim is tree.root.dim for tree in trees[1:])
4848

4949

5050
@pytest.mark.parametrize('exprs,expected,min_cost', [
@@ -1665,7 +1665,7 @@ def test_drop_redundants_after_fusion(self, rotate):
16651665
op = Operator(eqns, opt=('advanced', {'cire-rotate': rotate}))
16661666

16671667
arrays = [i for i in FindSymbols().visit(op) if i.is_Array]
1668-
assert len(arrays) == 4
1668+
assert len(arrays) == 2
16691669
assert all(i._mem_heap and not i._mem_external for i in arrays)
16701670

16711671
def test_full_shape_big_temporaries(self):
@@ -2689,10 +2689,9 @@ def test_fullopt(self):
26892689
assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001)
26902690

26912691
assert summary1[('section0', None)].ops == 9
2692-
assert summary1[('section1', None)].ops == 9
2693-
assert summary1[('section2', None)].ops == 31
2694-
assert summary1[('section3', None)].ops == 26
2695-
assert np.isclose(summary1[('section2', None)].oi, 1.767, atol=0.001)
2692+
assert summary1[('section1', None)].ops == 31
2693+
assert summary1[('section2', None)].ops == 88
2694+
assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001)
26962695

26972696
assert np.allclose(u0.data, u1.data, atol=10e-5)
26982697
assert np.allclose(rec0.data, rec1.data, atol=10e-5)
@@ -2752,8 +2751,8 @@ def test_fullopt(self):
27522751
assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1)
27532752

27542753
# Check expected opcount/oi
2755-
assert summary[('section3', None)].ops == 92
2756-
assert np.isclose(summary[('section3', None)].oi, 2.074, atol=0.001)
2754+
assert summary[('section2', None)].ops == 92
2755+
assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001)
27572756

27582757
# With optimizations enabled, there should be exactly four BlockDimensions
27592758
op = wavesolver.op_fwd()
@@ -2768,10 +2767,10 @@ def test_fullopt(self):
27682767
# * all of the six Arrays are allocated on the heap
27692768
# * with OpenMP:
27702769
# four Arrays are defined globally for the cos/sin temporaries
2771-
# six Arrays are defined globally for the sparse positions temporaries
2770+
# three Arrays are defined globally for the sparse positions temporaries
27722771
# and two additional block-sized Arrays are defined locally
27732772
arrays = [i for i in FindSymbols().visit(op) if i.is_Array]
2774-
extra_arrays = 2+6
2773+
extra_arrays = 2+3
27752774
assert len(arrays) == 4 + extra_arrays
27762775
assert all(i._mem_heap and not i._mem_external for i in arrays)
27772776
bns, pbs = assert_blocking(op, {'x0_blk0'})
@@ -2807,7 +2806,7 @@ def test_fullopt_w_mpi(self):
28072806
def test_opcounts(self, space_order, expected):
28082807
op = self.tti_operator(opt='advanced', space_order=space_order)
28092808
sections = list(op.op_fwd()._profiler._sections.values())
2810-
assert sections[3].sops == expected
2809+
assert sections[2].sops == expected
28112810

28122811
@switchconfig(profiling='advanced')
28132812
@pytest.mark.parametrize('space_order,expected', [
@@ -2817,8 +2816,8 @@ def test_opcounts_adjoint(self, space_order, expected):
28172816
wavesolver = self.tti_operator(opt=('advanced', {'openmp': False}))
28182817
op = wavesolver.op_adj()
28192818

2820-
assert op._profiler._sections['section3'].sops == expected
2821-
assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+6
2819+
assert op._profiler._sections['section2'].sops == expected
2820+
assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3
28222821

28232822

28242823
class TestTTIv2(object):

tests/test_gpu_openacc.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def test_tile_insteadof_collapse(self, par_tile):
102102
opt=('advanced', {'par-tile': par_tile}))
103103

104104
trees = retrieve_iteration_tree(op)
105-
assert len(trees) == 6
105+
assert len(trees) == 4
106106

107107
assert trees[1][1].pragmas[0].value ==\
108108
'acc parallel loop tile(32,4,4) present(u)'
@@ -130,7 +130,7 @@ def test_multiple_tile_sizes(self, par_tile):
130130
opt=('advanced', {'par-tile': par_tile}))
131131

132132
trees = retrieve_iteration_tree(op)
133-
assert len(trees) == 6
133+
assert len(trees) == 4
134134

135135
assert trees[1][1].pragmas[0].value ==\
136136
'acc parallel loop tile(32,4,4) present(u)'

tests/test_gpu_openmp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ def test_timeparallel_reduction(self):
265265
assert not tree.root.pragmas
266266
assert len(tree[1].pragmas) == 1
267267
assert tree[1].pragmas[0].value ==\
268-
('omp target teams distribute parallel for collapse(3)'
268+
('omp target teams distribute parallel for collapse(2)'
269269
' reduction(+:f[0])')
270270

271271

tests/test_mpi.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2498,8 +2498,8 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save):
24982498
op_adj = solver.op_adj()
24992499
adj_calls = FindNodes(Call).visit(op_adj)
25002500

2501-
# one halo, 2 * ndim memalign and free (pos temp src/rec)
2502-
sf_calls = 2 * len(shape) + 2 * len(shape)
2501+
# one halo, ndim memalign and free (pos temp rec)
2502+
sf_calls = 2 * len(shape)
25032503
assert len(fwd_calls) == 1 + sf_calls
25042504
assert len(adj_calls) == 1 + sf_calls
25052505

tests/test_operator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1803,7 +1803,7 @@ def test_scheduling_sparse_functions(self):
18031803
# `trees` than 6
18041804
op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False}))
18051805
trees = retrieve_iteration_tree(op)
1806-
assert len(trees) == 6
1806+
assert len(trees) == 5
18071807
# Time loop not shared due to the WAR
18081808
assert trees[0][0].dim is time and trees[0][0] is trees[1][0] # this IS shared
18091809
assert trees[1][0] is not trees[3][0]
@@ -1813,7 +1813,7 @@ def test_scheduling_sparse_functions(self):
18131813
eqn2 = sf1.inject(u1.forward, expr=sf1)
18141814
op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False}))
18151815
trees = retrieve_iteration_tree(op)
1816-
assert len(trees) == 6
1816+
assert len(trees) == 5
18171817
assert all(trees[0][0] is i[0] for i in trees)
18181818

18191819
def test_scheduling_with_free_dims(self):

0 commit comments

Comments
 (0)