api: move interp coefficient inside most inner loop
mloubout committed Jul 29, 2023
1 parent d0d2fcb commit db9ff76
Showing 10 changed files with 47 additions and 41 deletions.
devito/operations/interpolators.py (2 changes: 1 addition & 1 deletion)
@@ -287,7 +287,7 @@ def _inject(self, field, expr, implicit_dims=None):
injection expression, but that should be honored when constructing
the operator.
"""
implicit_dims = self._augment_implicit_dims(implicit_dims)
implicit_dims = self._augment_implicit_dims(implicit_dims) + self._rdim

# Make iterable to support inject((u, v), expr=expr)
# or inject((u, v), expr=(expr1, expr2))
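The hunk above appends `self._rdim` (the interpolation-radius dimensions) to the implicit dimensions of the injection expression, consistent with the commit title's goal of moving the interpolation coefficients into the innermost loops. A minimal sketch of the user-facing code path that exercises it, assuming a plain SparseTimeFunction injection (names and sizes chosen for illustration):

```python
from devito import Grid, TimeFunction, SparseTimeFunction, Operator

grid = Grid(shape=(11, 11))
u = TimeFunction(name='u', grid=grid, space_order=2)
src = SparseTimeFunction(name='src', grid=grid, npoint=1, nt=10)
src.coordinates.data[:] = 0.5   # arbitrary point location
src.data[:] = 1.0

# inject() goes through _inject() above; the interpolation-radius
# dimensions should now sit innermost in the generated loop nest
op = Operator(src.inject(field=u.forward, expr=src))
op.apply(time_M=8)
print(op)   # inspect the generated injection loop nest
```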
devito/passes/iet/parpragma.py (7 changes: 2 additions & 5 deletions)
@@ -285,10 +285,6 @@ def _select_candidates(self, candidates):
if i.is_Vectorized:
break

# Also, we do not want to collapse small atomic reductions
if i.is_ParallelAtomic and i.dim.is_Custom:
break

# Would there be enough work per parallel iteration?
nested = candidates[n+1:]
if nested:
@@ -299,7 +295,8 @@ def _select_candidates(self, candidates):
except TypeError:
pass

collapsable.append(i)
if not i.is_ParallelAtomic or nested:
collapsable.append(i)

# Give a score to this candidate, based on the number of fully-parallel
# Iterations and their position (i.e. outermost to innermost) in the nest
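In words, the new guard only keeps a PARALLEL_IF_ATOMIC Iteration as a collapse candidate when there are nested candidates beneath it, i.e. when there is enough inner work to amortise the atomic updates. A hedged paraphrase of the predicate (not the actual pass, just the condition it applies):

```python
def is_collapsable(iteration, nested):
    """Mirror of the `if not i.is_ParallelAtomic or nested` guard above.

    `nested` stands for the list of candidate Iterations below `iteration`.
    """
    return (not iteration.is_ParallelAtomic) or bool(nested)
```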
devito/symbolics/printer.py (4 changes: 4 additions & 0 deletions)
@@ -102,6 +102,10 @@ def _print_Mod(self, expr):
args = ['(%s)' % self._print(a) for a in expr.args]
return '%'.join(args)

def _print_Mul(self, expr):
term = super()._print_Mul(expr)
return term.replace("(-1)*", "-")

def _print_Min(self, expr):
if has_integer_args(*expr.args) and len(expr.args) == 2:
return "MIN(%s)" % self._print(expr.args)[1:-1]
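The new `_print_Mul` only post-processes the string produced by the parent printer: a product that would otherwise carry an explicit `(-1)*` factor is rendered with a unary minus instead. A standalone illustration of the string rewrite (toy inputs, not actual generated code):

```python
def clean_mul(term):
    # same replacement as _print_Mul above
    return term.replace("(-1)*", "-")

assert clean_mul("(-1)*a[x]*b[x]") == "-a[x]*b[x]"
assert clean_mul("c*d") == "c*d"   # untouched when no (-1) factor is present
```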
tests/test_buffering.py (2 changes: 1 addition & 1 deletion)
@@ -272,7 +272,7 @@ def test_over_injection():

# Check generated code
assert len(retrieve_iteration_tree(op1)) == \
8 + bool(configuration['language'] != 'C')
7 + 2*int(configuration['language'] != 'C')
buffers = [i for i in FindSymbols().visit(op1) if i.is_Array]
assert len(buffers) == 1

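The updated expectation only differs from the old one when the language is plain C, as a quick check of the arithmetic shows (illustration only):

```python
for lang in ('C', 'openmp'):
    old = 8 + bool(lang != 'C')        # previous expectation
    new = 7 + 2 * int(lang != 'C')     # updated expectation
    print(lang, old, new)              # C: 8 -> 7; openmp: 9 -> 9
```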
tests/test_dle.py (30 changes: 18 additions & 12 deletions)
@@ -187,14 +187,9 @@ def test_cache_blocking_structure_optrelax():

op = Operator(eqns, opt=('advanced', {'blockrelax': True}))

bns, _ = assert_blocking(op, {'p_src0_blk0', 'x0_blk0', 'p_src1_blk0'})
bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0'})

iters = FindNodes(Iteration).visit(bns['p_src0_blk0'])
assert len(iters) == 2
assert iters[0].dim.is_Block
assert iters[1].dim.is_Block

iters = FindNodes(Iteration).visit(bns['p_src1_blk0'])
assert len(iters) == 5
assert iters[0].dim.is_Block
assert iters[1].dim.is_Block
@@ -291,7 +286,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
'openmp': True,
'par-collapse-ncores': 1}))

assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'],
't,p_s0_blk0,p_s,rsx,rsy')


@@ -750,13 +745,14 @@ def test_array_sum_reduction(self, so, dim):
iterations = FindNodes(Iteration).visit(op)
parallelized = iterations[dim+1]
assert parallelized.pragmas
if parallelized is iterations[-1]:
if parallelized.dim is iterations[-1].dim:
# With the `f[z] += u[t0][x + 1][y + 1][z + 1] + 1` expr, the innermost
# `z` Iteration gets parallelized, nothing is collapsed, hence no
# reduction is required
assert "reduction" not in parallelized.pragmas[0].value
elif Ompizer._support_array_reduction(configuration['compiler']):
assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value
if "collapse" in parallelized.pragmas[0].value:
assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value
else:
# E.g. old GCC's
assert "atomic update" in str(iterations[-1])
@@ -817,8 +813,10 @@ def test_incs_no_atomic(self):
# All loops get collapsed, but the `y` and `z` loops are PARALLEL_IF_ATOMIC,
# hence an atomic pragma is expected
op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True,
'par-collapse-ncores': 1}))
assert 'collapse(3)' in str(op0)
'par-collapse-ncores': 1,
'par-collapse-work': 0}))

assert 'collapse(2)' in str(op0)
assert 'atomic' in str(op0)

# Now only `x` is parallelized
@@ -928,7 +926,6 @@ def test_simd_space_invariant(self):
assert 'omp simd' in iterations[3].pragmas[0].value

op.apply()
print(op._lib)
assert np.isclose(np.linalg.norm(f.data), 37.1458, rtol=1e-5)

def test_parallel_prec_inject(self):
@@ -955,6 +952,14 @@ def test_parallel_prec_inject(self):
assert not iterations[0].pragmas
assert 'omp for' in iterations[1].pragmas[0].value

op0 = Operator(eqns, opt=('advanced', {'openmp': True,
'par-collapse-ncores': 1,
'par-collapse-work': 1}))
iterations = FindNodes(Iteration).visit(op0)

assert not iterations[0].pragmas
assert 'omp for collapse(2)' in iterations[1].pragmas[0].value


class TestNestedParallelism(object):

@@ -1007,6 +1012,7 @@ def test_collapsing(self):

# Does it produce the right result
op.apply(t_M=9)

assert np.all(u.data[0] == 10)

bns, _ = assert_blocking(op, {'x0_blk0'})
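Several of the updated tests steer collapsing through the `par-collapse-ncores` and `par-collapse-work` options. A minimal sketch of how those knobs are passed to an Operator, assuming a simple increment (the exact pragmas emitted depend on the expression and backend):

```python
from devito import Grid, TimeFunction, Operator, Inc

grid = Grid(shape=(8, 8, 8))
u = TimeFunction(name='u', grid=grid)

op = Operator(Inc(u.forward, 1),
              opt=('advanced', {'openmp': True,
                                'par-collapse-ncores': 1,
                                'par-collapse-work': 0}))
print(op)   # check which loops end up inside a collapse(...) clause
```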
tests/test_dse.py (29 changes: 14 additions & 15 deletions)
@@ -42,9 +42,9 @@ def test_scheduling_after_rewrite():
trees = retrieve_iteration_tree(op)

# Check loop nest structure
assert all(i.dim is j for i, j in zip(trees[1], grid.dimensions)) # time invariant
assert trees[2].root.dim is grid.time_dim
assert all(trees[2].root.dim is tree.root.dim for tree in trees[2:])
assert all(i.dim is j for i, j in zip(trees[0], grid.dimensions)) # time invariant
assert trees[1].root.dim is grid.time_dim
assert all(trees[1].root.dim is tree.root.dim for tree in trees[1:])


@pytest.mark.parametrize('exprs,expected,min_cost', [
@@ -1665,7 +1665,7 @@ def test_drop_redundants_after_fusion(self, rotate):
op = Operator(eqns, opt=('advanced', {'cire-rotate': rotate}))

arrays = [i for i in FindSymbols().visit(op) if i.is_Array]
assert len(arrays) == 4
assert len(arrays) == 2
assert all(i._mem_heap and not i._mem_external for i in arrays)

def test_full_shape_big_temporaries(self):
@@ -2689,10 +2689,9 @@ def test_fullopt(self):
assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001)

assert summary1[('section0', None)].ops == 9
assert summary1[('section1', None)].ops == 9
assert summary1[('section2', None)].ops == 31
assert summary1[('section3', None)].ops == 26
assert np.isclose(summary1[('section2', None)].oi, 1.767, atol=0.001)
assert summary1[('section1', None)].ops == 31
assert summary1[('section2', None)].ops == 88
assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001)

assert np.allclose(u0.data, u1.data, atol=10e-5)
assert np.allclose(rec0.data, rec1.data, atol=10e-5)
@@ -2752,8 +2751,8 @@ def test_fullopt(self):
assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1)

# Check expected opcount/oi
assert summary[('section3', None)].ops == 92
assert np.isclose(summary[('section3', None)].oi, 2.074, atol=0.001)
assert summary[('section2', None)].ops == 92
assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001)

# With optimizations enabled, there should be exactly four BlockDimensions
op = wavesolver.op_fwd()
@@ -2768,10 +2767,10 @@
# * all of the six Arrays are allocated on the heap
# * with OpenMP:
# four Arrays are defined globally for the cos/sin temporaries
# six Arrays are defined globally for the sparse positions temporaries
# 3 Arrays are defined globally for the sparse positions temporaries
# and two additional block-sized Arrays are defined locally
arrays = [i for i in FindSymbols().visit(op) if i.is_Array]
extra_arrays = 2+6
extra_arrays = 2+3
assert len(arrays) == 4 + extra_arrays
assert all(i._mem_heap and not i._mem_external for i in arrays)
bns, pbs = assert_blocking(op, {'x0_blk0'})
@@ -2807,7 +2806,7 @@ def test_fullopt_w_mpi(self):
def test_opcounts(self, space_order, expected):
op = self.tti_operator(opt='advanced', space_order=space_order)
sections = list(op.op_fwd()._profiler._sections.values())
assert sections[3].sops == expected
assert sections[2].sops == expected

@switchconfig(profiling='advanced')
@pytest.mark.parametrize('space_order,expected', [
@@ -2817,8 +2816,8 @@ def test_opcounts_adjoint(self, space_order, expected):
wavesolver = self.tti_operator(opt=('advanced', {'openmp': False}))
op = wavesolver.op_adj()

assert op._profiler._sections['section3'].sops == expected
assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+6
assert op._profiler._sections['section2'].sops == expected
assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3


class TestTTIv2(object):
tests/test_gpu_openacc.py (4 changes: 2 additions & 2 deletions)
@@ -102,7 +102,7 @@ def test_tile_insteadof_collapse(self, par_tile):
opt=('advanced', {'par-tile': par_tile}))

trees = retrieve_iteration_tree(op)
assert len(trees) == 6
assert len(trees) == 4

assert trees[1][1].pragmas[0].value ==\
'acc parallel loop tile(32,4,4) present(u)'
@@ -130,7 +130,7 @@ def test_multiple_tile_sizes(self, par_tile):
opt=('advanced', {'par-tile': par_tile}))

trees = retrieve_iteration_tree(op)
assert len(trees) == 6
assert len(trees) == 4

assert trees[1][1].pragmas[0].value ==\
'acc parallel loop tile(32,4,4) present(u)'
tests/test_gpu_openmp.py (2 changes: 1 addition & 1 deletion)
@@ -265,7 +265,7 @@ def test_timeparallel_reduction(self):
assert not tree.root.pragmas
assert len(tree[1].pragmas) == 1
assert tree[1].pragmas[0].value ==\
('omp target teams distribute parallel for collapse(3)'
('omp target teams distribute parallel for collapse(2)'
' reduction(+:f[0])')


tests/test_mpi.py (4 changes: 2 additions & 2 deletions)
@@ -2498,8 +2498,8 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save):
op_adj = solver.op_adj()
adj_calls = FindNodes(Call).visit(op_adj)

# one halo, 2 * ndim memalign and free (pos temp src/rec)
sf_calls = 2 * len(shape) + 2 * len(shape)
# one halo, ndim memalign and free (pos temp rec)
sf_calls = 2 * len(shape)
assert len(fwd_calls) == 1 + sf_calls
assert len(adj_calls) == 1 + sf_calls

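With the source position temporaries gone, only the receiver position temporaries remain: one memalign and one free per grid dimension, plus the single halo-exchange call. A worked example of the count for a 3D run (numbers illustrative):

```python
shape = (61, 61, 61)          # hypothetical 3D grid
sf_calls = 2 * len(shape)     # rec position temporaries: ndim memaligns + ndim frees
total = 1 + sf_calls          # plus the one halo-exchange call
assert total == 7
```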
tests/test_operator.py (4 changes: 2 additions & 2 deletions)
@@ -1803,7 +1803,7 @@ def test_scheduling_sparse_functions(self):
# `trees` than 6
op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False}))
trees = retrieve_iteration_tree(op)
assert len(trees) == 6
assert len(trees) == 5
# Time loop not shared due to the WAR
assert trees[0][0].dim is time and trees[0][0] is trees[1][0] # this IS shared
assert trees[1][0] is not trees[3][0]
@@ -1813,7 +1813,7 @@
eqn2 = sf1.inject(u1.forward, expr=sf1)
op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False}))
trees = retrieve_iteration_tree(op)
assert len(trees) == 6
assert len(trees) == 5
assert all(trees[0][0] is i[0] for i in trees)

def test_scheduling_with_free_dims(self):
