diff --git a/devito/operations/interpolators.py b/devito/operations/interpolators.py index b480c9f1139..e082848ffca 100644 --- a/devito/operations/interpolators.py +++ b/devito/operations/interpolators.py @@ -287,7 +287,7 @@ def _inject(self, field, expr, implicit_dims=None): injection expression, but that should be honored when constructing the operator. """ - implicit_dims = self._augment_implicit_dims(implicit_dims) + implicit_dims = self._augment_implicit_dims(implicit_dims) + self._rdim # Make iterable to support inject((u, v), expr=expr) # or inject((u, v), expr=(expr1, expr2)) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 016726c0017..1d1ca334d2d 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -285,10 +285,6 @@ def _select_candidates(self, candidates): if i.is_Vectorized: break - # Also, we do not want to collapse small atomic reductions - if i.is_ParallelAtomic and i.dim.is_Custom: - break - # Would there be enough work per parallel iteration? nested = candidates[n+1:] if nested: @@ -299,7 +295,8 @@ def _select_candidates(self, candidates): except TypeError: pass - collapsable.append(i) + if not i.is_ParallelAtomic or nested: + collapsable.append(i) # Give a score to this candidate, based on the number of fully-parallel # Iterations and their position (i.e. outermost to innermost) in the nest diff --git a/devito/symbolics/printer.py b/devito/symbolics/printer.py index c5f4e19e4c7..3539f64aeb4 100644 --- a/devito/symbolics/printer.py +++ b/devito/symbolics/printer.py @@ -102,6 +102,10 @@ def _print_Mod(self, expr): args = ['(%s)' % self._print(a) for a in expr.args] return '%'.join(args) + def _print_Mul(self, expr): + term = super()._print_Mul(expr) + return term.replace("(-1)*", "-") + def _print_Min(self, expr): if has_integer_args(*expr.args) and len(expr.args) == 2: return "MIN(%s)" % self._print(expr.args)[1:-1] diff --git a/tests/test_buffering.py b/tests/test_buffering.py index b7f59e61a5a..05587c04030 100644 --- a/tests/test_buffering.py +++ b/tests/test_buffering.py @@ -272,7 +272,7 @@ def test_over_injection(): # Check generated code assert len(retrieve_iteration_tree(op1)) == \ - 8 + bool(configuration['language'] != 'C') + 7 + 2*int(configuration['language'] != 'C') buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] assert len(buffers) == 1 diff --git a/tests/test_dle.py b/tests/test_dle.py index 56484c40425..f98eaa0ed36 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -187,14 +187,9 @@ def test_cache_blocking_structure_optrelax(): op = Operator(eqns, opt=('advanced', {'blockrelax': True})) - bns, _ = assert_blocking(op, {'p_src0_blk0', 'x0_blk0', 'p_src1_blk0'}) + bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0'}) iters = FindNodes(Iteration).visit(bns['p_src0_blk0']) - assert len(iters) == 2 - assert iters[0].dim.is_Block - assert iters[1].dim.is_Block - - iters = FindNodes(Iteration).visit(bns['p_src1_blk0']) assert len(iters) == 5 assert iters[0].dim.is_Block assert iters[1].dim.is_Block @@ -291,7 +286,7 @@ def test_cache_blocking_structure_optrelax_prec_inject(): 'openmp': True, 'par-collapse-ncores': 1})) - assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], + assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'], 't,p_s0_blk0,p_s,rsx,rsy') @@ -750,13 +745,14 @@ def test_array_sum_reduction(self, so, dim): iterations = FindNodes(Iteration).visit(op) parallelized = iterations[dim+1] assert parallelized.pragmas - if parallelized is iterations[-1]: + if parallelized.dim is iterations[-1]: # With the `f[z] += u[t0][x + 1][y + 1][z + 1] + 1` expr, the innermost # `z` Iteration gets parallelized, nothing is collapsed, hence no # reduction is required assert "reduction" not in parallelized.pragmas[0].value elif Ompizer._support_array_reduction(configuration['compiler']): - assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value + if "collapse" in parallelized.pragmas[0].value: + assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value else: # E.g. old GCC's assert "atomic update" in str(iterations[-1]) @@ -817,8 +813,10 @@ def test_incs_no_atomic(self): # All loops get collapsed, but the `y` and `z` loops are PARALLEL_IF_ATOMIC, # hence an atomic pragma is expected op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True, - 'par-collapse-ncores': 1})) - assert 'collapse(3)' in str(op0) + 'par-collapse-ncores': 1, + 'par-collapse-work': 0})) + + assert 'collapse(2)' in str(op0) assert 'atomic' in str(op0) # Now only `x` is parallelized @@ -928,7 +926,6 @@ def test_simd_space_invariant(self): assert 'omp simd' in iterations[3].pragmas[0].value op.apply() - print(op._lib) assert np.isclose(np.linalg.norm(f.data), 37.1458, rtol=1e-5) def test_parallel_prec_inject(self): @@ -955,6 +952,14 @@ def test_parallel_prec_inject(self): assert not iterations[0].pragmas assert 'omp for' in iterations[1].pragmas[0].value + op0 = Operator(eqns, opt=('advanced', {'openmp': True, + 'par-collapse-ncores': 1, + 'par-collapse-work': 1})) + iterations = FindNodes(Iteration).visit(op0) + + assert not iterations[0].pragmas + assert 'omp for collapse(2)' in iterations[1].pragmas[0].value + class TestNestedParallelism(object): @@ -1007,6 +1012,7 @@ def test_collapsing(self): # Does it produce the right result op.apply(t_M=9) + assert np.all(u.data[0] == 10) bns, _ = assert_blocking(op, {'x0_blk0'}) diff --git a/tests/test_dse.py b/tests/test_dse.py index 2904597128f..4a868bc813f 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -42,9 +42,9 @@ def test_scheduling_after_rewrite(): trees = retrieve_iteration_tree(op) # Check loop nest structure - assert all(i.dim is j for i, j in zip(trees[1], grid.dimensions)) # time invariant - assert trees[2].root.dim is grid.time_dim - assert all(trees[2].root.dim is tree.root.dim for tree in trees[2:]) + assert all(i.dim is j for i, j in zip(trees[0], grid.dimensions)) # time invariant + assert trees[1].root.dim is grid.time_dim + assert all(trees[1].root.dim is tree.root.dim for tree in trees[1:]) @pytest.mark.parametrize('exprs,expected,min_cost', [ @@ -1665,7 +1665,7 @@ def test_drop_redundants_after_fusion(self, rotate): op = Operator(eqns, opt=('advanced', {'cire-rotate': rotate})) arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - assert len(arrays) == 4 + assert len(arrays) == 2 assert all(i._mem_heap and not i._mem_external for i in arrays) def test_full_shape_big_temporaries(self): @@ -2689,10 +2689,9 @@ def test_fullopt(self): assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001) assert summary1[('section0', None)].ops == 9 - assert summary1[('section1', None)].ops == 9 - assert summary1[('section2', None)].ops == 31 - assert summary1[('section3', None)].ops == 26 - assert np.isclose(summary1[('section2', None)].oi, 1.767, atol=0.001) + assert summary1[('section1', None)].ops == 31 + assert summary1[('section2', None)].ops == 88 + assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001) assert np.allclose(u0.data, u1.data, atol=10e-5) assert np.allclose(rec0.data, rec1.data, atol=10e-5) @@ -2752,8 +2751,8 @@ def test_fullopt(self): assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1) # Check expected opcount/oi - assert summary[('section3', None)].ops == 92 - assert np.isclose(summary[('section3', None)].oi, 2.074, atol=0.001) + assert summary[('section2', None)].ops == 92 + assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001) # With optimizations enabled, there should be exactly four BlockDimensions op = wavesolver.op_fwd() @@ -2768,10 +2767,10 @@ def test_fullopt(self): # * all of the six Arrays are allocated on the heap # * with OpenMP: # four Arrays are defined globally for the cos/sin temporaries - # six Arrays are defined globally for the sparse positions temporaries + # 3 Arrays are defined globally for the sparse positions temporaries # and two additional bock-sized Arrays are defined locally arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - extra_arrays = 2+6 + extra_arrays = 2+3 assert len(arrays) == 4 + extra_arrays assert all(i._mem_heap and not i._mem_external for i in arrays) bns, pbs = assert_blocking(op, {'x0_blk0'}) @@ -2807,7 +2806,7 @@ def test_fullopt_w_mpi(self): def test_opcounts(self, space_order, expected): op = self.tti_operator(opt='advanced', space_order=space_order) sections = list(op.op_fwd()._profiler._sections.values()) - assert sections[3].sops == expected + assert sections[2].sops == expected @switchconfig(profiling='advanced') @pytest.mark.parametrize('space_order,expected', [ @@ -2817,8 +2816,8 @@ def test_opcounts_adjoint(self, space_order, expected): wavesolver = self.tti_operator(opt=('advanced', {'openmp': False})) op = wavesolver.op_adj() - assert op._profiler._sections['section3'].sops == expected - assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+6 + assert op._profiler._sections['section2'].sops == expected + assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3 class TestTTIv2(object): diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index db92db3c83f..3ea0fe21c96 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -102,7 +102,7 @@ def test_tile_insteadof_collapse(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 4 assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4,4) present(u)' @@ -130,7 +130,7 @@ def test_multiple_tile_sizes(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 4 assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4,4) present(u)' diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index 29866508d85..bc2de717082 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -265,7 +265,7 @@ def test_timeparallel_reduction(self): assert not tree.root.pragmas assert len(tree[1].pragmas) == 1 assert tree[1].pragmas[0].value ==\ - ('omp target teams distribute parallel for collapse(3)' + ('omp target teams distribute parallel for collapse(2)' ' reduction(+:f[0])') diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 9afac22dd50..245e77bfd9e 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2498,8 +2498,8 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save): op_adj = solver.op_adj() adj_calls = FindNodes(Call).visit(op_adj) - # one halo, 2 * ndim memalign and free (pos temp src/rec) - sf_calls = 2 * len(shape) + 2 * len(shape) + # one halo, ndim memalign and free (pos temp rec) + sf_calls = 2 * len(shape) assert len(fwd_calls) == 1 + sf_calls assert len(adj_calls) == 1 + sf_calls diff --git a/tests/test_operator.py b/tests/test_operator.py index 8cf535dc328..667bf25eb4d 100644 --- a/tests/test_operator.py +++ b/tests/test_operator.py @@ -1803,7 +1803,7 @@ def test_scheduling_sparse_functions(self): # `trees` than 6 op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 5 # Time loop not shared due to the WAR assert trees[0][0].dim is time and trees[0][0] is trees[1][0] # this IS shared assert trees[1][0] is not trees[3][0] @@ -1813,7 +1813,7 @@ def test_scheduling_sparse_functions(self): eqn2 = sf1.inject(u1.forward, expr=sf1) op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 5 assert all(trees[0][0] is i[0] for i in trees) def test_scheduling_with_free_dims(self):