api: move interp coefficient inside most inner loop
mloubout committed Jul 29, 2023
1 parent d0d2fcb commit db9ff76
Showing 10 changed files with 47 additions and 41 deletions.
devito/operations/interpolators.py (2 changes: 1 addition & 1 deletion)
@@ -287,7 +287,7 @@ def _inject(self, field, expr, implicit_dims=None):
injection expression, but that should be honored when constructing
the operator.
"""
implicit_dims = self._augment_implicit_dims(implicit_dims)
implicit_dims = self._augment_implicit_dims(implicit_dims) + self._rdim

# Make iterable to support inject((u, v), expr=expr)
# or inject((u, v), expr=(expr1, expr2))
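The hunk above appends `self._rdim` (the interpolation-radius dimensions) to the implicit dimensions of the injection expression, consistent with the commit title's goal of moving the interpolation coefficients into the innermost loops. A minimal sketch of the user-facing code path that exercises it, assuming a plain SparseTimeFunction injection (names and sizes chosen for illustration):

```python
from devito import Grid, TimeFunction, SparseTimeFunction, Operator

grid = Grid(shape=(11, 11))
u = TimeFunction(name='u', grid=grid, space_order=2)
src = SparseTimeFunction(name='src', grid=grid, npoint=1, nt=10)
src.coordinates.data[:] = 0.5   # arbitrary point location
src.data[:] = 1.0

# inject() goes through _inject() above; the interpolation-radius
# dimensions should now sit innermost in the generated loop nest
op = Operator(src.inject(field=u.forward, expr=src))
op.apply(time_M=8)
print(op)   # inspect the generated injection loop nest
```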
devito/passes/iet/parpragma.py (7 changes: 2 additions & 5 deletions)
@@ -285,10 +285,6 @@ def _select_candidates(self, candidates):
if i.is_Vectorized:
break

# Also, we do not want to collapse small atomic reductions
if i.is_ParallelAtomic and i.dim.is_Custom:
break

# Would there be enough work per parallel iteration?
nested = candidates[n+1:]
if nested:
@@ -299,7 +295,8 @@ def _select_candidates(self, candidates):
except TypeError:
pass

collapsable.append(i)
if not i.is_ParallelAtomic or nested:
collapsable.append(i)

# Give a score to this candidate, based on the number of fully-parallel
# Iterations and their position (i.e. outermost to innermost) in the nest
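In words, the new guard only keeps a PARALLEL_IF_ATOMIC Iteration as a collapse candidate when there are nested candidates beneath it, i.e. when there is enough inner work to amortise the atomic updates. A hedged paraphrase of the predicate (not the actual pass, just the condition it applies):

```python
def is_collapsable(iteration, nested):
    """Mirror of the `if not i.is_ParallelAtomic or nested` guard above.

    `nested` stands for the list of candidate Iterations below `iteration`.
    """
    return (not iteration.is_ParallelAtomic) or bool(nested)
```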
devito/symbolics/printer.py (4 changes: 4 additions & 0 deletions)
@@ -102,6 +102,10 @@ def _print_Mod(self, expr):
args = ['(%s)' % self._print(a) for a in expr.args]
return '%'.join(args)

def _print_Mul(self, expr):
term = super()._print_Mul(expr)
return term.replace("(-1)*", "-")

def _print_Min(self, expr):
if has_integer_args(*expr.args) and len(expr.args) == 2:
return "MIN(%s)" % self._print(expr.args)[1:-1]
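The new `_print_Mul` only post-processes the string produced by the parent printer: a product that would otherwise carry an explicit `(-1)*` factor is rendered with a unary minus instead. A standalone illustration of the string rewrite (toy inputs, not actual generated code):

```python
def clean_mul(term):
    # same replacement as _print_Mul above
    return term.replace("(-1)*", "-")

assert clean_mul("(-1)*a[x]*b[x]") == "-a[x]*b[x]"
assert clean_mul("c*d") == "c*d"   # untouched when no (-1) factor is present
```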
tests/test_buffering.py (2 changes: 1 addition & 1 deletion)
@@ -272,7 +272,7 @@ def test_over_injection():

# Check generated code
assert len(retrieve_iteration_tree(op1)) == \
8 + bool(configuration['language'] != 'C')
7 + 2*int(configuration['language'] != 'C')
buffers = [i for i in FindSymbols().visit(op1) if i.is_Array]
assert len(buffers) == 1

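The updated expectation only differs from the old one when the language is plain C, as a quick check of the arithmetic shows (illustration only):

```python
for lang in ('C', 'openmp'):
    old = 8 + bool(lang != 'C')        # previous expectation
    new = 7 + 2 * int(lang != 'C')     # updated expectation
    print(lang, old, new)              # C: 8 -> 7; openmp: 9 -> 9
```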
tests/test_dle.py (30 changes: 18 additions & 12 deletions)
@@ -187,14 +187,9 @@ def test_cache_blocking_structure_optrelax():

op = Operator(eqns, opt=('advanced', {'blockrelax': True}))

bns, _ = assert_blocking(op, {'p_src0_blk0', 'x0_blk0', 'p_src1_blk0'})
bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0'})

iters = FindNodes(Iteration).visit(bns['p_src0_blk0'])
assert len(iters) == 2
assert iters[0].dim.is_Block
assert iters[1].dim.is_Block

iters = FindNodes(Iteration).visit(bns['p_src1_blk0'])
assert len(iters) == 5
assert iters[0].dim.is_Block
assert iters[1].dim.is_Block
@@ -291,7 +286,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
'openmp': True,
'par-collapse-ncores': 1}))

assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'],
't,p_s0_blk0,p_s,rsx,rsy')


@@ -750,13 +745,14 @@ def test_array_sum_reduction(self, so, dim):
iterations = FindNodes(Iteration).visit(op)
parallelized = iterations[dim+1]
assert parallelized.pragmas
if parallelized is iterations[-1]:
if parallelized.dim is iterations[-1].dim:
# With the `f[z] += u[t0][x + 1][y + 1][z + 1] + 1` expr, the innermost
# `z` Iteration gets parallelized, nothing is collapsed, hence no
# reduction is required
assert "reduction" not in parallelized.pragmas[0].value
elif Ompizer._support_array_reduction(configuration['compiler']):
assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value
if "collapse" in parallelized.pragmas[0].value:
assert "reduction(+:f[0:f_vec->size[0]])" in parallelized.pragmas[0].value
else:
# E.g. old GCC's
assert "atomic update" in str(iterations[-1])
@@ -817,8 +813,10 @@ def test_incs_no_atomic(self):
# All loops get collapsed, but the `y` and `z` loops are PARALLEL_IF_ATOMIC,
# hence an atomic pragma is expected
op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True,
'par-collapse-ncores': 1}))
assert 'collapse(3)' in str(op0)
'par-collapse-ncores': 1,
'par-collapse-work': 0}))

assert 'collapse(2)' in str(op0)
assert 'atomic' in str(op0)

# Now only `x` is parallelized
@@ -928,7 +926,6 @@ def test_simd_space_invariant(self):
assert 'omp simd' in iterations[3].pragmas[0].value

op.apply()
print(op._lib)
assert np.isclose(np.linalg.norm(f.data), 37.1458, rtol=1e-5)

def test_parallel_prec_inject(self):
@@ -955,6 +952,14 @@ def test_parallel_prec_inject(self):
assert not iterations[0].pragmas
assert 'omp for' in iterations[1].pragmas[0].value

op0 = Operator(eqns, opt=('advanced', {'openmp': True,
'par-collapse-ncores': 1,
'par-collapse-work': 1}))
iterations = FindNodes(Iteration).visit(op0)

assert not iterations[0].pragmas
assert 'omp for collapse(2)' in iterations[1].pragmas[0].value


class TestNestedParallelism(object):

@@ -1007,6 +1012,7 @@ def test_collapsing(self):

# Does it produce the right result
op.apply(t_M=9)

assert np.all(u.data[0] == 10)

bns, _ = assert_blocking(op, {'x0_blk0'})
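Several of the updated tests steer collapsing through the `par-collapse-ncores` and `par-collapse-work` options. A minimal sketch of how those knobs are passed to an Operator, assuming a simple increment (the exact pragmas emitted depend on the expression and backend):

```python
from devito import Grid, TimeFunction, Operator, Inc

grid = Grid(shape=(8, 8, 8))
u = TimeFunction(name='u', grid=grid)

op = Operator(Inc(u.forward, 1),
              opt=('advanced', {'openmp': True,
                                'par-collapse-ncores': 1,
                                'par-collapse-work': 0}))
print(op)   # check which loops end up inside a collapse(...) clause
```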
tests/test_dse.py (29 changes: 14 additions & 15 deletions)
@@ -42,9 +42,9 @@ def test_scheduling_after_rewrite():
trees = retrieve_iteration_tree(op)

# Check loop nest structure
assert all(i.dim is j for i, j in zip(trees[1], grid.dimensions)) # time invariant
assert trees[2].root.dim is grid.time_dim
assert all(trees[2].root.dim is tree.root.dim for tree in trees[2:])
assert all(i.dim is j for i, j in zip(trees[0], grid.dimensions)) # time invariant
assert trees[1].root.dim is grid.time_dim
assert all(trees[1].root.dim is tree.root.dim for tree in trees[1:])


@pytest.mark.parametrize('exprs,expected,min_cost', [
@@ -1665,7 +1665,7 @@ def test_drop_redundants_after_fusion(self, rotate):
op = Operator(eqns, opt=('advanced', {'cire-rotate': rotate}))

arrays = [i for i in FindSymbols().visit(op) if i.is_Array]
assert len(arrays) == 4
assert len(arrays) == 2
assert all(i._mem_heap and not i._mem_external for i in arrays)

def test_full_shape_big_temporaries(self):
@@ -2689,10 +2689,9 @@ def test_fullopt(self):
assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001)

assert summary1[('section0', None)].ops == 9
assert summary1[('section1', None)].ops == 9
assert summary1[('section2', None)].ops == 31
assert summary1[('section3', None)].ops == 26
assert np.isclose(summary1[('section2', None)].oi, 1.767, atol=0.001)
assert summary1[('section1', None)].ops == 31
assert summary1[('section2', None)].ops == 88
assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001)

assert np.allclose(u0.data, u1.data, atol=10e-5)
assert np.allclose(rec0.data, rec1.data, atol=10e-5)
@@ -2752,8 +2751,8 @@ def test_fullopt(self):
assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1)

# Check expected opcount/oi
assert summary[('section3', None)].ops == 92
assert np.isclose(summary[('section3', None)].oi, 2.074, atol=0.001)
assert summary[('section2', None)].ops == 92
assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001)

# With optimizations enabled, there should be exactly four BlockDimensions
op = wavesolver.op_fwd()
@@ -2768,10 +2767,10 @@
# * all of the six Arrays are allocated on the heap
# * with OpenMP:
# four Arrays are defined globally for the cos/sin temporaries
# six Arrays are defined globally for the sparse positions temporaries
# 3 Arrays are defined globally for the sparse positions temporaries
# and two additional block-sized Arrays are defined locally
arrays = [i for i in FindSymbols().visit(op) if i.is_Array]
extra_arrays = 2+6
extra_arrays = 2+3
assert len(arrays) == 4 + extra_arrays
assert all(i._mem_heap and not i._mem_external for i in arrays)
bns, pbs = assert_blocking(op, {'x0_blk0'})
@@ -2807,7 +2806,7 @@ def test_fullopt_w_mpi(self):
def test_opcounts(self, space_order, expected):
op = self.tti_operator(opt='advanced', space_order=space_order)
sections = list(op.op_fwd()._profiler._sections.values())
assert sections[3].sops == expected
assert sections[2].sops == expected

@switchconfig(profiling='advanced')
@pytest.mark.parametrize('space_order,expected', [
@@ -2817,8 +2816,8 @@ def test_opcounts_adjoint(self, space_order, expected):
wavesolver = self.tti_operator(opt=('advanced', {'openmp': False}))
op = wavesolver.op_adj()

assert op._profiler._sections['section3'].sops == expected
assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+6
assert op._profiler._sections['section2'].sops == expected
assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3


class TestTTIv2(object):
tests/test_gpu_openacc.py (4 changes: 2 additions & 2 deletions)
@@ -102,7 +102,7 @@ def test_tile_insteadof_collapse(self, par_tile):
opt=('advanced', {'par-tile': par_tile}))

trees = retrieve_iteration_tree(op)
assert len(trees) == 6
assert len(trees) == 4

assert trees[1][1].pragmas[0].value ==\
'acc parallel loop tile(32,4,4) present(u)'
@@ -130,7 +130,7 @@ def test_multiple_tile_sizes(self, par_tile):
opt=('advanced', {'par-tile': par_tile}))

trees = retrieve_iteration_tree(op)
assert len(trees) == 6
assert len(trees) == 4

assert trees[1][1].pragmas[0].value ==\
'acc parallel loop tile(32,4,4) present(u)'
tests/test_gpu_openmp.py (2 changes: 1 addition & 1 deletion)
@@ -265,7 +265,7 @@ def test_timeparallel_reduction(self):
assert not tree.root.pragmas
assert len(tree[1].pragmas) == 1
assert tree[1].pragmas[0].value ==\
('omp target teams distribute parallel for collapse(3)'
('omp target teams distribute parallel for collapse(2)'
' reduction(+:f[0])')


tests/test_mpi.py (4 changes: 2 additions & 2 deletions)
@@ -2498,8 +2498,8 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save):
op_adj = solver.op_adj()
adj_calls = FindNodes(Call).visit(op_adj)

# one halo, 2 * ndim memalign and free (pos temp src/rec)
sf_calls = 2 * len(shape) + 2 * len(shape)
# one halo, ndim memalign and free (pos temp rec)
sf_calls = 2 * len(shape)
assert len(fwd_calls) == 1 + sf_calls
assert len(adj_calls) == 1 + sf_calls

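With the source position temporaries gone, only the receiver position temporaries remain: one memalign and one free per grid dimension, plus the single halo-exchange call. A worked example of the count for a 3D run (numbers illustrative):

```python
shape = (61, 61, 61)          # hypothetical 3D grid
sf_calls = 2 * len(shape)     # rec position temporaries: ndim memaligns + ndim frees
total = 1 + sf_calls          # plus the one halo-exchange call
assert total == 7
```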
tests/test_operator.py (4 changes: 2 additions & 2 deletions)
@@ -1803,7 +1803,7 @@ def test_scheduling_sparse_functions(self):
# `trees` than 6
op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False}))
trees = retrieve_iteration_tree(op)
assert len(trees) == 6
assert len(trees) == 5
# Time loop not shared due to the WAR
assert trees[0][0].dim is time and trees[0][0] is trees[1][0] # this IS shared
assert trees[1][0] is not trees[3][0]
@@ -1813,7 +1813,7 @@
eqn2 = sf1.inject(u1.forward, expr=sf1)
op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False}))
trees = retrieve_iteration_tree(op)
assert len(trees) == 6
assert len(trees) == 5
assert all(trees[0][0] is i[0] for i in trees)

def test_scheduling_with_free_dims(self):
