Skip to content

Commit 4e86b12

Browse files
Merge pull request #2390 from devitocodes/data-stream-improvs
compiler: Improve data streaming backend
2 parents e37d6ff + 4d08839 commit 4e86b12

File tree

14 files changed

+284
-78
lines changed

14 files changed

+284
-78
lines changed

devito/arch/archinfo.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path',
2020
'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power',
2121
'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice',
22+
# Brand-agnostic
23+
'ANYCPU', 'ANYGPU',
2224
# Intel CPUs
2325
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
2426
'SKX', 'KLX', 'CLX', 'CLK', 'SPR',
@@ -592,7 +594,7 @@ def get_platform():
592594
pass
593595

594596
# Unable to detect platform. Stick to default...
595-
return CPU64
597+
return ANYCPU
596598

597599

598600
class Platform:
@@ -893,7 +895,7 @@ def march(cls):
893895

894896

895897
# CPUs
896-
CPU64 = Cpu64('cpu64')
898+
ANYCPU = Cpu64('cpu64')
897899
CPU64_DUMMY = Intel64('cpu64-dummy', cores_logical=2, cores_physical=1, isa='sse')
898900

899901
INTEL64 = Intel64('intel64')
@@ -921,6 +923,8 @@ def march(cls):
921923
POWER9 = Power('power9')
922924

923925
# Devices
926+
ANYGPU = Device('gpu')  # Brand-agnostic accelerator: must be a Device, not a Cpu64
927+
924928
NVIDIAX = NvidiaDevice('nvidiaX')
925929

926930
AMDGPUX = AmdDevice('amdgpuX')

devito/core/operator.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
from collections.abc import Iterable
2+
from functools import cached_property
23

34
from devito.core.autotuning import autotune
45
from devito.exceptions import InvalidArgument, InvalidOperator
6+
from devito.ir import FindSymbols
57
from devito.logger import warning
68
from devito.mpi.routines import mpi_registry
79
from devito.parameters import configuration
810
from devito.operator import Operator
911
from devito.tools import (as_tuple, is_integer, timed_pass,
1012
UnboundTuple, UnboundedMultiTuple)
11-
from devito.types import NThreads
13+
from devito.types import NThreads, PThreadArray
1214

1315
__all__ = ['CoreOperator', 'CustomOperator',
1416
# Optimization options
@@ -190,7 +192,7 @@ def _autotune(self, args, setup):
190192

191193
return args
192194

193-
@property
195+
@cached_property
194196
def nthreads(self):
195197
nthreads = [i for i in self.input if isinstance(i, NThreads)]
196198
if len(nthreads) == 0:
@@ -199,6 +201,12 @@ def nthreads(self):
199201
assert len(nthreads) == 1
200202
return nthreads.pop()
201203

204+
@cached_property
205+
def npthreads(self):
206+
symbols = FindSymbols().visit(self.body)
207+
ptas = [i for i in symbols if isinstance(i, PThreadArray)]
208+
return sum(i.size for i in ptas)
209+
202210

203211
class CoreOperator(BasicOperator):
204212
pass

devito/ir/clusters/algorithms.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ def callback(self, clusters, prefix, seen=None):
433433

434434
key = lambda i: i in prefix[:-1] or i in hs.loc_indices
435435
ispace = c.ispace.project(key)
436-
# HaloTOuch are not parallel
436+
# HaloTouches are not parallel
437437
properties = c.properties.sequentialize()
438438

439439
halo_touch = c.rebuild(exprs=expr, ispace=ispace, properties=properties)
@@ -614,18 +614,25 @@ def _normalize_reductions_dense(cluster, sregistry, mapper):
614614
# of the target backend
615615
lhs, rhs = e.args
616616

617+
try:
618+
f = rhs.function
619+
except AttributeError:
620+
f = None
621+
617622
if lhs.function.is_Array:
618623
# Probably a compiler-generated reduction, e.g. via
619624
# recursive compilation; it's an Array already, so nothing to do
620625
processed.append(e)
621626
elif rhs in mapper:
622627
# Seen this RHS already, so reuse the Array that was created for it
623628
processed.append(e.func(lhs, mapper[rhs].indexify()))
629+
elif f and f.is_Array and sum(flatten(f._size_nodomain)) == 0:
630+
# Special case: the RHS is an Array with no halo/padding, meaning
631+
# that the written data values are contiguous in memory, hence
632+
# we can simply reuse the Array itself as we're already in the
633+
# desired memory layout
634+
processed.append(e)
624635
else:
625-
# Here the LHS could be a Symbol or a user-level Function
626-
# In the latter case we copy the data into a temporary Array
627-
# because the Function might be padded, and reduction operations
628-
# require, in general, the data values to be contiguous
629636
name = sregistry.make_name()
630637
try:
631638
grid = cluster.grid

devito/ir/stree/algorithms.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,8 @@ def stree_build(clusters, profiler=None, **kwargs):
126126
if i.is_Halo:
127127
found = i
128128
elif i.is_Sync:
129-
if profiler._verbosity > 0 or not i.is_async:
130-
attach_section(i)
131-
section = None
129+
attach_section(i)
130+
section = None
132131
break
133132
elif i.is_Iteration:
134133
if (i.dim.is_Time and SEQUENTIAL in i.properties):

devito/mpi/halo_scheme.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -622,7 +622,14 @@ def _uxreplace_dispatch_haloscheme(hs0, rule):
622622
# They indeed do change
623623
d1 = g.indices[d0]
624624
loc_indices[d1] = v.indices[d0]
625-
loc_dirs[d1] = hse0.loc_dirs[d0]
625+
626+
try:
627+
loc_dirs[d1] = hse0.loc_dirs[d0]
628+
except KeyError:
629+
# E.g., `usave(cd, x, y)` and `usave.dx` in an
630+
# adjoint Operator
631+
assert d0.is_Conditional
632+
loc_dirs[d1] = hse0.loc_dirs[d0.root]
626633

627634
if len(loc_indices) != len(hse0.loc_indices):
628635
# Nope, let's try with the next Indexed, if any

0 commit comments

Comments
 (0)