Skip to content

Commit

Permalink
Merge pull request #2390 from devitocodes/data-stream-improvs
Browse files Browse the repository at this point in the history
compiler: Improve data streaming backend
  • Loading branch information
FabioLuporini authored Jun 28, 2024
2 parents e37d6ff + 4d08839 commit 4e86b12
Show file tree
Hide file tree
Showing 14 changed files with 284 additions and 78 deletions.
8 changes: 6 additions & 2 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path',
'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power',
'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice',
# Brand-agnostic
'ANYCPU', 'ANYGPU',
# Intel CPUs
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
'SKX', 'KLX', 'CLX', 'CLK', 'SPR',
Expand Down Expand Up @@ -592,7 +594,7 @@ def get_platform():
pass

# Unable to detect platform. Stick to default...
return CPU64
return ANYCPU


class Platform:
Expand Down Expand Up @@ -893,7 +895,7 @@ def march(cls):


# CPUs
CPU64 = Cpu64('cpu64')
ANYCPU = Cpu64('cpu64')
CPU64_DUMMY = Intel64('cpu64-dummy', cores_logical=2, cores_physical=1, isa='sse')

INTEL64 = Intel64('intel64')
Expand Down Expand Up @@ -921,6 +923,8 @@ def march(cls):
POWER9 = Power('power9')

# Devices
ANYGPU = Cpu64('gpu')

NVIDIAX = NvidiaDevice('nvidiaX')

AMDGPUX = AmdDevice('amdgpuX')
Expand Down
12 changes: 10 additions & 2 deletions devito/core/operator.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from collections.abc import Iterable
from functools import cached_property

from devito.core.autotuning import autotune
from devito.exceptions import InvalidArgument, InvalidOperator
from devito.ir import FindSymbols
from devito.logger import warning
from devito.mpi.routines import mpi_registry
from devito.parameters import configuration
from devito.operator import Operator
from devito.tools import (as_tuple, is_integer, timed_pass,
UnboundTuple, UnboundedMultiTuple)
from devito.types import NThreads
from devito.types import NThreads, PThreadArray

__all__ = ['CoreOperator', 'CustomOperator',
# Optimization options
Expand Down Expand Up @@ -190,7 +192,7 @@ def _autotune(self, args, setup):

return args

@property
@cached_property
def nthreads(self):
nthreads = [i for i in self.input if isinstance(i, NThreads)]
if len(nthreads) == 0:
Expand All @@ -199,6 +201,12 @@ def nthreads(self):
assert len(nthreads) == 1
return nthreads.pop()

@cached_property
def npthreads(self):
    """
    Sum of the ``size`` of every PThreadArray found in the Operator body.

    The symbols are collected by visiting ``self.body`` with ``FindSymbols``;
    those that are not PThreadArray instances are ignored. Returns 0 when
    no PThreadArray is found. The result is cached after the first access.
    """
    found = FindSymbols().visit(self.body)
    return sum(pta.size for pta in found if isinstance(pta, PThreadArray))


class CoreOperator(BasicOperator):
pass
Expand Down
17 changes: 12 additions & 5 deletions devito/ir/clusters/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ def callback(self, clusters, prefix, seen=None):

key = lambda i: i in prefix[:-1] or i in hs.loc_indices
ispace = c.ispace.project(key)
# HaloTOuch are not parallel
# HaloTouches are not parallel
properties = c.properties.sequentialize()

halo_touch = c.rebuild(exprs=expr, ispace=ispace, properties=properties)
Expand Down Expand Up @@ -614,18 +614,25 @@ def _normalize_reductions_dense(cluster, sregistry, mapper):
# of the target backend
lhs, rhs = e.args

try:
f = rhs.function
except AttributeError:
f = None

if lhs.function.is_Array:
# Probably a compiler-generated reduction, e.g. via
# recursive compilation; it's an Array already, so nothing to do
processed.append(e)
elif rhs in mapper:
# Seen this RHS already, so reuse the Array that was created for it
processed.append(e.func(lhs, mapper[rhs].indexify()))
elif f and f.is_Array and sum(flatten(f._size_nodomain)) == 0:
# Special case: the RHS is an Array with no halo/padding, meaning
# that the written data values are contiguous in memory, hence
# we can simply reuse the Array itself as we're already in the
# desired memory layout
processed.append(e)
else:
# Here the LHS could be a Symbol or a user-level Function
# In the latter case we copy the data into a temporary Array
# because the Function might be padded, and reduction operations
# require, in general, the data values to be contiguous
name = sregistry.make_name()
try:
grid = cluster.grid
Expand Down
5 changes: 2 additions & 3 deletions devito/ir/stree/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,8 @@ def stree_build(clusters, profiler=None, **kwargs):
if i.is_Halo:
found = i
elif i.is_Sync:
if profiler._verbosity > 0 or not i.is_async:
attach_section(i)
section = None
attach_section(i)
section = None
break
elif i.is_Iteration:
if (i.dim.is_Time and SEQUENTIAL in i.properties):
Expand Down
9 changes: 8 additions & 1 deletion devito/mpi/halo_scheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,14 @@ def _uxreplace_dispatch_haloscheme(hs0, rule):
# They indeed do change
d1 = g.indices[d0]
loc_indices[d1] = v.indices[d0]
loc_dirs[d1] = hse0.loc_dirs[d0]

try:
loc_dirs[d1] = hse0.loc_dirs[d0]
except KeyError:
# E.g., `usave(cd, x, y)` and `usave.dx` in an
# adjoint Operator
assert d0.is_Conditional
loc_dirs[d1] = hse0.loc_dirs[d0.root]

if len(loc_indices) != len(hse0.loc_indices):
# Nope, let's try with the next Indexed, if any
Expand Down
Loading

0 comments on commit 4e86b12

Please sign in to comment.