Skip to content

Commit 0597d00

Browse files
Merge pull request #2417 from devitocodes/restrain-device-blocking
compiler: Fix parlang reductions over >= 4 loops
2 parents 3e259d2 + 8c62ae9 commit 0597d00

File tree

5 files changed

+95
-29
lines changed

5 files changed

+95
-29
lines changed

devito/arch/archinfo.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
"""Collection of utilities to detect properties of the underlying architecture."""
22

3-
from subprocess import PIPE, Popen, DEVNULL, run
4-
53
from functools import cached_property
6-
import cpuinfo
4+
from subprocess import PIPE, Popen, DEVNULL, run
75
import ctypes
8-
import numpy as np
9-
import psutil
106
import re
117
import os
128
import sys
139
import json
1410

11+
import cpuinfo
12+
import numpy as np
13+
import psutil
14+
1515
from devito.logger import warning
1616
from devito.tools import as_tuple, all_equal, memoized_func
1717

@@ -665,6 +665,16 @@ def max_mem_trans_size(self, dtype):
665665
assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0
666666
return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize)
667667

668+
def limits(self, compiler=None, language=None):
669+
"""
670+
Return the architecture-specific limits for the given compiler and
671+
language.
672+
"""
673+
return {
674+
'max-par-dims': sys.maxsize,
675+
'max-block-dims': sys.maxsize,
676+
}
677+
668678

669679
class Cpu64(Platform):
670680

@@ -847,6 +857,12 @@ def memavail(self, deviceid=0):
847857
except (AttributeError, KeyError):
848858
return None
849859

860+
def limits(self, compiler=None, language=None):
861+
return {
862+
'max-par-dims': 3,
863+
'max-block-dims': 3,
864+
}
865+
850866

851867
class IntelDevice(Device):
852868

devito/ir/clusters/algorithms.py

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -504,13 +504,10 @@ def reduction_comms(clusters):
504504
return processed
505505

506506

507-
def normalize(clusters, **kwargs):
508-
options = kwargs['options']
509-
sregistry = kwargs['sregistry']
510-
507+
def normalize(clusters, sregistry=None, options=None, platform=None, **kwargs):
511508
clusters = normalize_nested_indexeds(clusters, sregistry)
512509
if options['mapify-reduce']:
513-
clusters = normalize_reductions_dense(clusters, sregistry)
510+
clusters = normalize_reductions_dense(clusters, sregistry, platform)
514511
else:
515512
clusters = normalize_reductions_minmax(clusters)
516513
clusters = normalize_reductions_sparse(clusters, sregistry)
@@ -594,31 +591,49 @@ def normalize_reductions_minmax(cluster):
594591
return init + [cluster.rebuild(processed)]
595592

596593

597-
def normalize_reductions_dense(cluster, sregistry):
594+
def normalize_reductions_dense(cluster, sregistry, platform):
598595
"""
599596
Extract the right-hand sides of reduction Eq's into temporaries.
600597
"""
601-
return _normalize_reductions_dense(cluster, sregistry, {})
598+
return _normalize_reductions_dense(cluster, {}, sregistry, platform)
602599

603600

604601
@cluster_pass(mode='dense')
605-
def _normalize_reductions_dense(cluster, sregistry, mapper):
606-
dims = [d for d in cluster.ispace.itdims
607-
if cluster.properties.is_parallel_atomic(d)]
608-
if not dims:
602+
def _normalize_reductions_dense(cluster, mapper, sregistry, platform):
603+
"""
604+
Transform augmented expressions whose left-hand side is a scalar into
605+
map-reduces.
606+
607+
Examples
608+
--------
609+
Given an increment expression such as
610+
611+
s += f(u[x], v[x], ...)
612+
613+
Turn it into
614+
615+
r[x] = f(u[x], v[x], ...)
616+
s += r[x]
617+
"""
618+
# The candidate Dimensions along which to perform the map part
619+
candidates = [d for d in cluster.ispace.itdims
620+
if cluster.properties.is_parallel_atomic(d)]
621+
if not candidates:
609622
return cluster
610623

624+
# If there are more parallel dimensions than the maximum allowed by the
625+
# target platform, we must restrain the number of candidates
626+
max_par_dims = platform.limits()['max-par-dims']
627+
dims = candidates[-max_par_dims:]
628+
629+
# All other dimensions must be sequentialized because the output Array
630+
# is constrained to `dims`
631+
sequentialize = candidates[:-max_par_dims]
632+
611633
processed = []
634+
properties = cluster.properties
612635
for e in cluster.exprs:
613636
if e.is_Reduction:
614-
# Transform `e` into what is in essence an explicit map-reduce
615-
# For example, turn:
616-
# `s += f(u[x], v[x], ...)`
617-
# into
618-
# `r[x] = f(u[x], v[x], ...)`
619-
# `s += r[x]`
620-
# This makes it much easier to parallelize the map part regardless
621-
# of the target backend
622637
lhs, rhs = e.args
623638

624639
try:
@@ -650,10 +665,13 @@ def _normalize_reductions_dense(cluster, sregistry, mapper):
650665

651666
processed.extend([Eq(a.indexify(), rhs),
652667
e.func(lhs, a.indexify())])
668+
669+
for d in sequentialize:
670+
properties = properties.sequentialize(d)
653671
else:
654672
processed.append(e)
655673

656-
return cluster.rebuild(processed)
674+
return cluster.rebuild(exprs=processed, properties=properties)
657675

658676

659677
@cluster_pass(mode='sparse')

devito/passes/clusters/blocking.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,14 @@ def _has_atomic_blockable_dim(self, cluster, d):
164164
return any(cluster.properties.is_parallel_atomic(i)
165165
for i in set(cluster.ispace.itdims) - {d})
166166

167-
def _has_enough_large_blockable_dims(self, cluster, d):
168-
return len([i for i in set(cluster.ispace.itdims) - {d}
167+
def _has_enough_large_blockable_dims(self, cluster, d, nested=False):
168+
if nested:
169+
_, ispace = cluster.ispace.split(d)
170+
dims = set(ispace.itdims)
171+
else:
172+
ispace = cluster.ispace
173+
dims = set(cluster.ispace.itdims) - {d}
174+
return len([i for i in dims
169175
if (cluster.properties.is_parallel_relaxed(i) and
170176
not self._has_short_trip_count(i))]) >= 3
171177

@@ -191,6 +197,11 @@ def callback(self, clusters, prefix):
191197
# to have more than three large blockable Dimensions
192198
return clusters
193199

200+
if self._has_enough_large_blockable_dims(c, d, nested=True):
201+
# Virtually all device programming models forbid parallelism
202+
# along more than three dimensions
203+
return clusters
204+
194205
if any(self._has_short_trip_count(i) for i in c.ispace.itdims):
195206
properties = c.properties.block(d, 'small')
196207
elif self._has_data_reuse(c):

devito/types/dimension.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1327,6 +1327,8 @@ def _arg_values(self, interval, grid=None, args=None, **kwargs):
13271327
# no value supplied -> the sub-block will span the entire block
13281328
return {name: args[self.parent.step.name]}
13291329
else:
1330+
# TODO: Check the args for space order and apply heuristics (e.g.,
1331+
# `2*space_order`?) for even better block sizes
13301332
value = self._arg_defaults()[name]
13311333
if value <= args[self.root.max_name] - args[self.root.min_name] + 1:
13321334
return {name: value}

tests/test_gpu_common.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@
88
from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension,
99
Dimension, MatrixSparseTimeFunction, SparseTimeFunction,
1010
SubDimension, SubDomain, SubDomainSet, TimeFunction,
11-
Operator, configuration, switchconfig, TensorTimeFunction)
11+
Operator, configuration, switchconfig, TensorTimeFunction,
12+
Buffer)
1213
from devito.arch import get_gpu_info
1314
from devito.exceptions import InvalidArgument
1415
from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols,
1516
retrieve_iteration_tree)
1617
from devito.passes.iet.languages.openmp import OmpIteration
17-
from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray
18+
from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray, Symbol
1819

1920
from conftest import skipif
2021

@@ -147,6 +148,24 @@ def test_incr_perfect_outer(self):
147148
op()
148149
assert np.all(w.data == 10)
149150

151+
def test_reduction_many_dims(self):
152+
grid = Grid(shape=(25, 25, 25))
153+
154+
u = TimeFunction(name='u', grid=grid, time_order=1, save=Buffer(1))
155+
s = Symbol(name='s', dtype=np.float32)
156+
157+
eqns = [Eq(s, 0),
158+
Inc(s, 2*u + 1)]
159+
160+
op0 = Operator(eqns)
161+
op1 = Operator(eqns, opt=('advanced', {'mapify-reduce': True}))
162+
163+
tree, = retrieve_iteration_tree(op0)
164+
assert 'collapse(4) reduction(+:s)' in str(tree.root.pragmas[0])
165+
166+
tree, = retrieve_iteration_tree(op1)
167+
assert 'collapse(3) reduction(+:s)' in str(tree[1].pragmas[0])
168+
150169

151170
class Bundle(SubDomain):
152171
"""

0 commit comments

Comments
 (0)