Merge pull request #2215 from devitocodes/moar-omp-off
arch: Intel PVC mapping
mloubout authored Nov 1, 2023
2 parents 8e33092 + 0a3d808 commit 465e72b
Showing 13 changed files with 230 additions and 130 deletions.
26 changes: 19 additions & 7 deletions devito/arch/archinfo.py
@@ -18,15 +18,17 @@
'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path',
'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power',
'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice',
# Intel
# Intel CPUs
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210',
'SKX', 'KLX', 'CLX', 'CLK', 'SPR',
# ARM
# ARM CPUs
'AMD', 'ARM', 'M1', 'GRAVITON',
# Other loosely supported CPU architectures
# Other legacy CPUs
'POWER8', 'POWER9',
# GPUs
'AMDGPUX', 'NVIDIAX', 'INTELGPUX']
# Generic GPUs
'AMDGPUX', 'NVIDIAX', 'INTELGPUX',
# Intel GPUs
'PVC']


@memoized_func
@@ -638,13 +640,20 @@ def _detect_isa(self):

class Device(Platform):

def __init__(self, name, cores_logical=1, cores_physical=1, isa='cpp'):
def __init__(self, name, cores_logical=1, cores_physical=1, isa='cpp',
max_threads_per_block=1024, max_threads_dimx=1024,
max_threads_dimy=1024, max_threads_dimz=64):
super().__init__(name)

self.cores_logical = cores_logical
self.cores_physical = cores_physical
self.isa = isa

self.max_threads_per_block = max_threads_per_block
self.max_threads_dimx = max_threads_dimx
self.max_threads_dimy = max_threads_dimy
self.max_threads_dimz = max_threads_dimz

@classmethod
def _mro(cls):
# Retain only the Device Platforms
@@ -760,6 +769,8 @@ def march(cls):
AMDGPUX = AmdDevice('amdgpuX')
INTELGPUX = IntelDevice('intelgpuX')

PVC = IntelDevice('pvc', max_threads_per_block=4096)


platform_registry = {
'cpu64-dummy': CPU64_DUMMY,
@@ -783,7 +794,8 @@ def march(cls):
'power9': POWER9,
'nvidiaX': NVIDIAX, # Generic NVidia GPU
'amdgpuX': AMDGPUX, # Generic AMD GPU
'intelgpuX': INTELGPUX # Generic Intel GPU
'intelgpuX': INTELGPUX, # Generic Intel GPU
'pvc': PVC # Intel Ponte Vecchio GPU
}
"""
Registry dict for deriving Platform classes according to the environment variable
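The archinfo.py hunks add GPU thread-geometry attributes to `Device` and register the Ponte Vecchio platform under the key `'pvc'`. A minimal sketch of how the new platform object can be inspected; the attribute values and the registry key come from the diff, while selecting a platform via the `DEVITO_PLATFORM` environment variable is an assumption about Devito's existing configuration machinery:

```python
# Hedged sketch: inspecting the PVC platform object introduced in this commit.
from devito.arch.archinfo import PVC, platform_registry

assert platform_registry['pvc'] is PVC
print(PVC.max_threads_per_block)   # 4096 (overrides the Device default of 1024)
print(PVC.max_threads_dimz)        # 64 (inherited Device default)

# Assumed runtime selection (not part of this diff):
#   DEVITO_PLATFORM=pvc python my_script.py
```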
10 changes: 5 additions & 5 deletions devito/arch/compiler.py
@@ -13,7 +13,7 @@
from codepy.toolchain import GCCToolchain, call_capture_output as _call_capture_output

from devito.arch import (AMDGPUX, Cpu64, M1, NVIDIAX, POWER8, POWER9, GRAVITON,
INTELGPUX, get_nvidia_cc, check_cuda_runtime,
INTELGPUX, PVC, get_nvidia_cc, check_cuda_runtime,
get_m1_llvm_path)
from devito.exceptions import CompilationError
from devito.logger import debug, warning, error
@@ -804,11 +804,11 @@ def __init__(self, *args, **kwargs):

if platform is NVIDIAX:
self.cflags.append('-fopenmp-targets=nvptx64-cuda')
if platform is INTELGPUX:
self.cflags.append('-fopenmp-targets=spir64')
self.cflags.append('-fopenmp-target-simd')
if platform in [INTELGPUX, PVC]:
self.ldflags.append('-fiopenmp')
self.ldflags.append('-fopenmp-targets=spir64')
self.ldflags.append('-fopenmp-target-simd')

if platform is INTELGPUX:
self.cflags.remove('-g') # -g disables some optimizations in IGC
self.cflags.append('-gline-tables-only')
self.cflags.append('-fdebug-info-for-profiling')
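The compiler.py change shares the SPIR-V offloading flags between the generic Intel GPU target and PVC, moves them to the link line, and keeps the IGC-specific debug-flag tweaks INTELGPUX-only. A standalone sketch of the resulting flag selection, with platforms reduced to plain strings (in Devito they are Platform singletons) and assuming the base flag set already contains `-g`, as implied by the `remove('-g')` call:

```python
# Hedged sketch reproducing the Intel-GPU part of the flag logic in this diff.
def intel_offload_flags(platform, cflags, ldflags):
    if platform in ('intelgpuX', 'pvc'):
        # OpenMP offloading to SPIR-V is requested at link time for both targets
        ldflags += ['-fiopenmp', '-fopenmp-targets=spir64', '-fopenmp-target-simd']
    if platform == 'intelgpuX':
        # -g disables some optimizations in IGC; keep lightweight debug info only
        if '-g' in cflags:
            cflags.remove('-g')
        cflags += ['-gline-tables-only', '-fdebug-info-for-profiling']
    return cflags, ldflags

print(intel_offload_flags('pvc', ['-O3', '-g'], []))
# (['-O3', '-g'], ['-fiopenmp', '-fopenmp-targets=spir64', '-fopenmp-target-simd'])
```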
2 changes: 1 addition & 1 deletion devito/core/gpu.py
@@ -65,7 +65,7 @@ def _normalize_kwargs(cls, **kwargs):
o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE)

# GPU parallelism
o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4))
o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4))
o['par-collapse-ncores'] = 1 # Always collapse (meaningful if `par-tile=False`)
o['par-collapse-work'] = 1 # Always collapse (meaningful if `par-tile=False`)
o['par-chunk-nonaffine'] = oo.pop('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE)
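The gpu.py hunk grows the default GPU block shape to three dimensions, so `par-tile=True` now expands to `(32, 4, 4)` instead of `(32, 4)`. A hedged usage sketch: `Grid`, `TimeFunction`, `Eq`, `Operator` and the `opt=(...)` syntax are standard Devito API, but treating `par-tile: True` as "use the default tile" is read off the diff's `ParTile(oo.pop('par-tile', False), default=(32, 4, 4))`, and the snippet assumes a device backend is configured (e.g. `DEVITO_PLATFORM=pvc`, `DEVITO_LANGUAGE=openmp`); a plain CPU setup may reject the option:

```python
# Hedged sketch: how the new (32, 4, 4) default tile would typically be picked up.
from devito import Grid, TimeFunction, Eq, Operator

grid = Grid(shape=(128, 128, 128))
u = TimeFunction(name='u', grid=grid)

# par-tile=True -> ParTile(True, default=(32, 4, 4)) -> tile the inner loop nest
op = Operator(Eq(u.forward, u + 1),
              opt=('advanced', {'par-tile': True}))
```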
12 changes: 7 additions & 5 deletions devito/core/operator.py
@@ -6,7 +6,7 @@
from devito.mpi.routines import mpi_registry
from devito.parameters import configuration
from devito.operator import Operator
from devito.tools import as_tuple, is_integer, timed_pass
from devito.tools import as_tuple, is_integer, timed_pass, UnboundTuple
from devito.types import NThreads

__all__ = ['CoreOperator', 'CustomOperator',
@@ -327,10 +327,12 @@ class OptOption(object):
pass


class ParTileArg(tuple):
class ParTileArg(UnboundTuple):

def __new__(cls, items, rule=None, tag=None):
obj = super().__new__(cls, items)
if items is None:
items = tuple()
obj = super().__new__(cls, *items)
obj.rule = rule
obj.tag = tag
return obj
@@ -340,7 +342,7 @@ class ParTile(tuple, OptOption):

def __new__(cls, items, default=None):
if not items:
return None
return tuple()
elif isinstance(items, bool):
if not default:
raise ValueError("Expected `default` value, got None")
@@ -353,7 +355,7 @@ def __new__(cls, items, default=None):

x = items[0]
if is_integer(x):
# E.g., (32, 4, 8)
# E.g., 32
items = (ParTileArg(items),)

elif x is None:
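In operator.py, `ParTileArg` now subclasses `UnboundTuple`, which is what lets the OpenACC lowering below read `tile[i]` for every collapsed dimension without padding or truncating the user-supplied tile. The assumed semantics is that indexing past the last stored item keeps returning that last item; the toy class below is an independent illustration of that behaviour, not Devito's `devito.tools.UnboundTuple`:

```python
# Hedged toy model of the assumed UnboundTuple semantics: integer indexing
# beyond the stored items saturates at the last element.
class ToyUnboundTuple(tuple):
    def __new__(cls, *items):
        return super().__new__(cls, items)

    def __getitem__(self, i):
        if isinstance(i, int) and i >= len(self):
            i = len(self) - 1          # saturate at the last entry
        return super().__getitem__(i)

t = ToyUnboundTuple(32, 4, 4)
print(t[0], t[1], t[2], t[5])   # 32 4 4 4 -> extra collapsed dims reuse "4"
```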
14 changes: 5 additions & 9 deletions devito/passes/iet/languages/openacc.py
@@ -13,7 +13,7 @@
from devito.passes.iet.languages.C import CBB
from devito.passes.iet.languages.openmp import OmpRegion, OmpIteration
from devito.symbolics import FieldFromPointer, Macro, cast_mapper
from devito.tools import filter_ordered
from devito.tools import filter_ordered, UnboundTuple
from devito.types import DeviceMap, Symbol

__all__ = ['DeviceAccizer', 'DeviceAccDataManager', 'AccOrchestrator']
@@ -30,7 +30,8 @@ def _make_clauses(cls, ncollapsed=0, reduction=None, tile=None, **kwargs):
clauses = []

if tile:
clauses.append('tile(%s)' % ','.join(str(i) for i in tile))
stile = [str(tile[i]) for i in range(ncollapsed)]
clauses.append('tile(%s)' % ','.join(stile))
elif ncollapsed > 1:
clauses.append('collapse(%d)' % ncollapsed)

@@ -159,18 +160,13 @@ def _make_partree(self, candidates, nthreads=None):
assert candidates

root, collapsable = self._select_candidates(candidates)
ncollapsable = len(collapsable)
ncollapsable = len(collapsable) + 1

if self._is_offloadable(root) and \
all(i.is_Affine for i in [root] + collapsable) and \
self.par_tile:
tile = self.par_tile.next()
assert isinstance(tile, tuple)
nremainder = (ncollapsable + 1) - len(tile)
if nremainder >= 0:
tile += (tile[-1],)*nremainder
else:
tile = tile[:ncollapsable + 1]
assert isinstance(tile, UnboundTuple)

body = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile,
ncollapsed=ncollapsable, **root.args)
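With the tile now carried as an `UnboundTuple`, the OpenACC clause builder no longer pads or trims the tile by hand; it simply reads one entry per collapsed loop. The sketch below is self-contained, so the saturating read that Devito gets for free from `UnboundTuple` is folded into the helper itself; the trailing pragma shown in the comment is an assumed rendering, not text taken from this diff:

```python
# Hedged sketch of the new tile-clause construction: one entry per collapsed
# loop, with the last tile size reused for any extra collapsed dimensions.
def make_tile_clause(tile, ncollapsed):
    stile = [str(tile[min(i, len(tile) - 1)]) for i in range(ncollapsed)]
    return 'tile(%s)' % ','.join(stile)

print(make_tile_clause((32, 4, 4), 3))   # tile(32,4,4)
print(make_tile_clause((32, 4, 4), 4))   # tile(32,4,4,4) -- no manual padding
# Assumed placement on the generated loop: #pragma acc ... loop tile(32,4,4)
```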
11 changes: 2 additions & 9 deletions devito/passes/iet/languages/openmp.py
@@ -3,7 +3,7 @@
import cgen as c
from sympy import And, Ne, Not

from devito.arch import AMDGPUX, NVIDIAX, INTELGPUX
from devito.arch import AMDGPUX, NVIDIAX, INTELGPUX, PVC
from devito.arch.compiler import GNUCompiler
from devito.ir import (Call, Conditional, DeviceCall, List, Prodder,
ParallelBlock, PointerCast, While, FindSymbols)
@@ -78,14 +78,6 @@ def _make_clauses(cls, **kwargs):

return clauses

@classmethod
def _process_kwargs(cls, **kwargs):
kwargs = super()._process_kwargs(**kwargs)

kwargs.pop('gpu_fit', None)

return kwargs


class ThreadedProdder(Conditional, Prodder):

@@ -117,6 +109,7 @@ class OmpBB(LangBB):
AMDGPUX: None,
NVIDIAX: None,
INTELGPUX: None,
PVC: None,
# Runtime library
'init': None,
'thread-num': lambda retobj=None:
41 changes: 27 additions & 14 deletions devito/passes/iet/parpragma.py
@@ -15,8 +15,7 @@
from devito.passes.iet.langbase import (LangBB, LangTransformer, DeviceAwareMixin,
make_sections_from_imask)
from devito.symbolics import INT, ccode
from devito.tools import as_tuple, flatten, is_integer, prod
from devito.tools.data_structures import UnboundTuple
from devito.tools import UnboundTuple, as_tuple, flatten, is_integer, prod
from devito.types import Symbol

__all__ = ['PragmaSimdTransformer', 'PragmaShmTransformer',
@@ -47,8 +46,21 @@ def _support_array_reduction(cls, compiler):
def simd_reg_size(self):
return self.platform.simd_reg_size

@iet_pass
def make_simd(self, iet):
def _make_simd_pragma(self, iet):
indexeds = FindSymbols('indexeds').visit(iet)
aligned = {i.name for i in indexeds if i.function.is_DiscreteFunction}
if aligned:
simd = self.lang['simd-for-aligned']
simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_size))
else:
simd = as_tuple(self.lang['simd-for'])

return simd

def _make_simd(self, iet):
"""
Carry out the bulk of `make_simd`.
"""
mapper = {}
for tree in retrieve_iteration_tree(iet):
candidates = [i for i in tree if i.is_ParallelRelaxed]
@@ -103,13 +115,7 @@ def make_simd(self, iet):
continue

# Add SIMD pragma
indexeds = FindSymbols('indexeds').visit(candidate)
aligned = {i.name for i in indexeds if i.function.is_DiscreteFunction}
if aligned:
simd = self.lang['simd-for-aligned']
simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_size))
else:
simd = as_tuple(self.lang['simd-for'])
simd = self._make_simd_pragma(candidate)
pragmas = candidate.pragmas + simd

# Add VECTORIZED property
@@ -121,14 +127,20 @@

return iet, {}

@iet_pass
def make_simd(self, iet):
return self._make_simd(iet)


class PragmaIteration(ParallelIteration):

def __init__(self, *args, parallel=None, schedule=None, chunk_size=None,
nthreads=None, ncollapsed=None, reduction=None, tile=None,
gpu_fit=None, **kwargs):

construct = self._make_construct(parallel=parallel)
construct = self._make_construct(
parallel=parallel, ncollapsed=ncollapsed, tile=tile
)
clauses = self._make_clauses(
ncollapsed=ncollapsed, chunk_size=chunk_size, nthreads=nthreads,
reduction=reduction, schedule=schedule, tile=tile, gpu_fit=gpu_fit,
@@ -610,7 +622,7 @@ def __init__(self, sregistry, options, platform, compiler):
super().__init__(sregistry, options, platform, compiler)

self.gpu_fit = options['gpu-fit']
self.par_tile = UnboundTuple(options['par-tile'])
self.par_tile = UnboundTuple(*options['par-tile'])
self.par_disabled = options['par-disabled']

def _score_candidate(self, n0, root, collapsable=()):
@@ -645,7 +657,8 @@ def _make_partree(self, candidates, nthreads=None, index=None):

if self._is_offloadable(root):
body = self.DeviceIteration(gpu_fit=self.gpu_fit,
ncollapsed=len(collapsable) + 1,
ncollapsed=len(collapsable)+1,
tile=self.par_tile.next(),
**root.args)
partree = ParallelTree([], body, nthreads=nthreads)

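In parpragma.py, `make_simd` is split so the pragma construction lives in the reusable `_make_simd_pragma` helper: accesses to DiscreteFunctions yield an aligned SIMD directive keyed by `simd-for-aligned`, everything else falls back to the plain `simd-for` entry. The sketch below assumes the OpenMP backend maps these keys to `#pragma omp simd aligned(<names>:<bytes>)` and `#pragma omp simd` respectively; that rendering is an assumption for illustration, not taken from this diff:

```python
# Hedged sketch of what _make_simd_pragma assembles, under the assumed
# OpenMP rendering of the 'simd-for-aligned' / 'simd-for' language entries.
def make_simd_pragma(aligned_names, simd_reg_size):
    if aligned_names:
        return ('#pragma omp simd aligned(%s:%d)'
                % (','.join(sorted(aligned_names)), simd_reg_size))
    return '#pragma omp simd'

print(make_simd_pragma({'u', 'v'}, 64))   # #pragma omp simd aligned(u,v:64)
print(make_simd_pragma(set(), 64))        # #pragma omp simd
```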