diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index db1b329ed3..0d35fa5aeb 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -19,6 +19,8 @@ 'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path', 'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd', 'Arm', 'Power', 'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice', + # Brand-agnostic + 'ANYCPU', 'ANYGPU', # Intel CPUs 'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'KNL', 'KNL7210', 'SKX', 'KLX', 'CLX', 'CLK', 'SPR', @@ -592,7 +594,7 @@ def get_platform(): pass # Unable to detect platform. Stick to default... - return CPU64 + return ANYCPU class Platform: @@ -893,7 +895,7 @@ def march(cls): # CPUs -CPU64 = Cpu64('cpu64') +ANYCPU = Cpu64('cpu64') CPU64_DUMMY = Intel64('cpu64-dummy', cores_logical=2, cores_physical=1, isa='sse') INTEL64 = Intel64('intel64') @@ -921,6 +923,8 @@ def march(cls): POWER9 = Power('power9') # Devices +ANYGPU = Cpu64('gpu') + NVIDIAX = NvidiaDevice('nvidiaX') AMDGPUX = AmdDevice('amdgpuX') diff --git a/devito/core/operator.py b/devito/core/operator.py index 96cb85bbde..fa0baa11f2 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -1,14 +1,16 @@ from collections.abc import Iterable +from functools import cached_property from devito.core.autotuning import autotune from devito.exceptions import InvalidArgument, InvalidOperator +from devito.ir import FindSymbols from devito.logger import warning from devito.mpi.routines import mpi_registry from devito.parameters import configuration from devito.operator import Operator from devito.tools import (as_tuple, is_integer, timed_pass, UnboundTuple, UnboundedMultiTuple) -from devito.types import NThreads +from devito.types import NThreads, PThreadArray __all__ = ['CoreOperator', 'CustomOperator', # Optimization options @@ -190,7 +192,7 @@ def _autotune(self, args, setup): return args - @property + @cached_property def nthreads(self): nthreads = [i for i in self.input if isinstance(i, 
NThreads)] if len(nthreads) == 0: @@ -199,6 +201,12 @@ def nthreads(self): assert len(nthreads) == 1 return nthreads.pop() + @cached_property + def npthreads(self): + symbols = FindSymbols().visit(self.body) + ptas = [i for i in symbols if isinstance(i, PThreadArray)] + return sum(i.size for i in ptas) + class CoreOperator(BasicOperator): pass diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 774439ebcf..1d2770a657 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -433,7 +433,7 @@ def callback(self, clusters, prefix, seen=None): key = lambda i: i in prefix[:-1] or i in hs.loc_indices ispace = c.ispace.project(key) - # HaloTOuch are not parallel + # HaloTouches are not parallel properties = c.properties.sequentialize() halo_touch = c.rebuild(exprs=expr, ispace=ispace, properties=properties) @@ -614,6 +614,11 @@ def _normalize_reductions_dense(cluster, sregistry, mapper): # of the target backend lhs, rhs = e.args + try: + f = rhs.function + except AttributeError: + f = None + if lhs.function.is_Array: # Probably a compiler-generated reduction, e.g. 
via # recursive compilation; it's an Array already, so nothing to do @@ -621,11 +626,13 @@ def _normalize_reductions_dense(cluster, sregistry, mapper): elif rhs in mapper: # Seen this RHS already, so reuse the Array that was created for it processed.append(e.func(lhs, mapper[rhs].indexify())) + elif f and f.is_Array and sum(flatten(f._size_nodomain)) == 0: + # Special case: the RHS is an Array with no halo/padding, meaning + # that the written data values are contiguous in memory, hence + # we can simply reuse the Array itself as we're already in the + # desired memory layout + processed.append(e) else: - # Here the LHS could be a Symbol or a user-level Function - # In the latter case we copy the data into a temporary Array - # because the Function might be padded, and reduction operations - # require, in general, the data values to be contiguous name = sregistry.make_name() try: grid = cluster.grid diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index a10be2b3ce..a85b93460d 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -126,9 +126,8 @@ def stree_build(clusters, profiler=None, **kwargs): if i.is_Halo: found = i elif i.is_Sync: - if profiler._verbosity > 0 or not i.is_async: - attach_section(i) - section = None + attach_section(i) + section = None break elif i.is_Iteration: if (i.dim.is_Time and SEQUENTIAL in i.properties): diff --git a/devito/mpi/halo_scheme.py b/devito/mpi/halo_scheme.py index 79ee1755a6..0062b5b32f 100644 --- a/devito/mpi/halo_scheme.py +++ b/devito/mpi/halo_scheme.py @@ -622,7 +622,14 @@ def _uxreplace_dispatch_haloscheme(hs0, rule): # They indeed do change d1 = g.indices[d0] loc_indices[d1] = v.indices[d0] - loc_dirs[d1] = hse0.loc_dirs[d0] + + try: + loc_dirs[d1] = hse0.loc_dirs[d0] + except KeyError: + # E.g., `usave(cd, x, y)` and `usave.dx` in an + # adjoint Operator + assert d0.is_Conditional + loc_dirs[d1] = hse0.loc_dirs[d0.root] if len(loc_indices) != len(hse0.loc_indices): 
# Nope, let's try with the next Indexed, if any diff --git a/devito/operator/operator.py b/devito/operator/operator.py index f8749969cd..297ac544ee 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -1,12 +1,14 @@ from collections import OrderedDict, namedtuple from functools import cached_property import ctypes +import shutil from operator import attrgetter from math import ceil +from tempfile import gettempdir from sympy import sympify -from devito.arch import compiler_registry, platform_registry +from devito.arch import ANYCPU, Device, compiler_registry, platform_registry from devito.data import default_allocator from devito.exceptions import InvalidOperator, ExecutionError from devito.logger import debug, info, perf, warning, is_log_enabled_for, switch_log_level @@ -16,18 +18,19 @@ derive_parameters, iet_build) from devito.ir.support import AccessMode, SymbolRegistry from devito.ir.stree import stree_build -from devito.operator.profiling import AdvancedProfilerVerbose, create_profile +from devito.operator.profiling import create_profile from devito.operator.registry import operator_selector from devito.mpi import MPI from devito.parameters import configuration from devito.passes import (Graph, lower_index_derivatives, generate_implicit, generate_macros, minimize_symbols, unevaluate, - error_mapper) -from devito.symbolics import estimate_cost -from devito.tools import (DAG, OrderedSet, Signer, ReducerMap, as_tuple, flatten, - filter_sorted, frozendict, is_integer, split, timed_pass, - timed_region, contains_val) -from devito.types import Grid, Evaluable + error_mapper, is_on_device) +from devito.symbolics import estimate_cost, subs_op_args +from devito.tools import (DAG, OrderedSet, Signer, ReducerMap, as_mapper, as_tuple, + flatten, filter_sorted, frozendict, is_integer, + split, timed_pass, timed_region, contains_val) +from devito.types import (Buffer, Grid, Evaluable, host_layer, device_layer, + disk_layer) __all__ = ['Operator'] @@ 
-224,10 +227,11 @@ def _build(cls, expressions, **kwargs): op._func_table = OrderedDict() op._func_table.update(OrderedDict([(i, MetaCall(None, False)) for i in profiler._ext_calls])) - op._func_table.update(OrderedDict([(i.root.name, i) for i in byproduct.funcs])) + op._func_table.update(OrderedDict([(i.root.name, i) + for i in byproduct.funcs])) - # Internal mutable state to store information about previous runs, autotuning - # reports, etc + # Internal mutable state to store information about previous runs, + # autotuning reports, etc op._state = cls._initialize_state(**kwargs) # Produced by the various compilation passes @@ -514,6 +518,10 @@ def temporaries(self): def objects(self): return tuple(i for i in self.parameters if i.is_Object) + @cached_property + def threads_info(self): + return frozendict({'nthreads': self.nthreads, 'npthreads': self.npthreads}) + # Arguments processing @cached_property @@ -548,10 +556,13 @@ def _prepare_arguments(self, autotune=None, **kwargs): if set(d._arg_names).intersection(kwargs): futures.update(d._arg_values(self._dspace[d], args={}, **kwargs)) + # Prepare to process data-carriers + args = kwargs['args'] = ReducerMap() + kwargs['metadata'] = self.threads_info + overrides, defaults = split(self.input, lambda p: p.name in kwargs) # Process data-carrier overrides - args = kwargs['args'] = ReducerMap() for p in overrides: args.update(p._arg_values(**kwargs)) try: @@ -573,9 +584,9 @@ def _prepare_arguments(self, autotune=None, **kwargs): pass elif k in kwargs: # User is in control - # E.g., given a ConditionalDimension `t_sub` with factor `fact` and - # a TimeFunction `usave(t_sub, x, y)`, an override for `fact` is - # supplied w/o overriding `usave`; that's legal + # E.g., given a ConditionalDimension `t_sub` with factor `fact` + # and a TimeFunction `usave(t_sub, x, y)`, an override for + # `fact` is supplied w/o overriding `usave`; that's legal pass elif is_integer(args[k]) and not contains_val(args[k], v): raise 
ValueError("Default `%s` is incompatible with other args as " @@ -623,10 +634,13 @@ def _prepare_arguments(self, autotune=None, **kwargs): for o in self.objects: args.update(o._arg_values(grid=grid, **kwargs)) + # Purge `kwargs` + kwargs.pop('args') + kwargs.pop('metadata') + # In some "lower-level" Operators implementing a random piece of C, such as # one or more calls to third-party library functions, there could still be # at this point unprocessed arguments (e.g., scalars) - kwargs.pop('args') args.update({k: v for k, v in kwargs.items() if k not in args}) # Sanity check @@ -923,8 +937,10 @@ def _emit_apply_profiling(self, args): return summary if summary.globals: - # Note that with MPI enabled, the global performance indicators + # NOTE: with MPI enabled, the global performance indicators # represent "cross-rank" performance data + + # Print out global performance indicators metrics = [] v = summary.globals.get('vanilla') @@ -939,21 +955,23 @@ def _emit_apply_profiling(self, args): if metrics: perf("Global performance: [%s]" % ', '.join(metrics)) + # Same as above, but excluding the setup phase, e.g. the CPU-GPU + # data transfers in the case of a GPU run, mallocs, frees, etc. + metrics = [] + + v = summary.globals.get('fdlike-nosetup') + if v is not None: + metrics.append("%.2f s" % fround(v.time)) + metrics.append("%.2f GPts/s" % fround(v.gpointss)) + + perf("Global performance : [%s]" % ', '.join(metrics)) + + # Prepare for the local performance indicators perf("Local performance:") indent = " "*2 else: indent = "" - if isinstance(self._profiler, AdvancedProfilerVerbose): - metrics = [] - - v = summary.globals.get('fdlike-nosetup') - if v is not None: - metrics.append("%.2f GPts/s" % fround(v.gpointss)) - - if metrics: - perf("Global performance : [%s]" % ', '.join(metrics)) - # Emit local, i.e. "per-rank" performance. 
Without MPI, this is the only # thing that will be emitted def lower_perfentry(v): @@ -972,24 +990,14 @@ def lower_perfentry(v): for k, v in summary.items(): rank = "[rank%d]" % k.rank if k.rank is not None else "" + name = "%s%s" % (k.name, rank) if v.time <= 0.01: # Trim down the output for very fast sections - name = "%s%s<>" % (k.name, rank) perf("%s* %s ran in %.2f s" % (indent, name, fround(v.time))) continue metrics = lower_perfentry(v) - - itershapes = [",".join(str(i) for i in its) for its in v.itershapes] - if len(itershapes) > 1: - itershapes = ",".join("<%s>" % i for i in itershapes) - elif len(itershapes) == 1: - itershapes = itershapes[0] - else: - itershapes = "" - name = "%s%s<%s>" % (k.name, rank, itershapes) - perf("%s* %s ran in %.2f s %s" % (indent, name, fround(v.time), metrics)) for n, v1 in summary.subsections.get(k.name, {}).items(): metrics = lower_perfentry(v1) @@ -1011,6 +1019,9 @@ def lower_perfentry(v): if a in args: perf_args[a] = args[a] break + if is_integer(self.npthreads): + perf_args['pthreads'] = self.npthreads + perf_args = {k: perf_args[k] for k in sorted(perf_args)} perf("Performance[mode=%s] arguments: %s" % (self._mode, perf_args)) return summary @@ -1113,12 +1124,7 @@ def __init__(self, args, grid, op): super().__init__(args) self.grid = grid - - self.allocator = op._allocator - self.platform = op._platform - self.language = op._language - self.compiler = op._compiler - self.options = op._options + self.op = op @property def comm(self): @@ -1134,6 +1140,110 @@ def opkwargs(self): 'compiler': compiler, 'language': self.language} + @property + def allocator(self): + return self.op._allocator + + @property + def platform(self): + return self.op._platform + + @property + def language(self): + return self.op._language + + @property + def compiler(self): + return self.op._compiler + + @property + def options(self): + return self.op._options + + @property + def saved_mapper(self): + """ + The saved TimeFunctions in the 
Operator, grouped by + memory hierarchy layer. + """ + key0 = lambda f: (f.is_TimeFunction and + f.save is not None and + not isinstance(f.save, Buffer)) + functions = [f for f in self.op.input if key0(f)] + + key1 = lambda f: f.layer + mapper = as_mapper(functions, key1) + + return mapper + + @cached_property + def nbytes_avail_mapper(self): + """ + The amount of memory available after accounting for the memory + consumed by the Operator, in bytes, grouped by memory hierarchy layer. + """ + mapper = {} + + # The amount of space available on the disk + usage = shutil.disk_usage(gettempdir()) + mapper[disk_layer] = usage.free + + # The amount of space available on the device + if isinstance(self.platform, Device): + deviceid = max(self.get('deviceid', 0), 0) + mapper[device_layer] = self.platform.memavail(deviceid=deviceid) + + # The amount of space available on the host + try: + nproc = self.grid.distributor.nprocs_local + except AttributeError: + nproc = 1 + mapper[host_layer] = int(ANYCPU.memavail() / nproc) + + # Temporaries such as Arrays are allocated and deallocated on-the-fly + # while in C land, so they need to be accounted for as well + for i in FindSymbols().visit(self.op): + if not i.is_Array or not i._mem_heap or i.alias: + continue + + if i.is_regular: + nbytes = i.nbytes + else: + nbytes = i.nbytes_max + v = subs_op_args(nbytes, self) + if not is_integer(v): + # E.g. 
the Arrays used to store the MPI halo exchanges + continue + + if i._mem_host: + mapper[host_layer] -= v + elif i._mem_local: + if isinstance(self.platform, Device): + mapper[device_layer] -= v + else: + mapper[host_layer] -= v + elif i._mem_mapped: + if isinstance(self.platform, Device): + mapper[device_layer] -= v + mapper[host_layer] -= v + + # All input Functions are yet to be memcpy-ed to the device + # TODO: this may not be true depending on `devicerm`, which is however + # virtually never used + if isinstance(self.platform, Device): + for i in self.op.input: + if not is_on_device(i, self.options['gpu-fit']): + continue + try: + if i._mem_mapped: + mapper[device_layer] -= i.nbytes + except AttributeError: + pass + + mapper = {k: int(v) for k, v in mapper.items()} + + return mapper + def parse_kwargs(**kwargs): """ diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 55796bbd51..5a6cddb87e 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -201,6 +201,8 @@ def callback(self, clusters, prefix): properties = c.properties.sequentialize(d) if not isinstance(d, BufferDimension): properties = properties.prefetchable(d) + # `c` may be a HaloTouch Cluster, so with no vision of the `bdims` + properties = properties.parallelize(v.bdims).affine(v.bdims) syncs = c.syncs @@ -440,7 +442,10 @@ def lastwrite(self): @property def is_read(self): - return self.firstread is not None + # Wild Clusters, and in particular HaloTouch Clusters, may contain mock + # read accesses to self's buffered Function (`self.f`), which we must + # ignore since here we're determining whether `self.f` is actually read + return any(not c.is_wild for c in self.clusters if c.scope.reads.get(self.f)) @property def is_write(self): @@ -448,11 +453,11 @@ def is_write(self): @property def is_readonly(self): - return self.firstread is not None and self.lastwrite is None + return self.is_read and not self.is_write @property def 
is_writeonly(self): - return self.lastwrite is not None and self.firstread is None + return self.is_write and not self.is_read @property def is_forward_buffering(self): diff --git a/devito/passes/iet/instrument.py b/devito/passes/iet/instrument.py index 214f4c6fa1..7462c8f07c 100644 --- a/devito/passes/iet/instrument.py +++ b/devito/passes/iet/instrument.py @@ -126,7 +126,7 @@ def sync_sections(iet, lang=None, profiler=None, **kwargs): Wrap sections within global barriers if deemed necessary by the profiler. """ try: - sync = lang['device-wait'] + sync = lang['map-wait'] except (KeyError, NotImplementedError): return iet, {} @@ -137,10 +137,11 @@ def sync_sections(iet, lang=None, profiler=None, **kwargs): for tl in FindNodes(TimedList).visit(iet): symbols = FindSymbols().visit(tl) - runs_async = any(isinstance(i, lang.AsyncQueue) for i in symbols) + queues = [i for i in symbols if isinstance(i, lang.AsyncQueue)] unnecessary = any(FindNodes(BusyWait).visit(tl)) - if runs_async and not unnecessary: - mapper[tl] = tl._rebuild(body=tl.body + (sync,)) + if queues and not unnecessary: + waits = tuple(sync(i) for i in queues) + mapper[tl] = tl._rebuild(body=tl.body + waits) iet = Transformer(mapper, nested=True).visit(iet) diff --git a/devito/types/basic.py b/devito/types/basic.py index 394d6276b6..e0fc5c084b 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -19,7 +19,8 @@ from devito.types.lazy import Evaluable from devito.types.utils import DimensionTuple -__all__ = ['Symbol', 'Scalar', 'Indexed', 'IndexedData', 'DeviceMap'] +__all__ = ['Symbol', 'Scalar', 'Indexed', 'IndexedData', 'DeviceMap', + 'IrregularFunctionInterface'] Size = namedtuple('Size', 'left right') @@ -584,7 +585,7 @@ def _arg_values(self, **kwargs): if self.name in kwargs: return {self.name: kwargs.pop(self.name)} else: - return self._arg_defaults() + return self._arg_defaults(**kwargs) class AbstractTensor(sympy.ImmutableDenseMatrix, Basic, Pickable, Evaluable): @@ -1247,6 +1248,19 @@ 
def _mem_mapped(self): def _mem_host(self): return self._space == 'host' + @cached_property + def _signature(self): + """ + The signature of an AbstractFunction is the set of fields that + makes it "compatible" with another AbstractFunction. The fact that + two AbstractFunctions are compatible may be exploited by the compiler + to generate smarter code + """ + ret = [type(self), self.indices] + attrs = set(self.__rkwargs__) - {'name', 'function'} + ret.extend(getattr(self, i) for i in attrs) + return frozenset(ret) + def _make_pointer(self): """Generate a symbolic pointer to self.""" raise NotImplementedError @@ -1609,3 +1623,16 @@ def _subs(self, old, new, **hints): except AttributeError: pass return super()._subs(old, new, **hints) + + +class IrregularFunctionInterface: + + """ + A common interface for all irregular AbstractFunctions. + """ + + is_regular = False + + @property + def nbytes_max(self): + raise NotImplementedError diff --git a/devito/types/dense.py b/devito/types/dense.py index 299ab68dc0..5060b57746 100644 --- a/devito/types/dense.py +++ b/devito/types/dense.py @@ -24,7 +24,7 @@ from devito.types.args import ArgProvider from devito.types.caching import CacheManager from devito.types.basic import AbstractFunction, Size -from devito.types.utils import Buffer, DimensionTuple, NODE, CELL +from devito.types.utils import Buffer, DimensionTuple, NODE, CELL, host_layer __all__ = ['Function', 'TimeFunction', 'SubFunction', 'TempFunction'] @@ -1442,6 +1442,13 @@ def backward(self): return self._subs(_t, _t - i * _t.spacing) + @property + def layer(self): + """ + The memory hierarchy layer in which the TimeFunction is stored. 
+ """ + return host_layer + @property def _time_size(self): return self.shape_allocated[self._time_position] diff --git a/devito/types/parallel.py b/devito/types/parallel.py index fe195e9c06..65ee2fbdbc 100644 --- a/devito/types/parallel.py +++ b/devito/types/parallel.py @@ -25,7 +25,7 @@ 'SharedData', 'NPThreads', 'DeviceRM', 'QueueID', 'Barrier', 'TBArray'] -class NThreadsBase(Scalar): +class NThreadsAbstract(Scalar): is_Input = True is_PerfKnob = True @@ -39,11 +39,28 @@ def __new__(cls, *args, **kwargs): def __dtype_setup__(cls, **kwargs): return np.int32 + +class NThreadsBase(NThreadsAbstract): + @cached_property def default_value(self): return int(os.environ.get('OMP_NUM_THREADS', configuration['platform'].cores_physical)) + def _arg_defaults(self, **kwargs): + base_nthreads = self.default_value + + try: + npthreads = kwargs['metadata']['npthreads'] + except KeyError: + raise InvalidArgument("Cannot determine `npthreads`") + + # If a symbolic object, it must be resolved + if isinstance(npthreads, NPThreads): + npthreads = kwargs.get(npthreads.name, npthreads.size) + + return {self.name: max(base_nthreads - npthreads, 1)} + class NThreads(NThreadsBase): @@ -64,7 +81,7 @@ def default_value(self): return configuration['platform'].threads_per_core -class NPThreads(NThreadsBase): +class NPThreads(NThreadsAbstract): name = 'npthreads' diff --git a/devito/types/utils.py b/devito/types/utils.py index 6762f982ea..59e6137422 100644 --- a/devito/types/utils.py +++ b/devito/types/utils.py @@ -4,7 +4,8 @@ # Additional Function-related APIs __all__ = ['Buffer', 'DimensionTuple', 'NODE', 'CELL', 'IgnoreDimSort', - 'HierarchyLayer', 'HostLayer'] + 'HierarchyLayer', 'HostLayer', 'DeviceLayer', 'DiskLayer', + 'host_layer', 'device_layer', 'disk_layer'] class Buffer(Tag): @@ -72,3 +73,16 @@ def __hash__(self): class HostLayer(HierarchyLayer): pass + + +class DeviceLayer(HierarchyLayer): + pass + + +class DiskLayer(HierarchyLayer): + pass + + +host_layer = HostLayer('host') 
+device_layer = DeviceLayer('device') +disk_layer = DiskLayer('disk') diff --git a/tests/test_dle.py b/tests/test_dle.py index e2af0ed7e3..13f6841c09 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -646,7 +646,7 @@ def test_nthreads_generation(self): assert op0.nthreads in op0.parameters # `nthreads` is bindable to a runtime value - assert op0.nthreads._arg_values() + assert op0.nthreads._arg_values(nthreads=3)['nthreads'] == 3 @pytest.mark.parametrize('exprs,expected', [ # trivial 1D diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index da9b3fdbe7..8f100a1082 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -185,7 +185,7 @@ def test_tasking_in_isolation(self, opt): assert len(retrieve_iteration_tree(op)) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 2 + assert len(sections) == 3 assert str(sections[0].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0);' body = op._func_table['release_lock0'].root.body assert str(body.body[0].condition) == 'Ne(lock0[0], 2)' @@ -260,10 +260,10 @@ def test_tasking_unfused_two_locks(self): assert len(trees) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 5 sections = FindNodes(Section).visit(op) - assert len(sections) == 2 + assert len(sections) == 4 assert (str(sections[1].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0 || lock1[0] == 0);') # Wait-lock - body = trees[-1].root.nodes[-2] + body = sections[2].body[0].body[0] assert str(body.body[0]) == 'release_lock0(lock0);' assert str(body.body[1]) == 'activate0(time,sdata0);' assert len(op._func_table) == 5 @@ -300,7 +300,7 @@ def test_tasking_forcefuse(self): assert len(retrieve_iteration_tree(op)) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 2 sections = FindNodes(Section).visit(op) - assert len(sections) == 2 + assert len(sections) == 3 assert 
(str(sections[1].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0 || lock1[0] == 0);') # Wait-lock body = op._func_table['release_lock0'].root.body @@ -369,7 +369,7 @@ def test_tasking_multi_output(self): assert len(retrieve_iteration_tree(op1)) == 2 assert len([i for i in FindSymbols().visit(op1) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op1) - assert len(sections) == 1 + assert len(sections) == 2 assert str(sections[0].body[0].body[0].body[0].body[0]) ==\ 'while(lock0[t2] == 0);' body = op1._func_table['release_lock0'].root.body @@ -407,7 +407,7 @@ def test_tasking_lock_placement(self): assert len(retrieve_iteration_tree(op)) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 2 + assert len(sections) == 3 assert sections[0].body[0].body[0].body[0].is_Iteration assert str(sections[1].body[0].body[0].body[0].body[0]) ==\ 'while(lock0[t1] == 0);' @@ -889,7 +889,7 @@ def test_tasking_over_compiler_generated(self): assert len(retrieve_iteration_tree(op)) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 3 + assert len(sections) == 4 assert 'while(lock0[t1] == 0)' in str(sections[2].body[0].body[0].body[0]) op0.apply(time_M=nt-1)