diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 38fb0d1f525e7..7ed55e7d60ebe 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -19,27 +19,34 @@ namespace cms::alpakatools {
 
   // Return the integer division of the first argument by the second argument, rounded up to the next integer
   inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
 
+  // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
+  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+  struct requires_single_thread_per_block : public std::true_type {};
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+  template <typename TDim>
+  struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+  template <typename TDim>
+  struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+  // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
+  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+  inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;
+
   // Create an accelerator-dependent work division for 1-dimensional kernels
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) {
-      // On GPU backends, each thread is looking at a single element:
-      //   - the number of threads per block is "elements";
-      //   - the number of elements per thread is always 1.
-      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
-    } else
-#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
-#if ALPAKA_ACC_GPU_HIP_ENABLED
-    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) {
+    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
       // On GPU backends, each thread is looking at a single element:
       //   - the number of threads per block is "elements";
       //   - the number of elements per thread is always 1.
       return WorkDiv<Dim1D>(blocks, elements, Idx{1});
-    } else
-#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
-    {
+    } else {
       // On CPU backends, run serially with a single thread per block:
       //   - the number of threads per block is always 1;
       //   - the number of elements per thread is "elements".
@@ -52,23 +59,12 @@ namespace cms::alpakatools {
   inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                                  const Vec<alpaka::Dim<TAcc>>& elements) {
     using Dim = alpaka::Dim<TAcc>;
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) {
+    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
       // On GPU backends, each thread is looking at a single element:
       //   - the number of threads per block is "elements";
       //   - the number of elements per thread is always 1.
       return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
-    } else
-#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) {
-      // On GPU backends, each thread is looking at a single element:
-      //   - the number of threads per block is "elements";
-      //   - the number of elements per thread is always 1.
-      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
-    } else
-#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
-    {
+    } else {
       // On CPU backends, run serially with a single thread per block:
       //   - the number of threads per block is always 1;
       //   - the number of elements per thread is "elements".
@@ -108,10 +104,12 @@ namespace cms::alpakatools {
 
       // pre-increment the iterator
       ALPAKA_FN_ACC inline iterator& operator++() {
-        // increment the index along the elements processed by the current thread
-        ++index_;
-        if (index_ < last_)
-          return *this;
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // increment the index along the elements processed by the current thread
+          ++index_;
+          if (index_ < last_)
+            return *this;
+        }
 
         // increment the thread index with the grid stride
         first_ += stride_;
@@ -183,76 +181,123 @@ namespace cms::alpakatools {
 
     class iterator {
      friend class elements_with_stride_nd;
 
-      constexpr static const auto last_dimension = Dim::value - 1;
-
-      ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first)
-          : elements_{elements},
-            stride_{stride},
-            extent_{extent},
-            first_{alpaka::elementwise_min(first, extent)},
-            index_{first_},
-            last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {}
-
     public:
       ALPAKA_FN_ACC inline Vec operator*() const { return index_; }
 
       // pre-increment the iterator
-      ALPAKA_FN_ACC inline iterator& operator++() {
-        // increment the index along the elements processed by the current thread
-        ++index_[last_dimension];
-        if (index_[last_dimension] < last_)
-          return *this;
-
-        // increment the thread index along with the last dimension with the grid stride
-        first_[last_dimension] += stride_[last_dimension];
-        index_[last_dimension] = first_[last_dimension];
-        last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
-        if (index_[last_dimension] < extent_[last_dimension])
-          return *this;
-
-        // increment the thread index along the outer dimensions with the grid stride
-        if constexpr (last_dimension > 0)
-          for (auto dimension = last_dimension - 1; dimension >= 0; --dimension) {
-            first_[dimension] += stride_[dimension];
-            index_[dimension] = first_[dimension];
-            if (index_[dimension] < extent_[dimension])
-              return *this;
-          }
-
-        // the iterator has reached or passed the end of the extent, clamp it to the extent
-        first_ = extent_;
-        index_ = extent_;
-        last_ = extent_[last_dimension];
+      ALPAKA_FN_ACC constexpr inline iterator operator++() {
+        increment();
         return *this;
       }
 
       // post-increment the iterator
-      ALPAKA_FN_ACC inline iterator operator++(int) {
+      ALPAKA_FN_ACC constexpr inline iterator operator++(int) {
         iterator old = *this;
-        ++(*this);
+        increment();
         return old;
       }
 
-      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
-        return (index_ == other.index_) and (first_ == other.first_);
-      }
+      ALPAKA_FN_ACC constexpr inline bool operator==(iterator const& other) const { return (index_ == other.index_); }
 
-      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+      ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }
 
     private:
-      // non-const to support iterator copy and assignment
-      Vec elements_;
-      Vec stride_;
-      Vec extent_;
+      // private, explicit constructor
+      ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
+          : loop_{loop},
+            thread_{alpaka::elementwise_min(first, loop->extent_)},
+            range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
+            index_{thread_} {}
+
+      template <size_t I>
+      ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
+        bool overflow = false;
+        ++index_[I];
+        if (index_[I] >= range_[I]) {
+          index_[I] = thread_[I];
+          overflow = true;
+        }
+        return overflow;
+      }
+
+      template <size_t N>
+      ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
+        if constexpr (N == 0) {
+          // overflow
+          return true;
+        } else {
+          if (not nth_elements_loop<N - 1>()) {
+            return false;
+          } else {
+            return do_elements_loops<N - 1>();
+          }
+        }
+      }
+
+      template <size_t I>
+      ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
+        bool overflow = false;
+        thread_[I] += loop_->stride_[I];
+        if (thread_[I] >= loop_->extent_[I]) {
+          thread_[I] = loop_->first_[I];
+          overflow = true;
+        }
+        index_[I] = thread_[I];
+        range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]);
+        return overflow;
+      }
+
+      template <size_t N>
+      ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
+        if constexpr (N == 0) {
+          // overflow
+          return true;
+        } else {
+          if (not nth_strided_loop<N - 1>()) {
+            return false;
+          } else {
+            return do_strided_loops<N - 1>();
+          }
+        }
+      }
+
+      // increment the iterator
+      ALPAKA_FN_ACC inline constexpr void increment() {
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // linear N-dimensional loops over the elements associated to the thread;
+          // do_elements_loops<>() returns true if any of those loops overflows
+          if (not do_elements_loops<Dim::value>()) {
+            // the elements loops did not overflow, return the next index
+            return;
+          }
+        }
+
+        // strided N-dimensional loop over the threads in the kernel launch grid;
+        // do_strided_loops<>() returns true if any of those loops overflows
+        if (not do_strided_loops<Dim::value>()) {
+          // the strided loops did not overflow, return the next index
+          return;
+        }
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        thread_ = loop_->extent_;
+        range_ = loop_->extent_;
+        index_ = loop_->extent_;
+      }
+
+      // const pointer to the elements_with_stride_nd that the iterator refers to
+      const elements_with_stride_nd* loop_;
+
       // modified by the pre/post-increment operator
-      Vec first_;
-      Vec index_;
-      Idx last_;
+      Vec thread_;  // first element processed by this thread
+      Vec range_;   // last element processed by this thread
+      Vec index_;   // current element processed by this thread
     };
 
-    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; }
 
-    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
+    ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; }
 
   private:
     const Vec elements_;
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
index 4cbf4f1dc08d8..a1125435e7440 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -37,6 +37,17 @@ struct VectorAddKernel1D {
   }
 };
 
+struct VectorAddKernel3D {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec3D size) const {
+    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
+      auto index = (ndindex[0] * size[1] + ndindex[1]) * size[2] + ndindex[2];
+      out[index] = in1[index] + in2[index];
+    }
+  }
+};
+
 TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) {
   SECTION("VectorAddKernel") {
     // get the list of devices on the current platform
@@ -125,3 +136,74 @@ TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag)
     }
   }
 }
+
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
SECTION("VectorAddKernel3D") { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return; + } + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // 3-dimensional and linearised buffer size + constexpr Vec3D ndsize = {50, 125, 16}; + constexpr size_t size = ndsize.prod(); + + // allocate input and output host buffers + auto in1_h = cms::alpakatools::make_host_buffer(size); + auto in2_h = cms::alpakatools::make_host_buffer(size); + auto out_h = cms::alpakatools::make_host_buffer(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + // run the test on each device + for (auto const& device : devices) { + std::cout << "Test 3D vector addition on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + + // allocate input and output buffers on the device + auto in1_d = cms::alpakatools::make_device_buffer(queue, size); + auto in2_d = cms::alpakatools::make_device_buffer(queue, size); + auto out_d = cms::alpakatools::make_device_buffer(queue, size); + + // copy the input data to the device; the size is known from the buffer objects + alpaka::memcpy(queue, in1_d, in1_h); + alpaka::memcpy(queue, in2_d, in2_h); + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::memset(queue, out_d, 0.); + + // launch the 3-dimensional kernel + auto div = cms::alpakatools::make_workdiv({5, 5, 1}, {4, 4, 4}); + alpaka::exec(queue, div, VectorAddKernel3D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize); + + // copy the results from the device to the host + alpaka::memcpy(queue, out_h, out_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + // check the results + for (size_t i = 0; i < size; ++i) { + float sum = in1_h[i] + in2_h[i]; + REQUIRE(out_h[i] < sum + epsilon); + REQUIRE(out_h[i] > sum - epsilon); + } + } + } +}