diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 38fb0d1f525e7..7ed55e7d60ebe 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -19,27 +19,34 @@ namespace cms::alpakatools {
 
   // Return the integer division of the first argument by the second argument, rounded up to the next integer
   inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
 
+  // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
+  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+  struct requires_single_thread_per_block : public std::true_type {};
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+  template <typename TDim>
+  struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+  template <typename TDim>
+  struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+  // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
+  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+  inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;
+
   // Create an accelerator-dependent work division for 1-dimensional kernels
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) {
-      // On GPU backends, each thread is looking at a single element:
-      //   - the number of threads per block is "elements";
-      //   - the number of elements per thread is always 1.
-      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
-    } else
-#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
-#if ALPAKA_ACC_GPU_HIP_ENABLED
-    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) {
+    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
       // On GPU backends, each thread is looking at a single element:
       //   - the number of threads per block is "elements";
       //   - the number of elements per thread is always 1.
       return WorkDiv<Dim1D>(blocks, elements, Idx{1});
-    } else
-#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
-    {
+    } else {
       // On CPU backends, run serially with a single thread per block:
       //   - the number of threads per block is always 1;
       //   - the number of elements per thread is "elements".
@@ -52,23 +59,12 @@ namespace cms::alpakatools {
   inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                                  const Vec<alpaka::Dim<TAcc>>& elements) {
     using Dim = alpaka::Dim<TAcc>;
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) {
+    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
       // On GPU backends, each thread is looking at a single element:
       //   - the number of threads per block is "elements";
       //   - the number of elements per thread is always 1.
       return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
-    } else
-#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) {
-      // On GPU backends, each thread is looking at a single element:
-      //   - the number of threads per block is "elements";
-      //   - the number of elements per thread is always 1.
-      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
-    } else
-#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
-    {
+    } else {
       // On CPU backends, run serially with a single thread per block:
       //   - the number of threads per block is always 1;
       //   - the number of elements per thread is "elements".
@@ -108,10 +104,12 @@ namespace cms::alpakatools {
 
       // pre-increment the iterator
       ALPAKA_FN_ACC inline iterator& operator++() {
-        // increment the index along the elements processed by the current thread
-        ++index_;
-        if (index_ < last_)
-          return *this;
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // increment the index along the elements processed by the current thread
+          ++index_;
+          if (index_ < last_)
+            return *this;
+        }
 
         // increment the thread index with the grid stride
         first_ += stride_;
@@ -183,76 +181,123 @@ namespace cms::alpakatools {
 
     class iterator {
      friend class elements_with_stride_nd;
 
-      constexpr static const auto last_dimension = Dim::value - 1;
-
-      ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first)
-          : elements_{elements},
-            stride_{stride},
-            extent_{extent},
-            first_{alpaka::elementwise_min(first, extent)},
-            index_{first_},
-            last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {}
-
     public:
       ALPAKA_FN_ACC inline Vec operator*() const { return index_; }
 
       // pre-increment the iterator
-      ALPAKA_FN_ACC inline iterator& operator++() {
-        // increment the index along the elements processed by the current thread
-        ++index_[last_dimension];
-        if (index_[last_dimension] < last_)
-          return *this;
-
-        // increment the thread index along with the last dimension with the grid stride
-        first_[last_dimension] += stride_[last_dimension];
-        index_[last_dimension] = first_[last_dimension];
-        last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
-        if (index_[last_dimension] < extent_[last_dimension])
-          return *this;
-
-        // increment the thread index along the outer dimensions with the grid stride
-        if constexpr (last_dimension > 0)
-          for (auto dimension = last_dimension - 1; dimension >= 0; --dimension) {
-            first_[dimension] += stride_[dimension];
-            index_[dimension] = first_[dimension];
-            if (index_[dimension] < extent_[dimension])
-              return *this;
-          }
-
-        // the iterator has reached or passed the end of the extent, clamp it to the extent
-        first_ = extent_;
-        index_ = extent_;
-        last_ = extent_[last_dimension];
+      ALPAKA_FN_ACC constexpr inline iterator operator++() {
+        increment();
         return *this;
       }
 
       // post-increment the iterator
-      ALPAKA_FN_ACC inline iterator operator++(int) {
+      ALPAKA_FN_ACC constexpr inline iterator operator++(int) {
         iterator old = *this;
-        ++(*this);
+        increment();
         return old;
       }
 
-      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
-        return (index_ == other.index_) and (first_ == other.first_);
-      }
+      ALPAKA_FN_ACC constexpr inline bool operator==(iterator const& other) const { return (index_ == other.index_); }
 
-      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+      ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }
 
     private:
-      // non-const to support iterator copy and assignment
-      Vec elements_;
-      Vec stride_;
-      Vec extent_;
+      // private, explicit constructor
+      ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
+          : loop_{loop},
+            thread_{alpaka::elementwise_min(first, loop->extent_)},
+            range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
+            index_{thread_} {}
+
+      template <size_t I>
+      ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
+        bool overflow = false;
+        ++index_[I];
+        if (index_[I] >= range_[I]) {
+          index_[I] = thread_[I];
+          overflow = true;
+        }
+        return overflow;
+      }
+
+      template <size_t N>
+      ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
+        if constexpr (N == 0) {
+          // overflow
+          return true;
+        } else {
+          if (not nth_elements_loop<N - 1>()) {
+            return false;
+          } else {
+            return do_elements_loops<N - 1>();
+          }
+        }
+      }
+
+      template <size_t I>
+      ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
+        bool overflow = false;
+        thread_[I] += loop_->stride_[I];
+        if (thread_[I] >= loop_->extent_[I]) {
+          thread_[I] = loop_->first_[I];
+          overflow = true;
+        }
+        index_[I] = thread_[I];
+        range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]);
+        return overflow;
+      }
+
+      template <size_t N>
+      ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
+        if constexpr (N == 0) {
+          // overflow
+          return true;
+        } else {
+          if (not nth_strided_loop<N - 1>()) {
+            return false;
+          } else {
+            return do_strided_loops<N - 1>();
+          }
+        }
+      }
+
+      // increment the iterator
+      ALPAKA_FN_ACC inline constexpr void increment() {
+        if constexpr (requires_single_thread_per_block_v<TAcc>) {
+          // linear N-dimensional loops over the elements associated to the thread;
+          // do_elements_loops<>() returns true if any of those loops overflows
+          if (not do_elements_loops<Dim::value>()) {
+            // the elements loops did not overflow, return the next index
+            return;
+          }
+        }
+
+        // strided N-dimensional loop over the threads in the kernel launch grid;
+        // do_strided_loops<>() returns true if any of those loops overflows
+        if (not do_strided_loops<Dim::value>()) {
+          // the strided loops did not overflow, return the next index
+          return;
+        }
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        thread_ = loop_->extent_;
+        range_ = loop_->extent_;
+        index_ = loop_->extent_;
+      }
+
+      // const pointer to the elements_with_stride_nd that the iterator refers to
+      const elements_with_stride_nd* loop_;
+
       // modified by the pre/post-increment operator
-      Vec first_;
-      Vec index_;
-      Idx last_;
+      Vec thread_;  // first element processed by this thread
+      Vec range_;   // last element processed by this thread
+      Vec index_;   // current element processed by this thread
     };
 
-    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; }
 
-    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
+    ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; }
 
   private:
     const Vec elements_;
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
index 4cbf4f1dc08d8..a1125435e7440 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
@@ -37,6 +37,17 @@ struct VectorAddKernel1D {
   }
 };
 
+struct VectorAddKernel3D {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(
+      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec3D size) const {
+    for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size)) {
+      auto index = (ndindex[0] * size[1] + ndindex[1]) * size[2] + ndindex[2];
+      out[index] = in1[index] + in2[index];
+    }
+  }
+};
+
 TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) {
   SECTION("VectorAddKernel") {
     // get the list of devices on the current platform
@@ -125,3 +136,74 @@ TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag)
     }
   }
 }
+
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
SECTION("VectorAddKernel3D") { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return; + } + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // 3-dimensional and linearised buffer size + constexpr Vec3D ndsize = {50, 125, 16}; + constexpr size_t size = ndsize.prod(); + + // allocate input and output host buffers + auto in1_h = cms::alpakatools::make_host_buffer(size); + auto in2_h = cms::alpakatools::make_host_buffer(size); + auto out_h = cms::alpakatools::make_host_buffer(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + // run the test on each device + for (auto const& device : devices) { + std::cout << "Test 3D vector addition on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + + // allocate input and output buffers on the device + auto in1_d = cms::alpakatools::make_device_buffer(queue, size); + auto in2_d = cms::alpakatools::make_device_buffer(queue, size); + auto out_d = cms::alpakatools::make_device_buffer(queue, size); + + // copy the input data to the device; the size is known from the buffer objects + alpaka::memcpy(queue, in1_d, in1_h); + alpaka::memcpy(queue, in2_d, in2_h); + + // fill the output buffer with zeros; the size is known from the buffer objects + alpaka::memset(queue, out_d, 0.); + + // launch the 3-dimensional kernel + auto div = cms::alpakatools::make_workdiv({5, 5, 1}, {4, 4, 4}); + alpaka::exec(queue, div, VectorAddKernel3D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize); + + // copy the results from the device to the host + alpaka::memcpy(queue, out_h, out_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + // check the results + for (size_t i = 0; i < size; ++i) { + float sum = in1_h[i] + in2_h[i]; + REQUIRE(out_h[i] < sum + epsilon); + REQUIRE(out_h[i] > sum - epsilon); + } + } + } +}