diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt
index c3ae2f682c..0ad4fb096c 100644
--- a/benchmarks/gbench/mp/CMakeLists.txt
+++ b/benchmarks/gbench/mp/CMakeLists.txt
@@ -80,13 +80,31 @@ add_executable(wave_equation wave_equation.cpp)
 target_link_libraries(wave_equation cxxopts DR::mpi)
 target_compile_definitions(wave_equation PRIVATE STANDALONE_BENCHMARK)
 add_mp_ctest(NAME wave_equation)
+add_executable(wave_equation_wide wave_equation_wide.cpp)
+target_link_libraries(wave_equation_wide cxxopts DR::mpi)
+target_compile_definitions(wave_equation_wide PRIVATE STANDALONE_BENCHMARK)
+add_mp_ctest(NAME wave_equation_wide)
 # add_mp_ctest(TEST_NAME wave_equation_fused NAME wave_equation TARGS -f) #
 # DRA-92
 if(ENABLE_SYCL)
   add_mp_ctest(
-    TEST_NAME wave_equation-sycl NAME wave_equation NPROC 2 SYCL)
+    TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL)
   add_mp_ctest(
-    TEST_NAME wave_equation_fused-sycl NAME wave_equation NPROC 2 SYCL TARGS -f)
+    TEST_NAME wave_equation-sycl-benchmark NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL TARGS -t)
+  add_mp_ctest(
+    TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 1000 NPROC 2 SYCL TARGS -f)
+  add_mp_ctest(
+    TEST_NAME wave_equation_wide-sycl NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL)
+  foreach(redundancy RANGE 1 8)
+    add_mp_ctest(
+      TEST_NAME wave_equation_wide-sycl-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS -t 100 -r ${redundancy})
+  endforeach()
+  add_mp_ctest(
+    TEST_NAME wave_equation_wide-sycl-gpu NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory)
+  foreach(redundancy RANGE 1 8)
+    add_mp_ctest(
+      TEST_NAME wave_equation_wide-sycl-gpu-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory -t 100 -r ${redundancy})
+  endforeach()
 endif()
 
 add_executable(shallow_water shallow_water.cpp)
diff --git a/benchmarks/gbench/mp/wave_equation_wide.cpp b/benchmarks/gbench/mp/wave_equation_wide.cpp
new file mode 100644
index 0000000000..51830d9deb
--- /dev/null
+++ b/benchmarks/gbench/mp/wave_equation_wide.cpp
@@ -0,0 +1,454 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "cxxopts.hpp"
+#include "dr/mp.hpp"
+#include "mpi.h"
+#include "wave_utils.hpp"
+#include <chrono>
+#include <iomanip>
+#include <memory>
+
+#ifdef STANDALONE_BENCHMARK
+
+MPI_Comm comm;
+int comm_rank;
+int comm_size;
+
+#else
+
+#include "../common/dr_bench.hpp"
+
+#endif
+
+namespace WaveEquation {
+
+using T = double;
+using Array = dr::mp::distributed_mdarray<T, 2>;
+
+// gravitational acceleration
+constexpr double g = 9.81;
+// water depth
+constexpr double h = 1.0;
+
+// Get number of read/write bytes and flops for a single time step
+// These numbers correspond to the fused kernel version
+void calculate_complexity(std::size_t nx, std::size_t ny, std::size_t &nread,
+                          std::size_t &nwrite, std::size_t &nflop) {
+  // stage1: 2+2+3 = 7
+  // stage2: 3+3+4 = 10
+  // stage3: 3+3+4 = 10
+  nread = (27 * nx * ny) * sizeof(T);
+  // stage1: 3
+  // stage2: 3
+  // stage3: 3
+  nwrite = (9 * nx * ny) * sizeof(T);
+  // stage1: 3+3+6 = 12
+  // stage2: 6+6+9 = 21
+  // stage3: 6+6+9 = 21
+  nflop = 54 * nx * ny;
+}
+
+double exact_elev(double x, double y, double t, double lx, double ly) {
+  /**
+   * Exact solution for elevation field.
+   *
+   * Returns time-dependent elevation of a 2D standing wave in a
+   * rectangular domain.
+   */
+  double amp = 0.5;
+  double c = std::sqrt(g * h);
+  double sol_x = std::cos(2.0 * M_PI * x / lx);
+  double sol_y = std::cos(2.0 * M_PI * y / ly);
+  double omega = c * M_PI * std::hypot(1.0 / lx, 1.0 / ly);
+  double sol_t = std::cos(2.0 * omega * t);
+  return amp * sol_x * sol_y * sol_t;
+}
+
+double initial_elev(double x, double y, double lx, double ly) {
+  return exact_elev(x, y, 0.0, lx, ly);
+}
+
+void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt,
+         double dx_inv, double dy_inv, double dt) {
+  /**
+   * Evaluate right hand side of the equations
+   */
+  auto rhs_dedx = [dt, dx_inv](auto v) {
+    auto [in, out] = v;
+    out(0, 0) = -dt * g * (in(1, 0) - in(0, 0)) * dx_inv;
+  };
+  stencil_for_each_extended<2>(rhs_dedx, {0, 0}, {1, 0}, e, dudt);
+
+  auto rhs_dedy = [dt, dy_inv](auto v) {
+    auto [in, out] = v;
+    out(0, 0) = -dt * g * (in(0, 0) - in(0, -1)) * dy_inv;
+  };
+  stencil_for_each_extended<2>(rhs_dedy, {0, 1}, {0, 0}, e, dvdt);
+
+  auto rhs_div = [dt, dx_inv, dy_inv](auto args) {
+    auto [u, v, out] = args;
+    auto dudx = (u(0, 0) - u(-1, 0)) * dx_inv;
+    auto dvdy = (v(0, 1) - v(0, 0)) * dy_inv;
+    out(0, 0) = -dt * h * (dudx + dvdy);
+  };
+  stencil_for_each_extended<2>(rhs_div, {1, 0}, {0, 0}, u, v, dedt);
+}
+
+int run(
+    std::size_t n, std::size_t redundancy, std::size_t steps,
+    std::function<void()> iter_callback = []() {}) {
+  // construct grid
+  // number of cells in x, y direction
+  std::size_t nx = n;
+  std::size_t ny = n;
+  const double xmin = -1, xmax = 1;
+  const double ymin = -1, ymax = 1;
+  ArakawaCGrid grid(xmin, xmax, ymin, ymax, nx, ny);
+
+  auto dist = dr::mp::distribution().halo(1).redundancy(redundancy);
+
+  // statistics
+  std::size_t nread, nwrite, nflop;
+  calculate_complexity(nx, ny, nread, nwrite, nflop);
+
+  if (comm_rank == 0) {
+    std::cout << "Using backend: dr" << std::endl;
+    std::cout << "Grid size: " << nx << " x " << ny << std::endl;
+    std::cout << "Elevation DOFs: " << nx * ny << std::endl;
+    std::cout << "Velocity  DOFs: " << (nx + 1) * ny + nx * (ny + 1)
+              << std::endl;
+    std::cout << "Total     DOFs: " << nx * ny + (nx + 1) * ny + nx * (ny + 1);
+    std::cout << std::endl;
+  }
+
+  // compute time step
+  double c = std::sqrt(g * h);
+  double alpha = 0.5;
+  double dt = alpha * std::min(grid.dx, grid.dy) / c;
+  std::size_t nt = steps;
+  dt = 1e-5;
+  double t_end = static_cast<double>(nt) * dt;
+  double t_export = 25 * dt;
+
+  if (comm_rank == 0) {
+    std::cout << "Time step: " << dt << " s" << std::endl;
+    std::cout << "Total run time: " << std::fixed << std::setprecision(1);
+    std::cout << t_end << " s, ";
+    std::cout << nt << " time steps" << std::endl;
+    std::cout << "Redundancy " << redundancy << std::endl;
+  }
+
+  std::cout << "before e\n";
+  // state variables
+  // water elevation at T points
+  Array e({nx + 1, ny}, dist);
+  std::cout << "after e\n";
+  dr::mp::fill(e, 0.0);
+  std::cout << "after fill e\n";
+  // x velocity at U points
+  Array u({nx + 1, ny}, dist);
+  dr::mp::fill(u, 0.0);
+  // y velocity at V points
+  Array v({nx + 1, ny + 1}, dist);
+  dr::mp::fill(v, 0.0);
+
+  // state for RK stages
+  Array e1({nx + 1, ny}, dist);
+  Array u1({nx + 1, ny}, dist);
+  Array v1({nx + 1, ny + 1}, dist);
+  Array e2({nx + 1, ny}, dist);
+  Array u2({nx + 1, ny}, dist);
+  Array v2({nx + 1, ny + 1}, dist);
+
+  // time tendencies
+  // NOTE not needed if rhs kernels are fused with RK stage assignment
+  Array dedt({nx + 1, ny}, dist);
+  Array dudt({nx + 1, ny}, dist);
+  Array dvdt({nx + 1, ny + 1}, dist);
+
+  std::cout << "After all arrays\n";
+
+  dr::mp::fill(dedt, 0);
+  dr::mp::fill(dudt, 0);
+  dr::mp::fill(dvdt, 0);
+  std::cout << "After fill\n";
+
+  dr::mp::halo(dedt).exchange();
+  dr::mp::halo(dudt).exchange();
+  dr::mp::halo(dvdt).exchange();
+  std::cout << "After first exchange\n";
+
+  auto init_op = [xmin, ymin, grid](auto index, auto v) {
+    auto &[o] = v;
+
+    std::size_t global_i = index[0];
+    if (global_i > 0) {
+      std::size_t global_j = index[1];
+      T x = xmin + grid.dx / 2 + static_cast<double>(global_i - 1) * grid.dx;
+      T y = ymin + grid.dy / 2 + static_cast<double>(global_j) * grid.dy;
+      o = initial_elev(x, y, grid.lx, grid.ly);
+    }
+  };
+  dr::mp::for_each(init_op, e);
+  std::cout << "After mp::for_each\n";
+
+  auto add = [](auto ops) { return ops.first + ops.second; };
+  auto max = [](double x, double y) { return std::max(x, y); };
+  auto rk_update2 = [](auto ops) {
+    return 0.75 * std::get<0>(ops) +
+           0.25 * (std::get<1>(ops) + std::get<2>(ops));
+  };
+  auto rk_update3 = [](auto ops) {
+    return 1.0 / 3.0 * std::get<0>(ops) +
+           2.0 / 3.0 * (std::get<1>(ops) + std::get<2>(ops));
+  };
+
+  std::size_t i_export = 0;
+  double next_t_export = 0.0;
+  double t = 0.0;
+  double initial_v = 0.0;
+  auto tic = std::chrono::steady_clock::now();
+
+  // RK stage 1: u1 = u + dt*rhs(u)
+  auto stage_1 = [&] {
+    rhs(u, v, e, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt);
+    dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add);
+    dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add);
+    dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add);
+  };
+  // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1))
+  auto stage_2 = [&] {
+    rhs(u1, v1, e1, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt);
+    dr::mp::transform(dr::mp::views::zip(u, u1, dudt), u2.begin(), rk_update2);
+    dr::mp::transform(dr::mp::views::zip(v, v1, dvdt), v2.begin(), rk_update2);
+    dr::mp::transform(dr::mp::views::zip(e, e1, dedt), e2.begin(), rk_update2);
+  };
+  // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2))
+  auto stage_3 = [&] {
+    rhs(u2, v2, e2, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt);
+    dr::mp::transform(dr::mp::views::zip(u, u2, dudt), u.begin(), rk_update3);
+    dr::mp::transform(dr::mp::views::zip(v, v2, dvdt), v.begin(), rk_update3);
+    dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3);
+  };
+
+  for (std::size_t i = 0; i < nt + 1; i++) {
+    std::cout << "i = " << i << "\n";
+    t = static_cast<double>(i) * dt;
+
+    if (t >= next_t_export - 1e-8) {
+
+      double elev_max = dr::mp::reduce(e, static_cast<T>(0), max);
+      double u_max = dr::mp::reduce(u, static_cast<T>(0), max);
+
+      double total_v = (dr::mp::reduce(e, static_cast<T>(0), std::plus{}) + h) *
+                       grid.dx * grid.dy;
+      if (i == 0) {
+        initial_v = total_v;
+      }
+      double diff_v = total_v - initial_v;
+
+      if (comm_rank == 0) {
+        printf("%2lu %4lu %.3f ", i_export, i, t);
+        printf("elev=%7.5f ", elev_max);
+        printf("u=%7.5f ", u_max);
+        printf("dV=% 6.3e ", diff_v);
+        printf("\n");
+      }
+      if (elev_max > 1e3) {
+        if (comm_rank == 0) {
+          std::cout << "Invalid elevation value: " << elev_max << std::endl;
+        }
+        return 1;
+      }
+      i_export += 1;
+      next_t_export = static_cast<double>(i_export) * t_export;
+    }
+
+    // step
+    iter_callback();
+    if ((i + 1) % redundancy == 0) {
+      // phase with communication - once after (redundancy - 1) steps without
+      // communication
+      dr::mp::halo(e).exchange();
+      dr::mp::halo(u).exchange();
+      dr::mp::halo(v).exchange();
+      stage_1();
+
+      dr::mp::halo(u1).exchange();
+      dr::mp::halo(v1).exchange();
+      dr::mp::halo(e1).exchange();
+      stage_2();
+
+      dr::mp::halo(u2).exchange();
+      dr::mp::halo(v2).exchange();
+      dr::mp::halo(e2).exchange();
+      stage_3();
+    } else {
+      // Phase without communication
+      stage_1();
+      stage_2();
+      stage_3();
+    }
+  }
+
+  dr::mp::halo(e).exchange();
+  dr::mp::halo(u).exchange();
+  dr::mp::halo(v).exchange();
+  dr::mp::halo(u1).exchange();
+  dr::mp::halo(v1).exchange();
+  dr::mp::halo(e1).exchange();
+  dr::mp::halo(u2).exchange();
+  dr::mp::halo(v2).exchange();
+  dr::mp::halo(e2).exchange();
+
+  auto toc = std::chrono::steady_clock::now();
+  std::chrono::duration<double> duration = toc - tic;
+  if (comm_rank == 0) {
+    double t_cpu = duration.count();
+    double t_step = t_cpu / static_cast<double>(nt);
+    double read_bw = double(nread) / t_step / (1024 * 1024 * 1024);
+    double write_bw = double(nwrite) / t_step / (1024 * 1024 * 1024);
+    double flop_rate = double(nflop) / t_step / (1000 * 1000 * 1000);
+    double ai = double(nflop) / double(nread + nwrite);
+    std::cout << "Duration: " << std::setprecision(3) << t_cpu;
+    std::cout << " s" << std::endl;
+    std::cout << "Time per step: " << std::setprecision(2) << t_step * 1000;
+    std::cout << " ms" << std::endl;
+    std::cout << "Reads : " << std::setprecision(3) << read_bw;
+    std::cout << " GB/s" << std::endl;
+    std::cout << "Writes: " << std::setprecision(3) << write_bw;
+    std::cout << " GB/s" << std::endl;
+    std::cout << "FLOP/s: " << std::setprecision(3) << flop_rate;
+    std::cout << " GFLOP/s" << std::endl;
+    std::cout << "Arithmetic intensity: " << std::setprecision(5) << ai;
+    std::cout << " FLOP/Byte" << std::endl;
+  }
+
+  // Compute error against exact solution
+  Array e_exact({nx + 1, ny}, dist);
+  dr::mp::fill(e_exact, 0.0);
+  Array error({nx + 1, ny}, dist);
+
+  auto exact_op = [xmin, ymin, grid, t](auto index, auto v) {
+    auto &[o] = v;
+
+    std::size_t global_i = index[0];
+    if (global_i > 0) {
+      std::size_t global_j = index[1];
+      T x = xmin + grid.dx / 2 + static_cast<double>(global_i - 1) * grid.dx;
+      T y = ymin + grid.dy / 2 + static_cast<double>(global_j) * grid.dy;
+      o = exact_elev(x, y, t, grid.lx, grid.ly);
+    }
+  };
+  dr::mp::for_each(exact_op, e_exact);
+  dr::mp::halo(e_exact).exchange();
+  auto error_kernel = [](auto ops) {
+    auto err = ops.first - ops.second;
+    return err * err;
+  };
+  dr::mp::transform(dr::mp::views::zip(e, e_exact), error.begin(),
+                    error_kernel);
+  double err_L2 = dr::mp::reduce(error, static_cast<T>(0), std::plus{}) *
+                  grid.dx * grid.dy / grid.lx / grid.ly;
+  err_L2 = std::sqrt(err_L2);
+  if (comm_rank == 0) {
+    std::cout << "L2 error: " << std::setw(7) << std::scientific;
+    std::cout << std::setprecision(5) << err_L2 << std::endl;
+  }
+  return 0;
+}
+
+} // namespace WaveEquation
+
+#ifdef STANDALONE_BENCHMARK
+
+int main(int argc, char *argv[]) {
+
+  MPI_Init(&argc, &argv);
+  comm = MPI_COMM_WORLD;
+  MPI_Comm_rank(comm, &comm_rank);
+  MPI_Comm_size(comm, &comm_size);
+
+  cxxopts::Options options_spec(argv[0], "wave equation");
+  // clang-format off
+  options_spec.add_options()
+    ("n", "Grid size", cxxopts::value<std::size_t>()->default_value("128"))
+    ("t,steps", "Run a fixed number of time steps.", cxxopts::value<std::size_t>()->default_value("100"))
+    ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value<std::size_t>()->default_value("2"))
+    ("sycl", "Execute on SYCL device")
+    ("l,log", "enable logging")
+    ("logprefix", "appended .RANK.log", cxxopts::value<std::string>()->default_value("dr"))
+    ("device-memory", "Use device memory")
+    ("h,help", "Print help");
+  // clang-format on
+
+  cxxopts::ParseResult options;
+  try {
+    options = options_spec.parse(argc, argv);
+  } catch (const cxxopts::OptionParseException &e) {
+    std::cout << options_spec.help() << "\n";
+    exit(1);
+  }
+
+  std::unique_ptr<std::ofstream> logfile;
+  if (options.count("log")) {
+    logfile =
+        std::make_unique<std::ofstream>(options["logprefix"].as<std::string>() +
+                                        fmt::format(".{}.log", comm_rank));
+    dr::drlog.set_file(*logfile);
+  }
+
+  if (options.count("sycl")) {
+#ifdef SYCL_LANGUAGE_VERSION
+    sycl::queue q = dr::mp::select_queue();
+    std::cout << "Run on: "
+              << q.get_device().get_info<sycl::info::device::name>() << "\n";
+    dr::mp::init(q, options.count("device-memory") ? sycl::usm::alloc::device
+                                                   : sycl::usm::alloc::shared);
+#else
+    std::cout << "Sycl support requires icpx\n";
+    exit(1);
+#endif
+  } else {
+    if (comm_rank == 0) {
+      std::cout << "Run on: CPU\n";
+    }
+    dr::mp::init();
+  }
+
+  std::size_t n = options["n"].as<std::size_t>();
+  std::size_t redundancy = options["r"].as<std::size_t>();
+  std::size_t steps = options["t"].as<std::size_t>();
+
+  auto error = WaveEquation::run(n, redundancy, steps);
+  dr::mp::finalize();
+  MPI_Finalize();
+  return error;
+}
+
+#else
+
+static void WaveEquation_DR(benchmark::State &state) {
+
+  int n = ::sqrtl(default_vector_size);
+
+  // ugly hack to make it working in reasonable time in benchmarking framework
+  // drbench.py should specify right size or there should be another size option
+  // to use here instead of default_vector_size
+  n /= 4;
+
+  std::size_t nread, nwrite, nflop;
+  WaveEquation::calculate_complexity(n, n, nread, nwrite, nflop);
+  Stats stats(state, nread, nwrite, nflop);
+
+  auto iter_callback = [&stats]() { stats.rep(); };
+  for (auto _ : state) {
+    WaveEquation::run(n, true, true, iter_callback);
+  }
+}
+
+DR_BENCHMARK(WaveEquation_DR);
+
+#endif
diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp
index 8851208198..2b841be43b 100644
--- a/include/dr/mp/algorithms/for_each.hpp
+++ b/include/dr/mp/algorithms/for_each.hpp
@@ -6,6 +6,7 @@
 
 #include <algorithm>
 #include <execution>
+#include <ranges>
 #include <type_traits>
 #include <utility>
 
@@ -14,6 +15,7 @@
 #include <dr/detail/onedpl_direct_iterator.hpp>
 #include <dr/detail/ranges_shim.hpp>
 #include <dr/detail/sycl_utils.hpp>
+#include <dr/detail/tuple_utils.hpp>
 #include <dr/mp/global.hpp>
 
 namespace dr::mp {
@@ -62,4 +64,187 @@ DI for_each_n(DI first, I n, auto op) {
   return last;
 }
 
+namespace __detail {
+template <std::size_t Rank>
+using stencil_index_type = dr::__detail::dr_extents<Rank>;
+
+void stencil_for_each_extended_1(auto op, stencil_index_type<1> begin,
+                                 stencil_index_type<1> end, const auto &segs) {
+  auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end);
+
+  auto sub = [](auto a) { return std::get<1>(a) - std::get<0>(a); };
+  auto is_zero = [](auto a) { return a != 0; };
+
+  auto zipped = zip_view(seg0_begin, seg0_end);
+  auto distance = zipped | std::views::transform(sub);
+
+  if (rng::empty(distance | std::views::filter(is_zero)))
+    return;
+
+  auto seg_infos = dr::__detail::tuple_transform(segs, [begin](auto &&seg) {
+    return std::make_pair(seg.begin() + seg.begin_stencil(begin)[0],
+                          seg.extents());
+  });
+
+  auto do_point = [seg_infos, op](auto index) {
+    auto stencils =
+        dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) {
+          return md::mdspan(
+              std::to_address(dr::ranges::local(seg_info.first + index)),
+              seg_info.second);
+        });
+    op(stencils);
+  };
+  if (mp::use_sycl()) {
+#ifdef SYCL_LANGUAGE_VERSION
+    dr::__detail::parallel_for(dr::mp::sycl_queue(),
+                               sycl::range<1>(distance[0]), do_point)
+        .wait();
+#else
+    assert(false);
+#endif
+  } else {
+    for (std::size_t i = 0; i < distance[0]; i++) {
+      do_point(i);
+    }
+  }
+}
+
+void stencil_for_each_extended_2(auto op, stencil_index_type<2> &begin,
+                                 stencil_index_type<2> end, const auto &segs) {
+  auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end);
+
+  auto sub = [](auto a) {
+    auto x = std::get<0>(a);
+    auto y = std::get<1>(a);
+    return y > x ? y - x : 0;
+  };
+  auto is_zero = [](auto a) { return a != 0; };
+
+  auto zipped = zip_view(seg0_begin, seg0_end);
+  auto distance = zipped | std::views::transform(sub);
+
+  if (rng::empty(distance | std::views::filter(is_zero)))
+    return;
+
+  auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) {
+    auto ext = seg.root_mdspan().extents();
+    auto begin_stencil = seg.begin_stencil(begin);
+    return std::make_pair(md::mdspan(std::to_address(&seg.mdspan_extended()(
+                                         begin_stencil[0], begin_stencil[1])),
+                                     ext),
+                          ext);
+  });
+
+  auto do_point = [seg_infos, op](auto index) {
+    auto stencils =
+        dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) {
+          return md::mdspan(
+              std::to_address(&seg_info.first(index[0], index[1])),
+              seg_info.second);
+        });
+    op(stencils);
+  };
+  if (mp::use_sycl()) {
+#ifdef SYCL_LANGUAGE_VERSION
+    dr::__detail::parallel_for(dr::mp::sycl_queue(),
+                               sycl::range<2>(distance[0], distance[1]),
+                               do_point)
+        .wait();
+#else
+    assert(false);
+#endif
+  } else {
+    for (std::size_t i = 0; i < distance[0]; i++) {
+      for (std::size_t j = 0; j < distance[1]; j++) {
+        do_point(stencil_index_type<2>{i, j});
+      }
+    }
+  }
+}
+
+void stencil_for_each_extended_3(auto op, stencil_index_type<3> &begin,
+                                 stencil_index_type<3> end, const auto &segs) {
+  auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end);
+
+  auto sub = [](auto a) {
+    auto x = std::get<0>(a);
+    auto y = std::get<1>(a);
+    return y > x ? y - x : 0;
+  };
+  auto is_zero = [](auto a) { return a != 0; };
+
+  auto zipped = zip_view(seg0_begin, seg0_end);
+  auto distance = zipped | std::views::transform(sub);
+
+  if (rng::empty(distance | std::views::filter(is_zero)))
+    return;
+
+  auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) {
+    auto ext = seg.root_mdspan().extents();
+    auto begin_stencil = seg.begin_stencil(begin);
+    return std::make_pair(
+        md::mdspan(std::to_address(&seg.mdspan_extended()(
+                       begin_stencil[0], begin_stencil[1], begin_stencil[2])),
+                   ext),
+        ext);
+  });
+
+  auto do_point = [seg_infos, op](auto index) {
+    auto stencils =
+        dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) {
+          return md::mdspan(
+              std::to_address(&seg_info.first(index[0], index[1], index[2])),
+              seg_info.second);
+        });
+    op(stencils);
+  };
+  if (mp::use_sycl()) {
+#ifdef SYCL_LANGUAGE_VERSION
+    dr::__detail::parallel_for(
+        dr::mp::sycl_queue(),
+        sycl::range<3>(distance[0], distance[1], distance[2]), do_point)
+        .wait();
+#else
+    assert(false);
+#endif
+  } else {
+    for (std::size_t i = 0; i < distance[0]; i++) {
+      for (std::size_t j = 0; j < distance[1]; j++) {
+        for (std::size_t k = 0; k < distance[3]; k++) {
+          do_point(stencil_index_type<3>{i, j, k});
+        }
+      }
+    }
+  }
+}
+} // namespace __detail
+
+template <std::size_t Rank, typename... Ts>
+requires(1 <= Rank && Rank <= 3) void stencil_for_each_extended(
+    auto op, __detail::stencil_index_type<Rank> begin,
+    __detail::stencil_index_type<Rank> end,
+    dr::distributed_range auto &&...drs) {
+  auto ranges = std::tie(drs...);
+  auto &&dr0 = std::get<0>(ranges);
+  if (rng::empty(dr0)) {
+    return;
+  }
+
+  auto all_segments = rng::views::zip(dr::ranges::segments(drs)...);
+  for (const auto &segs : all_segments) {
+    if constexpr (Rank == 1) {
+      __detail::stencil_for_each_extended_1(op, begin, end, segs);
+    } else if constexpr (Rank == 2) {
+      __detail::stencil_for_each_extended_2(op, begin, end, segs);
+    } else if constexpr (Rank == 3) {
+      __detail::stencil_for_each_extended_3(op, begin, end, segs);
+    } else {
+      static_assert(false, "Not supported"); // sycl for_each does not support
+                                             // more than 3 dimensions
+    }
+  }
+  barrier();
+}
+
 } // namespace dr::mp
diff --git a/include/dr/mp/algorithms/md_for_each.hpp b/include/dr/mp/algorithms/md_for_each.hpp
index 9d92f0fe98..e0b29578e5 100644
--- a/include/dr/mp/algorithms/md_for_each.hpp
+++ b/include/dr/mp/algorithms/md_for_each.hpp
@@ -26,12 +26,12 @@ struct any {
 
 template <typename F, typename Arg1>
 concept one_argument = requires(F &f) {
-  { f(Arg1{}) };
+  {f(Arg1{})};
 };
 
 template <typename F, typename Arg1, typename Arg2>
 concept two_arguments = requires(F &f) {
-  { f(Arg1{}, Arg2{}) };
+  {f(Arg1{}, Arg2{})};
 };
 
 }; // namespace dr::mp::__detail
diff --git a/include/dr/mp/alignment.hpp b/include/dr/mp/alignment.hpp
index 71ce55dcc9..790e9620a4 100644
--- a/include/dr/mp/alignment.hpp
+++ b/include/dr/mp/alignment.hpp
@@ -11,7 +11,9 @@
 namespace dr::mp {
 
 template <typename T>
-concept has_segments = requires(T &t) { dr::ranges::segments(t); };
+concept has_segments = requires(T &t) {
+  dr::ranges::segments(t);
+};
 
 template <typename T>
 concept no_segments = !has_segments<T>;
diff --git a/include/dr/mp/containers/distributed_mdarray.hpp b/include/dr/mp/containers/distributed_mdarray.hpp
index 14cfdd2945..8050b4b035 100644
--- a/include/dr/mp/containers/distributed_mdarray.hpp
+++ b/include/dr/mp/containers/distributed_mdarray.hpp
@@ -17,20 +17,21 @@ template <typename T, std::size_t Rank> class distributed_mdarray {
   distributed_mdarray(dr::__detail::dr_extents<Rank> shape,
                       distribution dist = distribution())
       : tile_shape_(tile_shape(shape)), dv_(dv_size(), dv_dist(dist, shape)),
-        md_view_(make_md_view(dv_, shape, tile_shape_)) {}
+        md_view_(make_md_view(dv_, shape, tile_shape_, dist)), dist_(dist) {}
 
   auto begin() const { return rng::begin(md_view_); }
   auto end() const { return rng::end(md_view_); }
   auto size() const { return rng::size(md_view_); }
   auto operator[](auto n) { return md_view_[n]; }
 
-  auto segments() { return dr::ranges::segments(md_view_); }
+  auto segments() const { return dr::ranges::segments(md_view_); }
   auto &halo() const { return dr::mp::halo(dv_); }
 
   auto mdspan() const { return md_view_.mdspan(); }
   auto extent(std::size_t r) const { return mdspan().extent(r); }
   auto grid() { return md_view_.grid(); }
   auto view() const { return md_view_; }
+  auto dist() const { return dist_; }
 
   auto operator==(const distributed_mdarray &other) const {
     return std::equal(begin(), end(), other.begin());
@@ -70,16 +71,17 @@ template <typename T, std::size_t Rank> class distributed_mdarray {
   // This wrapper seems to avoid an issue with template argument
   // deduction for mdspan_view
   static auto make_md_view(const DV &dv, shape_type shape,
-                           shape_type tile_shape) {
-    return views::mdspan(dv, shape, tile_shape);
+                           shape_type tile_shape, distribution dist) {
+    return views::mdspan(dv, shape, tile_shape, dist);
   }
 
   shape_type tile_shape_;
   DV dv_;
-  using mdspan_type =
-      decltype(make_md_view(std::declval<DV>(), std::declval<shape_type>(),
-                            std::declval<shape_type>()));
+  using mdspan_type = decltype(make_md_view(
+      std::declval<DV>(), std::declval<shape_type>(),
+      std::declval<shape_type>(), std::declval<distribution>()));
   mdspan_type md_view_;
+  distribution dist_;
 };
 
 template <typename T, std::size_t Rank>
diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp
index 2611963064..0c8af9d4c0 100644
--- a/include/dr/mp/containers/distributed_vector.hpp
+++ b/include/dr/mp/containers/distributed_vector.hpp
@@ -276,6 +276,8 @@ template <typename T, class BackendT = MpiBackend> class distributed_vector {
 
   void fence() { backend.fence(); }
 
+  const auto &dist() const { return distribution_; }
+
 private:
   void init(auto size, auto dist) {
     size_ = size;
@@ -292,6 +294,8 @@ template <typename T, class BackendT = MpiBackend> class distributed_vector {
     segment_size_ = gran * std::max({(size / gran + comm_size - 1) / comm_size,
                                      hb.prev / gran, hb.next / gran});
 
+    extended_local_data_distribution ext_dist(segment_size_, size_, hb);
+
     data_size_ = segment_size_ + hb.prev + hb.next;
 
     if (size_ > 0) {
@@ -303,7 +307,8 @@ template <typename T, class BackendT = MpiBackend> class distributed_vector {
     std::size_t segment_index = 0;
     for (std::size_t i = 0; i < size; i += segment_size_) {
       segments_.emplace_back(this, segment_index++,
-                             std::min(segment_size_, size - i), data_size_);
+                             std::min(segment_size_, size - i), data_size_,
+                             ext_dist);
     }
 
     fence();
diff --git a/include/dr/mp/containers/distribution.hpp b/include/dr/mp/containers/distribution.hpp
index 44f4f43eb4..6fa00145a1 100644
--- a/include/dr/mp/containers/distribution.hpp
+++ b/include/dr/mp/containers/distribution.hpp
@@ -22,7 +22,19 @@ struct distribution {
     return *this;
   }
 
-  auto halo() const { return halo_bounds_; }
+  auto halo() const {
+    halo_bounds halo_bounds_resized = halo_bounds_;
+    halo_bounds_resized.prev *= redundancy_;
+    halo_bounds_resized.next *= redundancy_;
+    return halo_bounds_resized;
+  }
+
+  distribution &redundancy(std::size_t redundancy) {
+    redundancy_ = redundancy;
+    return *this;
+  }
+
+  auto redundancy() const { return redundancy_; }
 
   distribution &periodic(bool periodic) {
     halo_bounds_.periodic = periodic;
@@ -40,7 +52,25 @@ struct distribution {
 
 private:
   halo_bounds halo_bounds_;
+  std::size_t redundancy_ = 1;
   std::size_t granularity_ = 1;
 };
 
+struct extended_local_data_distribution {
+  std::size_t begin;
+  std::size_t end;
+  std::size_t segment_size;
+
+  extended_local_data_distribution() = default;
+  extended_local_data_distribution(std::size_t segment_size, std::size_t size,
+                                   halo_bounds hb)
+      : segment_size(segment_size) {
+    if (default_comm().rank() * segment_size >= hb.prev)
+      begin = default_comm().rank() * segment_size - hb.prev;
+    else
+      begin = 0;
+    end = std::min((default_comm().rank() + 1) * segment_size + hb.next, size);
+  }
+};
+
 } // namespace dr::mp
diff --git a/include/dr/mp/containers/segment.hpp b/include/dr/mp/containers/segment.hpp
index 56724ac61b..69c0ace053 100644
--- a/include/dr/mp/containers/segment.hpp
+++ b/include/dr/mp/containers/segment.hpp
@@ -215,15 +215,23 @@ template <typename DV> class dv_segment {
 private:
   using iterator = dv_segment_iterator<DV>;
 
+  using stencil_index_type = dr::__detail::dr_extents<1>;
+
 public:
   using difference_type = std::ptrdiff_t;
   dv_segment() = default;
   dv_segment(DV *dv, std::size_t segment_index, std::size_t size,
-             std::size_t reserved) {
+             std::size_t reserved,
+             const extended_local_data_distribution &ext_dist) {
     dv_ = dv;
     segment_index_ = segment_index;
     size_ = size;
     reserved_ = reserved;
+    ext_dist_ = ext_dist;
+
+    begin_index_ = segment_index * ext_dist.segment_size;
+    end_index_ = begin_index_ + size_;
+
     assert(dv_ != nullptr);
   }
 
@@ -236,6 +244,24 @@ template <typename DV> class dv_segment {
   auto end() const { return begin() + size(); }
   auto reserved() const { return reserved_; }
 
+  [[nodiscard]] stencil_index_type
+  begin_stencil(stencil_index_type stencil) const {
+    return {std::min(std::max(begin_index_, ext_dist_.begin + stencil[0]),
+                     end_index_) -
+            begin_index_};
+  }
+  [[nodiscard]] stencil_index_type
+  end_stencil(stencil_index_type stencil) const {
+    return {std::max(std::min(end_index_, ext_dist_.end - stencil[0]),
+                     begin_index_) -
+            begin_index_};
+  }
+  [[nodiscard]] std::pair<stencil_index_type, stencil_index_type>
+  stencil(stencil_index_type begin, stencil_index_type end) const {
+    return {begin_stencil(begin), end_stencil(end)};
+  }
+  auto extents() const { return md::extents(reserved_); }
+
   auto operator[](difference_type n) const { return *(begin() + n); }
 
   bool is_local() const { return segment_index_ == default_comm().rank(); }
@@ -245,6 +271,10 @@ template <typename DV> class dv_segment {
   std::size_t segment_index_;
   std::size_t size_;
   std::size_t reserved_;
+
+  std::size_t begin_index_;
+  std::size_t end_index_;
+  extended_local_data_distribution ext_dist_;
 }; // dv_segment
 
 //
@@ -253,7 +283,7 @@ template <typename DV> class dv_segment {
 //
 template <typename DR>
 concept has_halo_method = dr::distributed_range<DR> && requires(DR &&dr) {
-  { rng::begin(dr::ranges::segments(dr)[0]).halo() };
+  {rng::begin(dr::ranges::segments(dr)[0]).halo()};
 };
 
 auto &halo(has_halo_method auto &&dr) {
diff --git a/include/dr/mp/halo.hpp b/include/dr/mp/halo.hpp
index 7f7b7dbdb1..3d76fdfe7d 100644
--- a/include/dr/mp/halo.hpp
+++ b/include/dr/mp/halo.hpp
@@ -4,430 +4,7 @@
 
 #pragma once
 
-#include <dr/mp/global.hpp>
-#include <dr/mp/sycl_support.hpp>
-
-namespace dr::mp {
-
-enum class halo_tag {
-  invalid,
-  forward,
-  reverse,
-  index,
-};
-
-template <typename Group> class halo_impl {
-  using T = typename Group::element_type;
-  using Memory = typename Group::memory_type;
-
-public:
-  using group_type = Group;
-
-  // Destructor frees buffer_, so cannot copy
-  halo_impl(const halo_impl &) = delete;
-  halo_impl operator=(const halo_impl &) = delete;
-
-  /// halo constructor
-  halo_impl(communicator comm, const std::vector<Group> &owned_groups,
-            const std::vector<Group> &halo_groups,
-            const Memory &memory = Memory())
-      : comm_(comm), halo_groups_(halo_groups), owned_groups_(owned_groups),
-        memory_(memory) {
-    DRLOG("Halo constructed with {}/{} owned/halo", rng::size(owned_groups),
-          rng::size(halo_groups));
-    buffer_size_ = 0;
-    std::size_t i = 0;
-    std::vector<std::size_t> buffer_index;
-    for (auto &g : owned_groups_) {
-      buffer_index.push_back(buffer_size_);
-      g.request_index = i++;
-      buffer_size_ += g.buffer_size();
-      map_.push_back(&g);
-    }
-    for (auto &g : halo_groups_) {
-      buffer_index.push_back(buffer_size_);
-      g.request_index = i++;
-      buffer_size_ += g.buffer_size();
-      map_.push_back(&g);
-    }
-    buffer_ = memory_.allocate(buffer_size_);
-    assert(buffer_ != nullptr);
-    i = 0;
-    for (auto &g : owned_groups_) {
-      g.buffer = &buffer_[buffer_index[i++]];
-    }
-    for (auto &g : halo_groups_) {
-      g.buffer = &buffer_[buffer_index[i++]];
-    }
-    requests_.resize(i);
-  }
-
-  /// Begin a halo exchange
-  void exchange_begin() {
-    DRLOG("Halo exchange receiving");
-    receive(halo_groups_);
-    DRLOG("Halo exchange sending");
-    send(owned_groups_);
-    DRLOG("Halo exchange begin finished");
-  }
-
-  /// Complete a halo exchange
-  void exchange_finalize() {
-    DRLOG("Halo exchange finalize started");
-    reduce_finalize();
-    DRLOG("Halo exchange finalize finished");
-  }
-
-  void exchange() {
-    exchange_begin();
-    exchange_finalize();
-  }
-
-  /// Begin a halo reduction
-  void reduce_begin() {
-    receive(owned_groups_);
-    send(halo_groups_);
-  }
-
-  /// Complete a halo reduction
-  void reduce_finalize(const auto &op) {
-    for (int pending = rng::size(requests_); pending > 0; pending--) {
-      int completed;
-      MPI_Waitany(rng::size(requests_), requests_.data(), &completed,
-                  MPI_STATUS_IGNORE);
-      DRLOG("reduce_finalize(op) waitany completed: {}", completed);
-      auto &g = *map_[completed];
-      if (g.receive && g.buffered) {
-        g.unpack(op);
-      }
-    }
-  }
-
-  /// Complete a halo reduction
-  void reduce_finalize() {
-    for (int pending = rng::size(requests_); pending > 0; pending--) {
-      int completed;
-      MPI_Waitany(rng::size(requests_), requests_.data(), &completed,
-                  MPI_STATUS_IGNORE);
-      DRLOG("reduce_finalize() waitany completed: {}", completed);
-      auto &g = *map_[completed];
-      if (g.receive && g.buffered) {
-        g.unpack();
-      }
-    }
-  }
-
-  struct second_op {
-    T operator()(T &a, T &b) const { return b; }
-  } second;
-
-  struct plus_op {
-    T operator()(T &a, T &b) const { return a + b; }
-  } plus;
-
-  struct max_op {
-    T operator()(T &a, T &b) const { return std::max(a, b); }
-  } max;
-
-  struct min_op {
-    T operator()(T &a, T &b) const { return std::min(a, b); }
-  } min;
-
-  struct multiplies_op {
-    T operator()(T &a, T &b) const { return a * b; }
-  } multiplies;
-
-  ~halo_impl() {
-    if (buffer_) {
-      memory_.deallocate(buffer_, buffer_size_);
-      buffer_ = nullptr;
-    }
-  }
-
-private:
-  void send(std::vector<Group> &sends) {
-    for (auto &g : sends) {
-      g.pack();
-      g.receive = false;
-      DRLOG("sending: {}", g.request_index);
-      comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(),
-                  &requests_[g.request_index]);
-    }
-  }
-
-  void receive(std::vector<Group> &receives) {
-    for (auto &g : receives) {
-      g.receive = true;
-      DRLOG("receiving: {}", g.request_index);
-      comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(),
-                  &requests_[g.request_index]);
-    }
-  }
-
-  communicator comm_;
-  std::vector<Group> halo_groups_, owned_groups_;
-  T *buffer_ = nullptr;
-  std::size_t buffer_size_;
-  std::vector<MPI_Request> requests_;
-  std::vector<Group *> map_;
-  Memory memory_;
-};
-
-template <typename T, typename Memory = default_memory<T>> class index_group {
-public:
-  using element_type = T;
-  using memory_type = Memory;
-  T *buffer = nullptr;
-  std::size_t request_index;
-  bool receive;
-  bool buffered;
-
-  /// Constructor
-  index_group(T *data, std::size_t rank,
-              const std::vector<std::size_t> &indices, const Memory &memory)
-      : memory_(memory), data_(data), rank_(rank) {
-    buffered = false;
-    for (std::size_t i = 0; i < rng::size(indices) - 1; i++) {
-      buffered = buffered || (indices[i + 1] - indices[i] != 1);
-    }
-    indices_size_ = rng::size(indices);
-    indices_ = memory_.template allocate<std::size_t>(indices_size_);
-    assert(indices_ != nullptr);
-    memory_.memcpy(indices_, indices.data(),
-                   indices_size_ * sizeof(std::size_t));
-  }
-
-  index_group(const index_group &o)
-      : buffer(o.buffer), request_index(o.request_index), receive(o.receive),
-        buffered(o.buffered), memory_(o.memory_), data_(o.data_),
-        rank_(o.rank_), indices_size_(o.indices_size_), tag_(o.tag_) {
-    indices_ = memory_.template allocate<std::size_t>(indices_size_);
-    assert(indices_ != nullptr);
-    memory_.memcpy(indices_, o.indices_, indices_size_ * sizeof(std::size_t));
-  }
-
-  void unpack(const auto &op) {
-    T *dpt = data_;
-    auto n = indices_size_;
-    auto *ipt = indices_;
-    auto *b = buffer;
-    memory_.offload([=]() {
-      for (std::size_t i = 0; i < n; i++) {
-        dpt[ipt[i]] = op(dpt[ipt[i]], b[i]);
-      }
-    });
-  }
-
-  void pack() {
-    T *dpt = data_;
-    auto n = indices_size_;
-    auto *ipt = indices_;
-    auto *b = buffer;
-    memory_.offload([=]() {
-      for (std::size_t i = 0; i < n; i++) {
-        b[i] = dpt[ipt[i]];
-      }
-    });
-  }
-
-  std::size_t buffer_size() {
-    if (buffered) {
-      return indices_size_;
-    }
-    return 0;
-  }
-
-  T *data_pointer() {
-    if (buffered) {
-      return buffer;
-    } else {
-      return &data_[indices_[0]];
-    }
-  }
-
-  std::size_t data_size() { return indices_size_; }
-
-  std::size_t rank() { return rank_; }
-  auto tag() { return tag_; }
-
-  ~index_group() {
-    if (indices_) {
-      memory_.template deallocate<std::size_t>(indices_, indices_size_);
-      indices_ = nullptr;
-    }
-  }
-
-private:
-  Memory memory_;
-  T *data_ = nullptr;
-  std::size_t rank_;
-  std::size_t indices_size_;
-  std::size_t *indices_;
-  halo_tag tag_ = halo_tag::index;
-};
-
-template <typename T, typename Memory>
-using unstructured_halo_impl = halo_impl<index_group<T, Memory>>;
-
-template <typename T, typename Memory = default_memory<T>>
-class unstructured_halo : public unstructured_halo_impl<T, Memory> {
-public:
-  using group_type = index_group<T, Memory>;
-  using index_map = std::pair<std::size_t, std::vector<std::size_t>>;
-
-  ///
-  /// Constructor
-  ///
-  unstructured_halo(communicator comm, T *data,
-                    const std::vector<index_map> &owned,
-                    const std::vector<index_map> &halo,
-                    const Memory &memory = Memory())
-      : unstructured_halo_impl<T, Memory>(
-            comm, make_groups(comm, data, owned, memory),
-            make_groups(comm, data, halo, memory), memory) {}
-
-private:
-  static std::vector<group_type> make_groups(communicator comm, T *data,
-                                             const std::vector<index_map> &map,
-                                             const Memory &memory) {
-    std::vector<group_type> groups;
-    for (auto const &[rank, indices] : map) {
-      groups.emplace_back(data, rank, indices, memory);
-    }
-    return groups;
-  }
-};
-
-template <typename T, typename Memory = default_memory<T>> class span_group {
-public:
-  using element_type = T;
-  using memory_type = Memory;
-  T *buffer = nullptr;
-  std::size_t request_index = 0;
-  bool receive = false;
-  bool buffered = false;
-
-  span_group(std::span<T> data, std::size_t rank, halo_tag tag)
-      : data_(data), rank_(rank), tag_(tag) {
-#ifdef SYCL_LANGUAGE_VERSION
-    if (use_sycl() && sycl_mem_kind() == sycl::usm::alloc::shared) {
-      buffered = true;
-    }
-#endif
-  }
-
-  void unpack() {
-    if (buffered) {
-      if (mp::use_sycl()) {
-        __detail::sycl_copy(buffer, buffer + rng::size(data_), data_.data());
-      } else {
-        std::copy(buffer, buffer + rng::size(data_), data_.data());
-      }
-    }
-  }
-
-  void pack() {
-    if (buffered) {
-      if (mp::use_sycl()) {
-        __detail::sycl_copy(data_.data(), data_.data() + rng::size(data_),
-                            buffer);
-      } else {
-        std::copy(data_.begin(), data_.end(), buffer);
-      }
-    }
-  }
-  std::size_t buffer_size() { return rng::size(data_); }
-
-  std::size_t data_size() { return rng::size(data_); }
-
-  T *data_pointer() {
-    if (buffered) {
-      return buffer;
-    } else {
-      return data_.data();
-    }
-  }
-
-  std::size_t rank() { return rank_; }
-
-  auto tag() { return tag_; }
-
-private:
-  Memory memory_;
-  std::span<T> data_;
-  std::size_t rank_;
-  halo_tag tag_ = halo_tag::invalid;
-};
-
-struct halo_bounds {
-  std::size_t prev = 0, next = 0;
-  bool periodic = false;
-};
-
-template <typename T, typename Memory>
-using span_halo_impl = halo_impl<span_group<T, Memory>>;
-
-template <typename T, typename Memory = default_memory<T>>
-class span_halo : public span_halo_impl<T, Memory> {
-public:
-  using group_type = span_group<T, Memory>;
-
-  span_halo() : span_halo_impl<T, Memory>(communicator(), {}, {}) {}
-
-  span_halo(communicator comm, T *data, std::size_t size, halo_bounds hb)
-      : span_halo_impl<T, Memory>(comm, owned_groups(comm, {data, size}, hb),
-                                  halo_groups(comm, {data, size}, hb)) {
-    check(size, hb);
-  }
-
-  span_halo(communicator comm, std::span<T> span, halo_bounds hb)
-      : span_halo_impl<T, Memory>(comm, owned_groups(comm, span, hb),
-                                  halo_groups(comm, span, hb)) {}
-
-private:
-  void check(auto size, auto hb) {
-    assert(size >= hb.prev + hb.next + std::max(hb.prev, hb.next));
-  }
-
-  static std::vector<group_type>
-  owned_groups(communicator comm, std::span<T> span, halo_bounds hb) {
-    std::vector<group_type> owned;
-    DRLOG("owned groups {}/{} first/last", comm.first(), comm.last());
-    if (hb.next > 0 && (hb.periodic || !comm.first())) {
-      owned.emplace_back(span.subspan(hb.prev, hb.next), comm.prev(),
-                         halo_tag::reverse);
-    }
-    if (hb.prev > 0 && (hb.periodic || !comm.last())) {
-      owned.emplace_back(
-          span.subspan(rng::size(span) - (hb.prev + hb.next), hb.prev),
-          comm.next(), halo_tag::forward);
-    }
-    return owned;
-  }
-
-  static std::vector<group_type>
-  halo_groups(communicator comm, std::span<T> span, halo_bounds hb) {
-    std::vector<group_type> halo;
-    if (hb.prev > 0 && (hb.periodic || !comm.first())) {
-      halo.emplace_back(span.first(hb.prev), comm.prev(), halo_tag::forward);
-    }
-    if (hb.next > 0 && (hb.periodic || !comm.last())) {
-      halo.emplace_back(span.last(hb.next), comm.next(), halo_tag::reverse);
-    }
-    return halo;
-  }
-};
-
-} // namespace dr::mp
-
-#ifdef DR_FORMAT
-
-template <>
-struct fmt::formatter<dr::mp::halo_bounds> : formatter<string_view> {
-  template <typename FmtContext>
-  auto format(dr::mp::halo_bounds hb, FmtContext &ctx) {
-    return fmt::format_to(ctx.out(), "prev: {} next: {}", hb.prev, hb.next);
-  }
-};
-
-#endif
+#include <dr/mp/halo/format.hpp>
+#include <dr/mp/halo/group.hpp>
+#include <dr/mp/halo/halo.hpp>
+#include <dr/mp/halo/instance.hpp>
diff --git a/include/dr/mp/halo/format.hpp b/include/dr/mp/halo/format.hpp
new file mode 100644
index 0000000000..2b602c5db6
--- /dev/null
+++ b/include/dr/mp/halo/format.hpp
@@ -0,0 +1,20 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <dr/mp/global.hpp>
+#include <dr/mp/halo/halo.hpp>
+
+#ifdef DR_FORMAT
+
+template <>
+struct fmt::formatter<dr::mp::halo_bounds> : formatter<string_view> {
+  template <typename FmtContext>
+  auto format(dr::mp::halo_bounds hb, FmtContext &ctx) {
+    return fmt::format_to(ctx.out(), "prev: {} next: {}", hb.prev, hb.next);
+  }
+};
+
+#endif
diff --git a/include/dr/mp/halo/group.hpp b/include/dr/mp/halo/group.hpp
new file mode 100644
index 0000000000..bcf24727ad
--- /dev/null
+++ b/include/dr/mp/halo/group.hpp
@@ -0,0 +1,160 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <dr/mp/global.hpp>
+#include <dr/mp/sycl_support.hpp>
+
+namespace dr::mp {
+
+
+  template <typename T, typename Memory = default_memory<T>>
+  class index_group {
+  public:
+    using element_type = T;
+    using memory_type = Memory;
+    T *buffer = nullptr;
+    std::size_t request_index;
+    bool receive;
+    bool buffered;
+
+    /// Constructor
+    index_group(T *data, std::size_t rank,
+                const std::vector<std::size_t> &indices, const Memory &memory)
+        : memory_(memory), data_(data), rank_(rank) {
+      buffered = false;
+      for (std::size_t i = 0; i < rng::size(indices) - 1; i++) {
+        buffered = buffered || (indices[i + 1] - indices[i] != 1);
+      }
+      indices_size_ = rng::size(indices);
+      indices_ = memory_.template allocate<std::size_t>(indices_size_);
+      assert(indices_ != nullptr);
+      memory_.memcpy(indices_, indices.data(),
+                     indices_size_ * sizeof(std::size_t));
+    }
+
+    index_group(const index_group &o)
+        : buffer(o.buffer), request_index(o.request_index), receive(o.receive),
+          buffered(o.buffered), memory_(o.memory_), data_(o.data_),
+          rank_(o.rank_), indices_size_(o.indices_size_), tag_(o.tag_) {
+      indices_ = memory_.template allocate<std::size_t>(indices_size_);
+      assert(indices_ != nullptr);
+      memory_.memcpy(indices_, o.indices_, indices_size_ * sizeof(std::size_t));
+    }
+
+    void unpack(const auto &op) {
+      T *dpt = data_;
+      auto n = indices_size_;
+      auto *ipt = indices_;
+      auto *b = buffer;
+      memory_.offload([=]() {
+        for (std::size_t i = 0; i < n; i++) {
+          dpt[ipt[i]] = op(dpt[ipt[i]], b[i]);
+        }
+      });
+    }
+
+    void pack() {
+      T *dpt = data_;
+      auto n = indices_size_;
+      auto *ipt = indices_;
+      auto *b = buffer;
+      memory_.offload([=]() {
+        for (std::size_t i = 0; i < n; i++) {
+          b[i] = dpt[ipt[i]];
+        }
+      });
+    }
+
+    std::size_t buffer_size() {
+      if (buffered) {
+        return indices_size_;
+      }
+      return 0;
+    }
+
+    T *data_pointer() {
+      if (buffered) {
+        return buffer;
+      } else {
+        return &data_[indices_[0]];
+      }
+    }
+
+    std::size_t data_size() { return indices_size_; }
+
+    std::size_t rank() { return rank_; }
+    auto tag() { return tag_; }
+
+    ~index_group() {
+      if (indices_) {
+        memory_.template deallocate<std::size_t>(indices_, indices_size_);
+        indices_ = nullptr;
+      }
+    }
+
+  private:
+    Memory memory_;
+    T *data_ = nullptr;
+    std::size_t rank_;
+    std::size_t indices_size_;
+    std::size_t *indices_;
+    halo_tag tag_ = halo_tag::index;
+  };
+
+  template <typename T, typename Memory = default_memory<T>> class span_group {
+  public:
+    using element_type = T;
+    using memory_type = Memory;
+    T *buffer = nullptr;
+    std::size_t request_index = 0;
+    bool receive = false;
+    bool buffered = false;
+
+    span_group(std::span<T> data, std::size_t rank, halo_tag tag)
+        : data_(data), rank_(rank), tag_(tag) {
+#ifdef SYCL_LANGUAGE_VERSION
+      if (mp::use_sycl() && mp::sycl_mem_kind() == sycl::usm::alloc::shared) {
+        buffered = true;
+      }
+#endif
+    }
+
+    /// If span is buffered, push buffer to data
+    void unpack() {
+      if (buffered) {
+        __detail::sycl_copy(buffer, buffer + rng::size(data_), data_.data());
+      }
+    }
+
+    /// If span is buffered, pull data into buffer
+    void pack() {
+      if (buffered) {
+        __detail::sycl_copy(data_.data(), data_.data() + rng::size(data_), buffer);
+      }
+    }
+
+    std::size_t buffer_size() { return rng::size(data_); }
+
+    std::size_t data_size() { return rng::size(data_); }
+
+    T *data_pointer() {
+      if (buffered) {
+        return buffer;
+      } else {
+        return data_.data();
+      }
+    }
+
+    std::size_t rank() { return rank_; }
+
+    auto tag() { return tag_; }
+
+  private:
+    std::span<T> data_;
+    std::size_t rank_;
+    halo_tag tag_ = halo_tag::invalid;
+  };
+}
\ No newline at end of file
diff --git a/include/dr/mp/halo/halo.hpp b/include/dr/mp/halo/halo.hpp
new file mode 100644
index 0000000000..a8d8b92419
--- /dev/null
+++ b/include/dr/mp/halo/halo.hpp
@@ -0,0 +1,196 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <dr/mp/global.hpp>
+#include <dr/mp/sycl_support.hpp>
+
+namespace dr::mp {
+
+enum class halo_tag {
+  invalid,
+  forward,
+  reverse,
+  index,
+};
+
+struct halo_bounds {
+  // How many values before and after the data segment are in halo
+  std::size_t prev = 0, next = 0;
+  bool periodic = false;
+};
+
+template <typename Group> class halo_impl {
+  using T = typename Group::element_type;
+  using Memory = typename Group::memory_type;
+
+public:
+  using group_type = Group;
+
+  // Destructor frees buffer_, so cannot copy
+  halo_impl(const halo_impl &) = delete;
+
+  halo_impl operator=(const halo_impl &) = delete;
+
+  /// halo constructor
+  halo_impl(communicator comm, const std::vector<Group> &owned_groups,
+            const std::vector<Group> &halo_groups,
+            const Memory &memory = Memory())
+      : comm_(comm), halo_groups_(halo_groups), owned_groups_(owned_groups),
+        memory_(memory) {
+    DRLOG("Halo constructed with {}/{} owned/halo", rng::size(owned_groups),
+          rng::size(halo_groups));
+    buffer_size_ = 0;
+    std::size_t i = 0;
+    std::vector<std::size_t> buffer_index;
+    for (auto &g : owned_groups_) {
+      buffer_index.push_back(buffer_size_);
+      g.request_index = i++;
+      buffer_size_ += g.buffer_size();
+      map_.push_back(&g);
+    }
+    for (auto &g : halo_groups_) {
+      buffer_index.push_back(buffer_size_);
+      g.request_index = i++;
+      buffer_size_ += g.buffer_size();
+      map_.push_back(&g);
+    }
+    buffer_ = memory_.allocate(buffer_size_);
+    assert(buffer_ != nullptr);
+    i = 0;
+    for (auto &g : owned_groups_) {
+      g.buffer = &buffer_[buffer_index[i++]];
+    }
+    for (auto &g : halo_groups_) {
+      g.buffer = &buffer_[buffer_index[i++]];
+    }
+    requests_.resize(i);
+  }
+
+  /// Begin a halo exchange
+  void exchange_begin() {
+    DRLOG("Halo exchange receiving");
+    receive(halo_groups_);
+    DRLOG("Halo exchange sending");
+    send(owned_groups_);
+    DRLOG("Halo exchange begin finished");
+  }
+
+  /// Complete a halo exchange
+  void exchange_finalize() {
+    DRLOG("Halo exchange finalize started");
+    reduce_finalize();
+    DRLOG("Halo exchange finalize finished");
+  }
+
+  void exchange() {
+    exchange_begin();
+    exchange_finalize();
+  }
+
+  /// Begin a halo reduction
+  void reduce_begin() {
+    receive(owned_groups_);
+    send(halo_groups_);
+  }
+
+  /// Complete a halo reduction
+  void reduce_finalize(const auto &op) {
+    for (int pending = rng::size(requests_); pending > 0; pending--) {
+      int completed;
+      MPI_Waitany(rng::size(requests_), requests_.data(), &completed,
+                  MPI_STATUS_IGNORE);
+      DRLOG("reduce_finalize(op) waitany completed: {}", completed);
+      auto &g = *map_[completed];
+      if (g.receive && g.buffered) {
+        g.unpack(op);
+      }
+    }
+  }
+
+  /// Complete a halo reduction
+  void reduce_finalize() {
+    for (int pending = rng::size(requests_); pending > 0; pending--) {
+      int completed;
+      MPI_Waitany(rng::size(requests_), requests_.data(), &completed,
+                  MPI_STATUS_IGNORE);
+      DRLOG("reduce_finalize() waitany completed: {}", completed);
+      auto &g = *map_[completed];
+      if (g.receive && g.buffered) {
+        g.unpack();
+      }
+    }
+  }
+
+  struct second_op {
+    T operator()(T &a, T &b) const { return b; }
+  } second;
+
+  struct plus_op {
+    T operator()(T &a, T &b) const { return a + b; }
+  } plus;
+
+  struct max_op {
+    T operator()(T &a, T &b) const { return std::max(a, b); }
+  } max;
+
+  struct min_op {
+    T operator()(T &a, T &b) const { return std::min(a, b); }
+  } min;
+
+  struct multiplies_op {
+    T operator()(T &a, T &b) const { return a * b; }
+  } multiplies;
+
+  ~halo_impl() {
+    if (buffer_) {
+      memory_.deallocate(buffer_, buffer_size_);
+      buffer_ = nullptr;
+    }
+  }
+
+private:
+  void send(std::vector<Group> &sends) {
+    for (auto &g : sends) {
+      g.pack();
+      g.receive = false;
+      DRLOG("sending: {}", g.request_index);
+      //        std::cout << "send(" << g.data_pointer() << ", " <<
+      //        g.data_size() << ", " << g.rank() << ", <tag>, " <<
+      //        &requests_[g.request_index] << ")\n";
+      comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(),
+                  &requests_[g.request_index]);
+    }
+  }
+
+  void receive(std::vector<Group> &receives) {
+    for (auto &g : receives) {
+      g.receive = true;
+      DRLOG("receiving: {}", g.request_index);
+      //        std::cout << "recv(" << g.data_pointer() << ", " <<
+      //        g.data_size() << ", " << g.rank() << ", <tag>, " <<
+      //        &requests_[g.request_index] << ")\n";
+      comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(),
+                  &requests_[g.request_index]);
+    }
+  }
+
+  communicator comm_;
+  std::vector<Group> halo_groups_, owned_groups_;
+  T *buffer_ = nullptr;
+  std::size_t buffer_size_;
+  std::vector<MPI_Request> requests_;
+  std::vector<Group *> map_;
+  Memory memory_;
+};
+
+template <typename T, typename... Ts>
+void halo_exchange(auto &&f, T &dv, Ts &...dvs) {
+  for (std::size_t step = 0; step < dv.dist().redundancy(); step++) {
+    f(dv, dvs...);
+  }
+  halo(dv).exchange();
+}
+} // namespace dr::mp
diff --git a/include/dr/mp/halo/instance.hpp b/include/dr/mp/halo/instance.hpp
new file mode 100644
index 0000000000..04be307690
--- /dev/null
+++ b/include/dr/mp/halo/instance.hpp
@@ -0,0 +1,98 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <dr/mp/global.hpp>
+#include <dr/mp/halo/group.hpp>
+#include <dr/mp/halo/halo.hpp>
+#include <dr/mp/sycl_support.hpp>
+
+namespace dr::mp {
+template <typename T, typename Memory>
+using unstructured_halo_impl = halo_impl<index_group<T, Memory>>;
+
+template <typename T, typename Memory = default_memory<T>>
+class unstructured_halo : public unstructured_halo_impl<T, Memory> {
+public:
+  using group_type = index_group<T, Memory>;
+  using index_map = std::pair<std::size_t, std::vector<std::size_t>>;
+
+  ///
+  /// Constructor
+  ///
+  unstructured_halo(communicator comm, T *data,
+                    const std::vector<index_map> &owned,
+                    const std::vector<index_map> &halo,
+                    const Memory &memory = Memory())
+      : unstructured_halo_impl<T, Memory>(
+            comm, make_groups(comm, data, owned, memory),
+            make_groups(comm, data, halo, memory), memory) {}
+
+private:
+  static std::vector<group_type> make_groups(communicator comm, T *data,
+                                             const std::vector<index_map> &map,
+                                             const Memory &memory) {
+    std::vector<group_type> groups;
+    for (auto const &[rank, indices] : map) {
+      groups.emplace_back(data, rank, indices, memory);
+    }
+    return groups;
+  }
+};
+
+template <typename T, typename Memory>
+using span_halo_impl = halo_impl<span_group<T, Memory>>;
+
+template <typename T, typename Memory = default_memory<T>>
+class span_halo : public span_halo_impl<T, Memory> {
+public:
+  using group_type = span_group<T, Memory>;
+
+  span_halo() : span_halo_impl<T, Memory>(communicator(), {}, {}) {}
+
+  span_halo(communicator comm, T *data, std::size_t size, halo_bounds hb)
+      : span_halo_impl<T, Memory>(comm, owned_groups(comm, {data, size}, hb),
+                                  halo_groups(comm, {data, size}, hb)) {
+    check(size, hb);
+  }
+
+  span_halo(communicator comm, std::span<T> span, halo_bounds hb)
+      : span_halo_impl<T, Memory>(comm, owned_groups(comm, span, hb),
+                                  halo_groups(comm, span, hb)) {}
+
+private:
+  void check(auto size, auto hb) {
+    assert(size >= hb.prev + hb.next + std::max(hb.prev, hb.next));
+  }
+
+  static std::vector<group_type>
+  owned_groups(communicator comm, std::span<T> span, halo_bounds hb) {
+    std::vector<group_type> owned;
+    DRLOG("owned groups {}/{} first/last", comm.first(), comm.last());
+    if (hb.next > 0 && (hb.periodic || !comm.first())) {
+      owned.emplace_back(span.subspan(hb.prev, hb.next), comm.prev(),
+                         halo_tag::reverse);
+    }
+    if (hb.prev > 0 && (hb.periodic || !comm.last())) {
+      owned.emplace_back(
+          span.subspan(rng::size(span) - (hb.prev + hb.next), hb.prev),
+          comm.next(), halo_tag::forward);
+    }
+    return owned;
+  }
+
+  static std::vector<group_type>
+  halo_groups(communicator comm, std::span<T> span, halo_bounds hb) {
+    std::vector<group_type> halo;
+    if (hb.prev > 0 && (hb.periodic || !comm.first())) {
+      halo.emplace_back(span.first(hb.prev), comm.prev(), halo_tag::forward);
+    }
+    if (hb.next > 0 && (hb.periodic || !comm.last())) {
+      halo.emplace_back(span.last(hb.next), comm.next(), halo_tag::reverse);
+    }
+    return halo;
+  }
+};
+} // namespace dr::mp
diff --git a/include/dr/mp/views/mdspan_view.hpp b/include/dr/mp/views/mdspan_view.hpp
index 1a67f8d78f..44da9f9bbe 100644
--- a/include/dr/mp/views/mdspan_view.hpp
+++ b/include/dr/mp/views/mdspan_view.hpp
@@ -25,16 +25,24 @@ namespace dr::mp::__detail {
 template <typename BaseSegment, std::size_t Rank>
 class md_segment : public rng::view_interface<md_segment<BaseSegment, Rank>> {
 private:
+  using stencil_index_type = dr::__detail::dr_extents<Rank>;
+
 public:
   using index_type = dr::__detail::dr_extents<Rank>;
 
   md_segment() {}
-  md_segment(index_type origin, BaseSegment segment, index_type tile_shape)
+  md_segment(index_type origin, BaseSegment segment, index_type tile_shape,
+             extended_local_data_distribution ext_dist)
       : base_(segment), origin_(origin),
-        mdspan_(local_tile(segment, tile_shape)) {
+        mdspan_(local_tile(segment, tile_shape)),
+        mdspan_extended_(local_tile_extended(segment, tile_shape)),
+        ext_dist_(ext_dist) {
     dr::drlog.debug(dr::logger::mdspan_view,
                     "md_segment\n  origin: {}\n  tile shape: {}\n", origin,
                     tile_shape);
+
+    for (std::size_t i = 0; i < Rank; i++)
+      end_[i] = origin_[i] + tile_shape[i];
   }
 
   // view_interface uses below to define everything else
@@ -45,8 +53,39 @@ class md_segment : public rng::view_interface<md_segment<BaseSegment, Rank>> {
 
   auto halo() const { return dr::mp::halo(base_); }
 
+  [[nodiscard]] stencil_index_type
+  begin_stencil(stencil_index_type stencil) const {
+    stencil_index_type out;
+    // Supports only 1d distribution
+    for (std::size_t i = 0; i < Rank; i++) {
+      out[i] = std::min(std::max(origin_[i],
+                                 (i == 0 ? ext_dist_.begin : origin_[i]) +
+                                     stencil[i]),
+                        end_[i]) -
+               origin_[i];
+    }
+    return out;
+  }
+  [[nodiscard]] stencil_index_type
+  end_stencil(stencil_index_type stencil) const {
+    stencil_index_type out;
+    // Supports only 1d distribution
+    for (std::size_t i = 0; i < Rank; i++) {
+      out[i] = std::max(std::min(end_[i], (i == 0 ? ext_dist_.end : end_[i]) -
+                                              stencil[i]),
+                        origin_[i]) -
+               origin_[i];
+    }
+    return out;
+  }
+  [[nodiscard]] std::pair<stencil_index_type, stencil_index_type>
+  stencil(stencil_index_type begin, stencil_index_type end) const {
+    return {begin_stencil(begin), end_stencil(end)};
+  }
+
   // mdspan-specific methods
   auto mdspan() const { return mdspan_; }
+  auto mdspan_extended() const { return mdspan_extended_; }
   auto origin() const { return origin_; }
   // for slices, this would be the underlying mdspan
   auto root_mdspan() const { return mdspan(); }
@@ -62,9 +101,20 @@ class md_segment : public rng::view_interface<md_segment<BaseSegment, Rank>> {
     return md::mdspan(ptr, tile_shape);
   }
 
+  static auto local_tile_extended(BaseSegment segment,
+                                  const index_type &tile_shape) {
+    T *ptr = std::to_address(dr::ranges::local(rng::begin(segment)));
+    return md::mdspan(ptr, tile_shape);
+  }
+
   BaseSegment base_;
   index_type origin_;
+  index_type end_;
   md::mdspan<T, dr::__detail::md_extents<Rank>, md::layout_stride> mdspan_;
+  md::mdspan<T, dr::__detail::md_extents<Rank>, md::layout_stride>
+      mdspan_extended_;
+
+  extended_local_data_distribution ext_dist_;
 };
 
 } // namespace dr::mp::__detail
@@ -107,7 +157,11 @@ struct mdspan_view : public rng::view_interface<mdspan_view<R, Rank>> {
     return origin;
   }
 
-  static auto make_segments(auto base, auto full_shape, auto tile_shape) {
+  static auto make_segments(auto base, auto full_shape, auto tile_shape,
+                            auto dist) {
+    extended_local_data_distribution ext_dist(tile_shape[0], full_shape[0],
+                                              dist.halo());
+
     auto make_md = [=](auto v) {
       auto clipped = tile_shape;
       std::size_t segment_index = std::get<0>(v);
@@ -117,7 +171,7 @@ struct mdspan_view : public rng::view_interface<mdspan_view<R, Rank>> {
       }
       return __detail::md_segment(
           segment_index_to_global_origin(segment_index, full_shape, tile_shape),
-          std::get<1>(v), clipped);
+          std::get<1>(v), clipped, ext_dist);
     };
 
     dr::drlog.debug(dr::logger::mdspan_view,
@@ -127,11 +181,12 @@ struct mdspan_view : public rng::view_interface<mdspan_view<R, Rank>> {
     return dr::__detail::bounded_enumerate(dr::ranges::segments(base)) |
            rng::views::transform(make_md);
   }
-  using segments_type = decltype(make_segments(std::declval<base_type>(),
-                                               full_shape_, tile_shape_));
+  using segments_type =
+      decltype(make_segments(std::declval<base_type>(), full_shape_,
+                             tile_shape_, std::declval<distribution>()));
 
 public:
-  mdspan_view(R r, dr::__detail::dr_extents<Rank> full_shape)
+  mdspan_view(R r, dr::__detail::dr_extents<Rank> full_shape, distribution dist)
       : base_(rng::views::all(std::forward<R>(r))) {
     full_shape_ = full_shape;
 
@@ -140,15 +195,15 @@ struct mdspan_view : public rng::view_interface<mdspan_view<R, Rank>> {
     tile_shape_[0] = decomp::div;
 
     replace_decomp();
-    segments_ = make_segments(base_, full_shape_, tile_shape_);
+    segments_ = make_segments(base_, full_shape_, tile_shape_, dist);
   }
 
   mdspan_view(R r, dr::__detail::dr_extents<Rank> full_shape,
-              dr::__detail::dr_extents<Rank> tile_shape)
+              dr::__detail::dr_extents<Rank> tile_shape, distribution dist)
       : base_(rng::views::all(std::forward<R>(r))), full_shape_(full_shape),
         tile_shape_(tile_shape) {
     replace_decomp();
-    segments_ = make_segments(base_, full_shape_, tile_shape_);
+    segments_ = make_segments(base_, full_shape_, tile_shape_, dist);
   }
 
   // Base implements random access range
@@ -194,17 +249,18 @@ struct mdspan_view : public rng::view_interface<mdspan_view<R, Rank>> {
 };
 
 template <typename R, std::size_t Rank>
-mdspan_view(R &&r, dr::__detail::dr_extents<Rank> extents)
+mdspan_view(R &&r, dr::__detail::dr_extents<Rank> extents, distribution dist)
     -> mdspan_view<rng::views::all_t<R>, Rank>;
 
 template <typename R, std::size_t Rank>
 mdspan_view(R &&r, dr::__detail::dr_extents<Rank> full_shape,
-            dr::__detail::dr_extents<Rank> tile_shape)
+            dr::__detail::dr_extents<Rank> tile_shape, distribution dist)
     -> mdspan_view<rng::views::all_t<R>, Rank>;
 
 template <typename R>
-concept is_mdspan_view =
-    dr::distributed_range<R> && requires(R &r) { r.mdspan(); };
+concept is_mdspan_view = dr::distributed_range<R> && requires(R &r) {
+  r.mdspan();
+};
 
 } // namespace dr::mp
 
@@ -213,17 +269,20 @@ namespace dr::mp::views {
 template <std::size_t Rank> class mdspan_adapter_closure {
 public:
   mdspan_adapter_closure(dr::__detail::dr_extents<Rank> full_shape,
-                         dr::__detail::dr_extents<Rank> tile_shape)
-      : full_shape_(full_shape), tile_shape_(tile_shape), tile_valid_(true) {}
+                         dr::__detail::dr_extents<Rank> tile_shape,
+                         distribution dist)
+      : full_shape_(full_shape), tile_shape_(tile_shape), tile_valid_(true),
+        dist_(dist) {}
 
-  mdspan_adapter_closure(dr::__detail::dr_extents<Rank> full_shape)
-      : full_shape_(full_shape) {}
+  mdspan_adapter_closure(dr::__detail::dr_extents<Rank> full_shape,
+                         distribution dist)
+      : full_shape_(full_shape), dist_(dist) {}
 
   template <rng::viewable_range R> auto operator()(R &&r) const {
     if (tile_valid_) {
-      return mdspan_view(std::forward<R>(r), full_shape_, tile_shape_);
+      return mdspan_view(std::forward<R>(r), full_shape_, tile_shape_, dist_);
     } else {
-      return mdspan_view(std::forward<R>(r), full_shape_);
+      return mdspan_view(std::forward<R>(r), full_shape_, dist_);
     }
   }
 
@@ -236,31 +295,35 @@ template <std::size_t Rank> class mdspan_adapter_closure {
   dr::__detail::dr_extents<Rank> full_shape_;
   dr::__detail::dr_extents<Rank> tile_shape_;
   bool tile_valid_ = false;
+  distribution dist_;
 };
 
 class mdspan_fn_ {
 public:
   template <rng::viewable_range R, typename Shape>
-  auto operator()(R &&r, Shape &&full_shape, Shape &&tile_shape) const {
+  auto operator()(R &&r, Shape &&full_shape, Shape &&tile_shape,
+                  distribution dist) const {
     return mdspan_adapter_closure(std::forward<Shape>(full_shape),
-                                  std::forward<Shape>(tile_shape))(
-        std::forward<R>(r));
+                                  std::forward<Shape>(tile_shape),
+                                  dist)(std::forward<R>(r));
   }
 
   template <rng::viewable_range R, typename Shape>
-  auto operator()(R &&r, Shape &&full_shape) const {
-    return mdspan_adapter_closure(std::forward<Shape>(full_shape))(
-        std::forward<R>(r));
+  auto operator()(R &&r, Shape &&full_shape, distribution dist) const {
+    return mdspan_adapter_closure(std::forward<Shape>(full_shape),
+                                  dist)(std::forward<R>(r));
   }
 
   template <typename Shape>
-  auto operator()(Shape &&full_shape, Shape &&tile_shape) const {
+  auto operator()(Shape &&full_shape, Shape &&tile_shape,
+                  distribution dist) const {
     return mdspan_adapter_closure(std::forward<Shape>(full_shape),
-                                  std::forward<Shape>(tile_shape));
+                                  std::forward<Shape>(tile_shape), dist);
   }
 
-  template <typename Shape> auto operator()(Shape &&full_shape) const {
-    return mdspan_adapter_closure(std::forward<Shape>(full_shape));
+  template <typename Shape>
+  auto operator()(Shape &&full_shape, distribution dist) const {
+    return mdspan_adapter_closure(std::forward<Shape>(full_shape), dist);
   }
 };
 
diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt
index 32f26d120a..175915a87d 100644
--- a/test/gtest/mp/CMakeLists.txt
+++ b/test/gtest/mp/CMakeLists.txt
@@ -37,6 +37,7 @@ add_executable(
   copy.cpp
   distributed_vector.cpp
   halo.cpp
+  halo-2d.cpp
   mdstar.cpp
   mpsort.cpp
   reduce.cpp
@@ -46,38 +47,53 @@ add_executable(
   wave_kernel.cpp)
 
 add_executable(
-  mp-tests-3
-  mp-tests.cpp
-  communicator-3.cpp
-  halo-3.cpp
-  slide_view-3.cpp)
+        mp-tests-3
+        mp-tests.cpp
+        communicator-3.cpp
+        halo-3.cpp
+        slide_view-3.cpp
+        wide-halo-1d-3.cpp
+        wide-halo-2d-3.cpp
+)
 
-# mp-quick-test is for development. By reducing the number of source files, it
+# mp-quick-test and mp-quick-test-3-only is for development. By reducing the number of source files, it
 # builds much faster. Change the source files to match what you need to test. It
 # is OK to commit changes to the source file list.
-
 add_executable(mp-quick-test
   mp-tests.cpp
-  ../common/count.cpp
-  )
+  halo-2d.cpp
+)
+add_executable(mp-quick-test-3-only
+  mp-tests.cpp
+  wide-halo-2d-3.cpp
+)
 # cmake-format: on
 
 target_compile_definitions(mp-quick-test PRIVATE QUICK_TEST)
+target_compile_definitions(mp-quick-test-3-only PRIVATE QUICK_TEST)
 
-foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test)
+foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test mp-quick-test-3-only)
   if(ENABLE_ISHMEM)
     target_link_ishmem(${test-exec})
   endif()
   target_link_libraries(${test-exec} GTest::gtest_main cxxopts DR::mpi)
   set_property(TARGET ${test-exec} PROPERTY RULE_LAUNCH_COMPILE
-                                            "${CMAKE_COMMAND} -E time")
+          "${CMAKE_COMMAND} -E time")
 endforeach()
 
+# Game of life
+add_executable(game_of_life game_of_life.cpp)
+if(ENABLE_ISHMEM)
+  target_link_ishmem(game_of_life)
+endif()
+target_link_libraries(game_of_life cxxopts DR::mpi)
+
 # tests without --sycl flag will fail on IshmemBackend TODO: make them be
 # running somehow if ENABLE_ISHMEM will be default CI config
 if(NOT ENABLE_ISHMEM)
   add_mp_ctest(NAME mp-quick-test NPROC 1)
   add_mp_ctest(NAME mp-quick-test NPROC 2)
+  add_mp_ctest(NAME mp-quick-test-3-only NPROC 3)
 
   cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL)
 
@@ -88,8 +104,7 @@ if(NOT ENABLE_ISHMEM)
   foreach(nproc RANGE 2 4)
     add_mp_ctest(NAME mp-tests NPROC ${nproc} TIMEOUT 150)
   endforeach()
-  add_mp_ctest(
-    TEST_NAME mp-tests-3-only NAME mp-tests-3 NPROC 3 TIMEOUT 150)
+  add_mp_ctest(TEST_NAME mp-tests-3-only NAME mp-tests-3 NPROC 3 TIMEOUT 150)
 endif()
 
 if(ENABLE_SYCL)
@@ -103,7 +118,7 @@ if(ENABLE_SYCL)
     # equality of these values: *(--counted_result.end()) Which is: 5, should be
     # 77 Mdspan, Mdarray hangs sometimes on ISHMEM.
     set(sycl-exclusions
-        ${sycl-exclusions}Halo3/*:Sort*:Counted/*:Mdspan*:Mdarray*:)
+            ${sycl-exclusions}Halo3/*:Sort*:Counted/*:Mdspan*:Mdarray*:)
   endif()
 
   foreach(nproc RANGE 1 4)
@@ -111,6 +126,8 @@ if(ENABLE_SYCL)
     add_mp_pvc_ctest(
       NAME mp-quick-test NPROC ${nproc} OFFLOAD SYCL TARGS --device-memory)
   endforeach()
+  add_mp_ctest(NAME mp-quick-test-3-only NPROC 3 SYCL)
+  add_mp_ctest(NAME mp-quick-test-3-only NPROC 3 OFFLOAD SYCL TARGS --device-memory)
 
   add_mp_pvc_ctest(
     NAME mp-tests NPROC 2 TIMEOUT 150 OFFLOAD SYCL TARGS --device-memory
diff --git a/test/gtest/mp/copy.cpp b/test/gtest/mp/copy.cpp
index 66e54def9c..a54d194890 100644
--- a/test/gtest/mp/copy.cpp
+++ b/test/gtest/mp/copy.cpp
@@ -6,9 +6,7 @@
 
 // Fixture
 
-template <typename T> class CopyMP : public testing::Test {
-public:
-};
+template <typename T> class CopyMP : public testing::Test { public: };
 
 TYPED_TEST_SUITE(CopyMP, AllTypes);
 
diff --git a/test/gtest/mp/game_of_life.cpp b/test/gtest/mp/game_of_life.cpp
new file mode 100644
index 0000000000..446338c0b9
--- /dev/null
+++ b/test/gtest/mp/game_of_life.cpp
@@ -0,0 +1,282 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "cxxopts.hpp"
+#include "dr/mp.hpp"
+#include "mpi.h"
+
+inline void barrier() { dr::mp::barrier(); }
+inline void fence() { dr::mp::fence(); }
+inline void fence_on(auto &&obj) { obj.fence(); }
+
+#include <chrono>
+#include <memory>
+#include <iomanip>
+
+struct MPI_data {
+  MPI_Comm comm;
+  int rank;
+  int size;
+
+  bool host() {
+    return rank == 0;
+  }
+};
+
+static MPI_data mpi_data;
+
+struct Options {
+  std::size_t width;
+  std::size_t height;
+  std::size_t steps;
+  std::size_t redundancy;
+  bool debug;
+
+  std::unique_ptr<std::ofstream> logfile;
+
+  bool sycl;
+  bool device_memory;
+};
+
+namespace GameOfLife {
+
+using T = int;
+using Array = dr::mp::distributed_mdarray<T, 2>;
+
+void init(std::size_t n, Array& out) {
+  std::vector<std::vector<int>> in(4, std::vector<int>(4, 0));
+  /*
+    1 0 0
+    0 1 1
+    1 1 0
+   */
+  // clang-format off
+  in[1][1] = 1; in[1][2] = 0; in[1][3] = 0;
+  in[2][1] = 0; in[2][2] = 1; in[2][3] = 1;
+  in[3][1] = 1; in[3][2] = 1; in[3][3] = 0;
+  // clang-format on
+  std::vector<int> local(n * 4);
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      local[i * n + j] = in[i][j];
+    }
+  }
+  dr::mp::copy(local.begin(), local.end(), out.begin());
+}
+
+void run(std::size_t n, std::size_t m, std::size_t redundancy, std::size_t steps, bool debug) {
+  if (mpi_data.host()) {
+    std::cout << "Using backend: dr" << std::endl;
+    std::cout << "Grid size: " << n << " x " << m << std::endl;
+    std::cout << "Time steps:" << steps << std::endl;
+    std::cout << "Redundancy " << redundancy << std::endl;
+    std::cout << std::endl;
+  }
+
+  // construct grid
+  auto dist = dr::mp::distribution().halo(1).redundancy(redundancy);
+  Array array({n, m}, dist);
+  Array array_out({n, m}, dist);
+  dr::mp::fill(array, 0);
+
+  init(m, array);
+
+  // execute one calculation for one cell in game of life
+  auto calculate = [](auto stencils) {
+      auto [x, x_out] = stencils;
+      // because below we calculate the sum of all 9 cells,
+      // but we want the output only of 8 neighbourhs, subtract the value of self.
+      int live_neighbours = -x(0, 0);
+      for (int i = -1; i <= 1; i++) {
+        for (int j = -1; j <= 1; j++) {
+          live_neighbours += x(i, j); // alive == 1, dead == 0, so simple addition works
+        }
+      }
+
+      if (x(0, 0) == 1) { // self if alive
+        if (live_neighbours == 2 || live_neighbours == 3) {
+          x_out(0, 0) = 1;
+        } else {
+          x_out(0, 0) = 0;
+        }
+      }
+      else { // self is dead
+        if (live_neighbours == 3) {
+          x_out(0, 0) = 1;
+        } else {
+          x_out(0, 0) = 0;
+        }
+      }
+    };
+
+  // assign values of second array to first array
+  auto assign = [](auto stencils) {
+      auto [x, x_out] = stencils;
+      x(0, 0) = x_out(0, 0);
+    };
+
+  auto print = [n, m](const auto &v) {
+      std::vector<int> local(n * n);
+      dr::mp::copy(0, v, local.begin());
+      if (mpi_data.host()) {
+        for (int i = 0; i < n; i++) {
+          for (int j = 0; j < m; j++) {
+            fmt::print("{}", local[i * m + j] == 1 ? '#' : '.');
+          }
+          fmt::print("\n");
+        }
+      }
+    };
+
+  std::chrono::duration<double> exchange_duration;
+  std::size_t exchange_count = 0;
+
+  auto tic = std::chrono::steady_clock::now();
+  for (std::size_t i = 0, next_treshold = 0; i < steps; i++) {
+    if (i >= next_treshold && mpi_data.host()) {
+      next_treshold += round(static_cast<double>(steps / 100));
+      double percent = round(static_cast<double>(i) * 100 / static_cast<double>(steps));
+      fmt::print("Steps done {}% ({} of {} steps)\n", percent, i, steps);
+    }
+
+    // step
+    stencil_for_each_extended<2>(calculate, {1, 1}, {1, 1}, array, array_out);
+    stencil_for_each_extended<2>(assign, {0, 0}, {0, 0}, array, array_out);
+
+    // phase with communication - once after (redundancy - 1) steps without communication
+    if ((i + 1) % redundancy == 0) {
+      if (debug && mpi_data.host()) {
+        fmt::print("Exchange at step {}\n", i);
+      }
+      auto exchange_tic = std::chrono::steady_clock::now();
+      array.halo().exchange();
+      auto exchange_toc = std::chrono::steady_clock::now();
+      exchange_duration += exchange_toc - exchange_tic;
+      exchange_count++;
+
+      // Array_out is a temporary, no need to exchange it
+    }
+
+    // debug print
+    if (debug) {
+      if (mpi_data.host()) {
+        fmt::print("Array {}:\n", i);
+      }
+      // print needs a synchronication accros MPI boundary (dr::mp::copy), each node has to execute it
+      print(array);
+    }
+  }
+  auto toc = std::chrono::steady_clock::now();
+
+  std::chrono::duration<double> duration = toc - tic;
+
+  if (mpi_data.host()) {
+    double t_cpu = duration.count();
+    double t_exch = exchange_duration.count();
+    double t_step = t_cpu / static_cast<double>(steps);
+    double t_exch_step = t_exch / static_cast<double>(exchange_count);
+
+    fmt::print("Steps done 100% ({} of {} steps)\n", steps, steps);
+    fmt::print("Duration {} s, including exchange total time {} s\n", t_cpu, t_exch);
+    fmt::print("Time per step {} ms\n", t_step * 1000);
+    fmt::print("Time per exchange {} ms\n", t_exch_step * 1000);
+  }
+}
+
+} // namespace GameOfLife
+
+// Initialization functions
+
+void init_MPI(int argc, char *argv[]) {
+  MPI_Init(&argc, &argv);
+  mpi_data.comm = MPI_COMM_WORLD;
+  MPI_Comm_rank(mpi_data.comm, &mpi_data.rank);
+  MPI_Comm_size(mpi_data.comm, &mpi_data.size);
+
+  dr::drlog.debug("MPI: rank = {}, size = {}\n", mpi_data.rank, mpi_data.size);
+}
+
+Options parse_options(int argc, char *argv[]) {
+  Options out;
+
+  cxxopts::Options options_spec(argv[0], "game of life");
+
+  // clang-format off
+  options_spec.add_options()
+    ("drhelp", "Print help")
+    ("log", "Enable logging")
+    ("logprefix", "appended .RANK.log", cxxopts::value<std::string>()->default_value("dr"))
+    ("log-filter", "Filter the log", cxxopts::value<std::vector<std::string>>())
+    ("device-memory", "Use device memory")
+    ("sycl", "Execute on SYCL device")
+    ("d,debug", "enable debug logging")
+    ("n,size", "Grid width", cxxopts::value<std::size_t>()->default_value("128"))
+    ("m,height", "Grid height", cxxopts::value<std::size_t>()->default_value("128"))
+    ("t,steps", "Run a fixed number of time steps.", cxxopts::value<std::size_t>()->default_value("100"))
+    ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value<std::size_t>()->default_value("2"));
+  // clang-format on
+
+  cxxopts::ParseResult options;
+  try {
+    options = options_spec.parse(argc, argv);
+  } catch (const cxxopts::OptionParseException &e) {
+    std::cout << options_spec.help() << "\n";
+    exit(1);
+  }
+
+  out.sycl = options.count("sycl") != 0;
+  out.device_memory = options.count("device-memory") != 0;
+
+  if (options.count("drhelp")) {
+    std::cout << options_spec.help() << "\n";
+    exit(0);
+  }
+
+  if (options.count("log")) {
+    out.logfile.reset(new std::ofstream(options["logprefix"].as<std::string>() +
+                                    fmt::format(".{}.log", mpi_data.rank)));
+    dr::drlog.set_file(*out.logfile);
+    if (options.count("log-filter")) {
+      dr::drlog.filter(options["log-filter"].as<std::vector<std::string>>());
+    }
+  }
+
+  out.width = options["n"].as<std::size_t>();
+  out.height = options.count("m") != 0 ? options["m"].as<std::size_t>() : out.width;
+  out.redundancy = options["r"].as<std::size_t>();
+  out.steps = options["t"].as<std::size_t>();
+
+  out.debug = options.count("debug") != 0;
+
+  return out;
+}
+
+void dr_init(const Options& options) {
+#ifdef SYCL_LANGUAGE_VERSION
+  if (options.sycl) {
+    sycl::queue q;
+    fmt::print("Running on sycl device: {}, memory: {}\n", q.get_device().get_info<sycl::info::device::name>(), options.device_memory ? "device" : "shared");
+    dr::mp::init(q, options.device_memory ? sycl::usm::alloc::device
+                                          : sycl::usm::alloc::shared);
+    return;
+  }
+#endif
+  fmt::print("Running on CPU\n");
+  dr::mp::init();
+}
+
+// Main loop
+
+int main(int argc, char *argv[]) {
+  init_MPI(argc, argv);
+  Options options = parse_options(argc, argv);
+  dr_init(options);
+
+  GameOfLife::run(options.width, options.height, options.redundancy, options.steps, options.debug);
+
+  dr::mp::finalize();
+  MPI_Finalize();
+
+  return 0;
+}
diff --git a/test/gtest/mp/halo-2d.cpp b/test/gtest/mp/halo-2d.cpp
new file mode 100644
index 0000000000..5168bd6511
--- /dev/null
+++ b/test/gtest/mp/halo-2d.cpp
@@ -0,0 +1,13 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "xp-tests.hpp"
+
+TEST(Halo2D, exchange_2d_test) {
+  dr::mp::distributed_mdarray<int, 2> dv({10, 10}, dr::mp::distribution().halo(1));
+
+  DRLOG("exchange start");
+  dv.halo().exchange();
+  DRLOG("exchange end");
+}
diff --git a/test/gtest/mp/halo-3.cpp b/test/gtest/mp/halo-3.cpp
index dfc372ebd5..aa4127f397 100644
--- a/test/gtest/mp/halo-3.cpp
+++ b/test/gtest/mp/halo-3.cpp
@@ -18,19 +18,27 @@ TYPED_TEST(Halo3, halo_is_visible_after_exchange_not_earlier) {
   dv.halo().exchange();
 
   fill(dv, 13);
+  std::cout << "switch to check values: \n";
   switch (dr::mp::default_comm().rank()) {
   case 0:
+    std::cout << "dv[0] = " << *(dv.begin() + 0).local() << "\n";
     EXPECT_EQ(*(dv.begin() + 0).local(), 13);
+    std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n";
     EXPECT_EQ(*(dv.begin() + 1).local(), 7);
     break;
   case 1:
+    std::cout << "dv[0] = " << *(dv.begin() + 0).local() << "\n";
     EXPECT_EQ(*(dv.begin() + 0).local(), 7);
+    std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n";
     EXPECT_EQ(*(dv.begin() + 1).local(), 13);
+    std::cout << "dv[2] = " << *(dv.begin() + 0).local() << "\n";
     EXPECT_EQ(*(dv.begin() + 2).local(), 7);
     break;
   case 2:
     EXPECT_EQ(*(dv.begin() + 1).local(), 7);
+    std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n";
     EXPECT_EQ(*(dv.begin() + 2).local(), 13);
+    std::cout << "dv[2] = " << *(dv.begin() + 0).local() << "\n";
     break;
   }
 
@@ -222,3 +230,72 @@ TYPED_TEST(Halo3, dv_halos_prev_0) {
     EXPECT_EQ(*(dv.begin() + 5).local(), 6);
   }
 }
+
+TYPED_TEST(Halo3, halo_wide) {
+  TypeParam dv(9, dr::mp::distribution().halo(3));
+  fill(dv, 7);
+  dv.halo().exchange();
+
+  fill(dv, 13);
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(*(dv.begin() + 0).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 1).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 2).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 3).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 4).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 5).local(), 7);
+    break;
+  case 1:
+    EXPECT_EQ(*(dv.begin() + 0).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 1).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 2).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 3).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 4).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 5).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 6).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 7).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 8).local(), 7);
+    break;
+  case 2:
+    EXPECT_EQ(*(dv.begin() + 3).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 4).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 5).local(), 7);
+    EXPECT_EQ(*(dv.begin() + 6).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 7).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 8).local(), 13);
+    break;
+  }
+
+  dv.halo().exchange();
+
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(*(dv.begin() + 0).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 1).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 2).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 3).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 4).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 5).local(), 13);
+    break;
+  case 1:
+    EXPECT_EQ(*(dv.begin() + 0).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 1).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 2).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 3).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 4).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 5).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 6).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 7).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 8).local(), 13);
+    break;
+  case 2:
+    EXPECT_EQ(*(dv.begin() + 3).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 4).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 5).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 6).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 7).local(), 13);
+    EXPECT_EQ(*(dv.begin() + 8).local(), 13);
+    break;
+  }
+}
\ No newline at end of file
diff --git a/test/gtest/mp/mdstar.cpp b/test/gtest/mp/mdstar.cpp
index 0e964a0d18..7670a0717e 100644
--- a/test/gtest/mp/mdstar.cpp
+++ b/test/gtest/mp/mdstar.cpp
@@ -35,7 +35,7 @@ class Mdspan : public ::testing::Test {
 
 TEST_F(Mdspan, StaticAssert) {
   xp::distributed_vector<T> dist(n2d, dist2d_1d);
-  auto mdspan = xp::views::mdspan(dist, extents2d);
+  auto mdspan = xp::views::mdspan(dist, extents2d, dist2d_1d);
   static_assert(rng::forward_range<decltype(mdspan)>);
   static_assert(dr::distributed_range<decltype(mdspan)>);
   auto segments = dr::ranges::segments(mdspan);
@@ -47,7 +47,7 @@ TEST_F(Mdspan, StaticAssert) {
 
 TEST_F(Mdspan, Iterator) {
   xp::distributed_vector<T> dist(n2d, dist2d_1d);
-  auto mdspan = xp::views::mdspan(dist, extents2d);
+  auto mdspan = xp::views::mdspan(dist, extents2d, dist2d_1d);
 
   *mdspan.begin() = 17;
   xp::fence();
@@ -57,7 +57,7 @@ TEST_F(Mdspan, Iterator) {
 
 TEST_F(Mdspan, Mdindex2D) {
   xp::distributed_vector<T> dist(n2d, dist2d_1d);
-  auto dmdspan = xp::views::mdspan(dist, extents2d);
+  auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d);
 
   std::size_t i = 1, j = 2;
   dmdspan.mdspan()(i, j) = 17;
@@ -68,7 +68,7 @@ TEST_F(Mdspan, Mdindex2D) {
 
 TEST_F(Mdspan, Mdindex3D) {
   xp::distributed_vector<T> dist(n3d, dist3d_1d);
-  auto dmdspan = xp::views::mdspan(dist, extents3d);
+  auto dmdspan = xp::views::mdspan(dist, extents3d, dist3d_1d);
 
   std::size_t i = 1, j = 2, k = 0;
   dmdspan.mdspan()(i, j, k) = 17;
@@ -79,7 +79,7 @@ TEST_F(Mdspan, Mdindex3D) {
 
 TEST_F(Mdspan, Pipe) {
   xp::distributed_vector<T> dist(n2d, dist2d_1d);
-  auto mdspan = dist | xp::views::mdspan(extents2d);
+  auto mdspan = dist | xp::views::mdspan(extents2d, dist2d_1d);
 
   *mdspan.begin() = 17;
   xp::fence();
@@ -89,7 +89,7 @@ TEST_F(Mdspan, Pipe) {
 
 TEST_F(Mdspan, SegmentExtents) {
   xp::distributed_vector<T> dist(n2d, dist2d_1d);
-  auto dmdspan = xp::views::mdspan(dist, extents2d);
+  auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d);
 
   // Sum of leading dimension matches original
   std::size_t x = 0;
@@ -106,7 +106,7 @@ TEST_F(Mdspan, Subrange) {
   xp::distributed_vector<T> dist(n2d, dist2d_1d);
   auto inner = rng::subrange(dist.begin() + ydim, dist.end() - ydim);
   std::array<std::size_t, 2> inner_extents({extents2d[0] - 2, extents2d[1]});
-  auto dmdspan = xp::views::mdspan(inner, inner_extents);
+  auto dmdspan = xp::views::mdspan(inner, inner_extents, dist2d_1d);
 
   // Summing up leading dimension size of segments should equal
   // original minus 2 rows
@@ -123,7 +123,7 @@ TEST_F(Mdspan, Subrange) {
 TEST_F(Mdspan, GridExtents) {
   xp::distributed_vector<T> dist(n2d, dist2d_1d);
   xp::iota(dist, 100);
-  auto dmdspan = xp::views::mdspan(dist, extents2d);
+  auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d);
   auto grid = dmdspan.grid();
 
   auto x = 0;
@@ -147,7 +147,7 @@ TEST_F(Mdspan, GridLocalReference) {
 
   xp::distributed_vector<T> dist(n2d, dist2d_1d);
   xp::iota(dist, 100);
-  auto dmdspan = xp::views::mdspan(dist, extents2d);
+  auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d);
   auto grid = dmdspan.grid();
 
   auto tile = grid(0, 0).mdspan();
diff --git a/test/gtest/mp/reduce.cpp b/test/gtest/mp/reduce.cpp
index e663188bbd..4fb683138b 100644
--- a/test/gtest/mp/reduce.cpp
+++ b/test/gtest/mp/reduce.cpp
@@ -6,9 +6,7 @@
 
 // Fixture
 
-template <typename T> class ReduceMP : public testing::Test {
-public:
-};
+template <typename T> class ReduceMP : public testing::Test { public: };
 
 TYPED_TEST_SUITE(ReduceMP, AllTypes);
 
diff --git a/test/gtest/mp/segments.cpp b/test/gtest/mp/segments.cpp
index 029a5faaa3..8817bed379 100644
--- a/test/gtest/mp/segments.cpp
+++ b/test/gtest/mp/segments.cpp
@@ -6,9 +6,7 @@
 
 #include <dr/mp/views/segmented.hpp>
 
-template <typename T> class Segmented : public testing::Test {
-public:
-};
+template <typename T> class Segmented : public testing::Test { public: };
 
 TYPED_TEST_SUITE(Segmented, AllTypesWithoutIshmem);
 
@@ -27,9 +25,7 @@ TYPED_TEST(Segmented, Basic) {
   EXPECT_EQ(dr::ranges::segments(ops.dist_vec), segmented);
 }
 
-template <typename T> class SegmentUtils : public testing::Test {
-public:
-};
+template <typename T> class SegmentUtils : public testing::Test { public: };
 
 // traversing on host over local_segment does not work in case of both:
 // device_memory and IshmemBackend (which uses device memory)
diff --git a/test/gtest/mp/wide-halo-1d-3.cpp b/test/gtest/mp/wide-halo-1d-3.cpp
new file mode 100644
index 0000000000..686e09dcf1
--- /dev/null
+++ b/test/gtest/mp/wide-halo-1d-3.cpp
@@ -0,0 +1,193 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "xp-tests.hpp"
+
+template <typename T> class WideHalo3_1D : public testing::Test {};
+
+using T = int;
+using Array = dr::mp::distributed_vector<T>;
+
+static const std::size_t redundancy = 2;
+static const std::size_t size = 6;
+
+static dr::mp::distribution get_distribution() {
+  return dr::mp::distribution().halo(1).redundancy(redundancy);
+}
+
+static int &get(Array &v, std::size_t i) { return *(v.begin() + i).local(); }
+
+TEST(WideHalo3_1D, suite_works_for_3_processes_only) {
+  EXPECT_EQ(dr::mp::default_comm().size(), 3);
+}
+
+TEST(WideHalo3_1D, halo_is_visible_after_exchange_not_earlier) {
+  dr::mp::distribution dist = get_distribution();
+  Array dv(size, dist);
+  Array dv_out(size, dist);
+
+  fill(dv, 1);
+  fill(dv_out, 1);
+  dv.halo().exchange();
+  dv_out.halo().exchange();
+
+  auto print = [&](const auto &v) {
+    for (auto seg : v.segments()) {
+      for (auto i = seg.begin_stencil({0ul})[0]; i < seg.end_stencil({0ul})[0];
+           i++) {
+        std::cout << *(seg.begin() + i).local() << " ";
+      }
+    }
+    std::cout << "\n";
+  };
+
+  auto transform = [&] {
+    stencil_for_each_extended<1>(
+        [](auto stencils) {
+          auto [x, x_out] = stencils;
+          x_out(0) = x(-1) + x(0) + x(1);
+        },
+        {1}, {1}, dv, dv_out);
+    stencil_for_each_extended<1>(
+        [](auto stencils) {
+          auto [x, x_out] = stencils;
+          x(0) = x_out(0);
+        },
+        {0}, {0}, dv, dv_out);
+  };
+
+  transform();
+  print(dv);
+
+  // after first step, only actually stored values and their neighbours are
+  // guaranteed to be correct
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(get(dv, 0), 1);
+    EXPECT_EQ(get(dv, 1), 3);
+    EXPECT_EQ(get(dv, 2), 3);
+    EXPECT_EQ(get(dv, 3), 1);
+    break;
+  case 1:
+    EXPECT_EQ(get(dv, 0), 1);
+    EXPECT_EQ(get(dv, 1), 3);
+    EXPECT_EQ(get(dv, 2), 3);
+    EXPECT_EQ(get(dv, 3), 3);
+    EXPECT_EQ(get(dv, 4), 3);
+    EXPECT_EQ(get(dv, 5), 1);
+    break;
+  case 2:
+    EXPECT_EQ(get(dv, 2), 1);
+    EXPECT_EQ(get(dv, 3), 3);
+    EXPECT_EQ(get(dv, 4), 3);
+    EXPECT_EQ(get(dv, 5), 1);
+    break;
+  }
+
+  // after second step, only actually stored values are guaranteed to be correct
+
+  transform();
+  print(dv);
+
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(get(dv, 0), 1);
+    EXPECT_EQ(get(dv, 1), 7);
+    EXPECT_EQ(get(dv, 2), 7);
+    EXPECT_EQ(get(dv, 3), 1);
+    break;
+  case 1:
+    EXPECT_EQ(get(dv, 0), 1);
+    EXPECT_EQ(get(dv, 1), 7);
+    EXPECT_EQ(get(dv, 2), 9);
+    EXPECT_EQ(get(dv, 3), 9);
+    EXPECT_EQ(get(dv, 4), 7);
+    EXPECT_EQ(get(dv, 5), 1);
+    break;
+  case 2:
+    EXPECT_EQ(get(dv, 2), 1);
+    EXPECT_EQ(get(dv, 3), 7);
+    EXPECT_EQ(get(dv, 4), 7);
+    EXPECT_EQ(get(dv, 5), 1);
+    break;
+  }
+
+  // after exchange all are correct
+  dv.halo().exchange();
+  print(dv);
+
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(get(dv, 0), 1);
+    EXPECT_EQ(get(dv, 1), 7);
+    EXPECT_EQ(get(dv, 2), 9);
+    EXPECT_EQ(get(dv, 3), 9);
+    break;
+  case 1:
+    EXPECT_EQ(get(dv, 0), 1);
+    EXPECT_EQ(get(dv, 1), 7);
+    EXPECT_EQ(get(dv, 2), 9);
+    EXPECT_EQ(get(dv, 3), 9);
+    EXPECT_EQ(get(dv, 4), 7);
+    EXPECT_EQ(get(dv, 5), 1);
+    break;
+  case 2:
+    EXPECT_EQ(get(dv, 2), 9);
+    EXPECT_EQ(get(dv, 3), 9);
+    EXPECT_EQ(get(dv, 4), 7);
+    EXPECT_EQ(get(dv, 5), 1);
+    break;
+  }
+}
+
+TEST(WideHalo3_1D, halo_api_works) {
+  dr::mp::distribution dist = get_distribution();
+  Array dv(size, dist);
+  Array dv_out(size, dist);
+
+  fill(dv, 1);
+  fill(dv_out, 1);
+  dv.halo().exchange();
+  dv_out.halo().exchange();
+
+  halo_exchange(
+      [](Array &dv, Array &dv_out) {
+        stencil_for_each_extended<1>(
+            [](auto stencils) {
+              auto [x, x_out] = stencils;
+              x_out(0) = x(-1) + x(0) + x(1);
+            },
+            {1}, {1}, dv, dv_out);
+        stencil_for_each_extended<1>(
+            [](auto stencils) {
+              auto [x, x_out] = stencils;
+              x(0) = x_out(0);
+            },
+            {0}, {0}, dv, dv_out);
+      },
+      dv, dv_out);
+  // after exchange all are correct
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(get(dv, 0), 1);
+    EXPECT_EQ(get(dv, 1), 7);
+    EXPECT_EQ(get(dv, 2), 9);
+    EXPECT_EQ(get(dv, 3), 9);
+    break;
+  case 1:
+    EXPECT_EQ(get(dv, 0), 1);
+    EXPECT_EQ(get(dv, 1), 7);
+    EXPECT_EQ(get(dv, 2), 9);
+    EXPECT_EQ(get(dv, 3), 9);
+    EXPECT_EQ(get(dv, 4), 7);
+    EXPECT_EQ(get(dv, 5), 1);
+    break;
+  case 2:
+    EXPECT_EQ(get(dv, 2), 9);
+    EXPECT_EQ(get(dv, 3), 9);
+    EXPECT_EQ(get(dv, 4), 7);
+    EXPECT_EQ(get(dv, 5), 1);
+    break;
+  }
+}
diff --git a/test/gtest/mp/wide-halo-2d-3.cpp b/test/gtest/mp/wide-halo-2d-3.cpp
new file mode 100644
index 0000000000..08ce9fcc96
--- /dev/null
+++ b/test/gtest/mp/wide-halo-2d-3.cpp
@@ -0,0 +1,267 @@
+// SPDX-FileCopyrightText: Intel Corporation
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "xp-tests.hpp"
+
+template <typename T> class WideHalo3_2D : public testing::Test {};
+
+using T = int;
+using Array = dr::mp::distributed_mdarray<T, 2>;
+
+static const std::size_t redundancy = 2;
+static const std::array<std::size_t, 2> size = {6, 6};
+
+static dr::mp::distribution get_distribution() {
+  return dr::mp::distribution().halo(1).redundancy(redundancy);
+}
+
+static int &get(Array &v, std::size_t i, std::size_t j) {
+  return *(v.begin() + i * size[0] + j).local();
+}
+
+TEST(WideHalo3_2D, suite_works_for_3_processes_only) {
+  EXPECT_EQ(dr::mp::default_comm().size(), 3);
+}
+
+TEST(WideHalo3_2D, halo2d_is_visible_after_exchange_not_earlier) {
+  dr::mp::distribution dist = get_distribution();
+  Array dv(size, dist);
+  Array dv_out(size, dist);
+
+  fill(dv, 1);
+  fill(dv_out, 1);
+  dv.halo().exchange();
+  dv_out.halo().exchange();
+
+  auto transform = [&] {
+    stencil_for_each_extended<2>(
+        [](auto stencils) {
+          auto [x, x_out] = stencils;
+          x_out(0, 0) = 0;
+          for (int i = -1; i <= 1; i++) {
+            for (int j = -1; j <= 1; j++) {
+              x_out(0, 0) += x(i, j);
+            }
+          }
+        },
+        {1, 1}, {1, 1}, dv, dv_out);
+    stencil_for_each_extended<2>(
+        [](auto stencils) {
+          auto [x, x_out] = stencils;
+          x(0, 0) = x_out(0, 0);
+        },
+        {0, 0}, {0, 0}, dv, dv_out);
+  };
+  auto print = [](std::string s, const auto &v) {
+    std::cout << s << "\n";
+    for (auto seg : v.segments()) {
+      auto [beg, end] = seg.stencil({0, 0}, {0, 0});
+      for (std::size_t i = beg[0]; i < end[0]; i++) {
+        for (std::size_t j = beg[1]; j < end[1]; j++) {
+          std::cout << seg.mdspan_extended()(i, j) << "\t";
+        }
+        std::cout << "\n";
+      }
+    }
+    std::cout << "\n";
+  };
+
+  print("dv", dv);
+
+  transform();
+  print("dv", dv);
+  // after first step, only actually stored values and their neighbours are
+  // guaranteed to be correct
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(get(dv, 0, 1), 1);
+    EXPECT_EQ(get(dv, 1, 1), 9);
+    EXPECT_EQ(get(dv, 2, 1), 9);
+    EXPECT_EQ(get(dv, 3, 1), 1);
+    break;
+  case 1:
+    EXPECT_EQ(get(dv, 0, 1), 1);
+    EXPECT_EQ(get(dv, 1, 1), 9);
+    EXPECT_EQ(get(dv, 2, 1), 9);
+    EXPECT_EQ(get(dv, 3, 1), 9);
+    EXPECT_EQ(get(dv, 4, 1), 9);
+    EXPECT_EQ(get(dv, 5, 1), 1);
+    break;
+  case 2:
+    EXPECT_EQ(get(dv, 2, 1), 1);
+    EXPECT_EQ(get(dv, 3, 1), 9);
+    EXPECT_EQ(get(dv, 4, 1), 9);
+    EXPECT_EQ(get(dv, 5, 1), 1);
+    break;
+  }
+
+  transform();
+  print("dv", dv);
+  // after second step, only actually stored values are guaranteed to be correct
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(get(dv, 0, 1), 1);
+    EXPECT_EQ(get(dv, 1, 1), 41);
+    EXPECT_EQ(get(dv, 2, 1), 41);
+    EXPECT_EQ(get(dv, 3, 1), 1);
+    EXPECT_EQ(get(dv, 0, 2), 1);
+    EXPECT_EQ(get(dv, 1, 2), 57);
+    EXPECT_EQ(get(dv, 2, 2), 57);
+    EXPECT_EQ(get(dv, 3, 2), 1);
+    break;
+  case 1:
+    EXPECT_EQ(get(dv, 0, 1), 1);
+    EXPECT_EQ(get(dv, 1, 1), 41);
+    EXPECT_EQ(get(dv, 2, 1), 57);
+    EXPECT_EQ(get(dv, 3, 1), 57);
+    EXPECT_EQ(get(dv, 4, 1), 41);
+    EXPECT_EQ(get(dv, 5, 1), 1);
+    EXPECT_EQ(get(dv, 0, 2), 1);
+    EXPECT_EQ(get(dv, 1, 2), 57);
+    EXPECT_EQ(get(dv, 2, 2), 81);
+    EXPECT_EQ(get(dv, 3, 2), 81);
+    EXPECT_EQ(get(dv, 4, 2), 57);
+    EXPECT_EQ(get(dv, 5, 2), 1);
+    break;
+  case 2:
+    EXPECT_EQ(get(dv, 2, 1), 1);
+    EXPECT_EQ(get(dv, 3, 1), 41);
+    EXPECT_EQ(get(dv, 4, 1), 41);
+    EXPECT_EQ(get(dv, 5, 1), 1);
+    EXPECT_EQ(get(dv, 2, 2), 1);
+    EXPECT_EQ(get(dv, 3, 2), 57);
+    EXPECT_EQ(get(dv, 4, 2), 57);
+    EXPECT_EQ(get(dv, 5, 2), 1);
+    break;
+  }
+
+  dv.halo().exchange();
+  dv_out.halo().exchange();
+  print("dv", dv);
+  // after exchange all are correct
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(get(dv, 0, 1), 1);
+    EXPECT_EQ(get(dv, 1, 1), 41);
+    EXPECT_EQ(get(dv, 2, 1), 57);
+    EXPECT_EQ(get(dv, 3, 1), 57);
+    EXPECT_EQ(get(dv, 0, 2), 1);
+    EXPECT_EQ(get(dv, 1, 2), 57);
+    EXPECT_EQ(get(dv, 2, 2), 81);
+    EXPECT_EQ(get(dv, 3, 2), 81);
+    break;
+  case 1:
+    EXPECT_EQ(get(dv, 0, 1), 1);
+    EXPECT_EQ(get(dv, 1, 1), 41);
+    EXPECT_EQ(get(dv, 2, 1), 57);
+    EXPECT_EQ(get(dv, 3, 1), 57);
+    EXPECT_EQ(get(dv, 4, 1), 41);
+    EXPECT_EQ(get(dv, 5, 1), 1);
+    EXPECT_EQ(get(dv, 0, 2), 1);
+    EXPECT_EQ(get(dv, 1, 2), 57);
+    EXPECT_EQ(get(dv, 2, 2), 81);
+    EXPECT_EQ(get(dv, 3, 2), 81);
+    EXPECT_EQ(get(dv, 4, 2), 57);
+    EXPECT_EQ(get(dv, 5, 2), 1);
+    break;
+  case 2:
+    EXPECT_EQ(get(dv, 2, 1), 57);
+    EXPECT_EQ(get(dv, 3, 1), 57);
+    EXPECT_EQ(get(dv, 4, 1), 41);
+    EXPECT_EQ(get(dv, 5, 1), 1);
+    EXPECT_EQ(get(dv, 2, 2), 81);
+    EXPECT_EQ(get(dv, 3, 2), 81);
+    EXPECT_EQ(get(dv, 4, 2), 57);
+    EXPECT_EQ(get(dv, 5, 2), 1);
+    break;
+  }
+}
+
+TEST(WideHalo3_2D, halo2d_api_works) {
+  dr::mp::distribution dist = get_distribution();
+  Array dv(size, dist);
+  Array dv_out(size, dist);
+
+  fill(dv, 1);
+  fill(dv_out, 1);
+  dv.halo().exchange();
+  dv_out.halo().exchange();
+
+  auto print = [](std::string s, const auto &v) {
+    std::cout << s << "\n";
+    for (auto seg : v.segments()) {
+      auto [beg, end] = seg.stencil({0, 0}, {0, 0});
+      for (std::size_t i = beg[0]; i < end[0]; i++) {
+        for (std::size_t j = beg[1]; j < end[1]; j++) {
+          std::cout << seg.mdspan_extended()(i, j) << "\t";
+        }
+        std::cout << "\n";
+      }
+    }
+    std::cout << "\n";
+  };
+
+  print("dv", dv);
+
+  halo_exchange(
+      [](Array &dv, Array &dv_out) {
+        stencil_for_each_extended<2>(
+            [](auto stencils) {
+              auto [x, x_out] = stencils;
+              x_out(0, 0) = 0;
+              for (int i = -1; i <= 1; i++) {
+                for (int j = -1; j <= 1; j++) {
+                  x_out(0, 0) += x(i, j);
+                }
+              }
+            },
+            {1, 1}, {1, 1}, dv, dv_out);
+        stencil_for_each_extended<2>(
+            [](auto stencils) {
+              auto [x, x_out] = stencils;
+              x(0, 0) = x_out(0, 0);
+            },
+            {0, 0}, {0, 0}, dv, dv_out);
+      },
+      dv, dv_out);
+
+  print("dv", dv);
+  // after exchange all are correct
+  switch (dr::mp::default_comm().rank()) {
+  case 0:
+    EXPECT_EQ(get(dv, 0, 1), 1);
+    EXPECT_EQ(get(dv, 1, 1), 41);
+    EXPECT_EQ(get(dv, 2, 1), 57);
+    EXPECT_EQ(get(dv, 3, 1), 57);
+    EXPECT_EQ(get(dv, 0, 2), 1);
+    EXPECT_EQ(get(dv, 1, 2), 57);
+    EXPECT_EQ(get(dv, 2, 2), 81);
+    EXPECT_EQ(get(dv, 3, 2), 81);
+    break;
+  case 1:
+    EXPECT_EQ(get(dv, 0, 1), 1);
+    EXPECT_EQ(get(dv, 1, 1), 41);
+    EXPECT_EQ(get(dv, 2, 1), 57);
+    EXPECT_EQ(get(dv, 3, 1), 57);
+    EXPECT_EQ(get(dv, 4, 1), 41);
+    EXPECT_EQ(get(dv, 5, 1), 1);
+    EXPECT_EQ(get(dv, 0, 2), 1);
+    EXPECT_EQ(get(dv, 1, 2), 57);
+    EXPECT_EQ(get(dv, 2, 2), 81);
+    EXPECT_EQ(get(dv, 3, 2), 81);
+    EXPECT_EQ(get(dv, 4, 2), 57);
+    EXPECT_EQ(get(dv, 5, 2), 1);
+    break;
+  case 2:
+    EXPECT_EQ(get(dv, 2, 1), 57);
+    EXPECT_EQ(get(dv, 3, 1), 57);
+    EXPECT_EQ(get(dv, 4, 1), 41);
+    EXPECT_EQ(get(dv, 5, 1), 1);
+    EXPECT_EQ(get(dv, 2, 2), 81);
+    EXPECT_EQ(get(dv, 3, 2), 81);
+    EXPECT_EQ(get(dv, 4, 2), 57);
+    EXPECT_EQ(get(dv, 5, 2), 1);
+    break;
+  }
+}
diff --git a/test/gtest/mp/xp-tests.hpp b/test/gtest/mp/xp-tests.hpp
index d4c14fc0e5..4705b53baf 100644
--- a/test/gtest/mp/xp-tests.hpp
+++ b/test/gtest/mp/xp-tests.hpp
@@ -20,15 +20,15 @@ namespace xp = dr::mp;
 
 template <typename V>
 concept compliant_view = rng::forward_range<V> && rng::random_access_range<V> &&
-                         rng::viewable_range<V> && requires(V &v) {
-                           // test one at a time so error is apparent
-                           dr::ranges::segments(v);
-                           dr::ranges::segments(v).begin();
-                           *dr::ranges::segments(v).begin();
-                           dr::ranges::rank(*dr::ranges::segments(v).begin());
-                           //  dr::ranges::local(rng::begin(dr::ranges::segments(v)[0]));
-                           //  dr::mp::local_segments(v);
-                         };
+    rng::viewable_range<V> && requires(V &v) {
+  // test one at a time so error is apparent
+  dr::ranges::segments(v);
+  dr::ranges::segments(v).begin();
+  *dr::ranges::segments(v).begin();
+  dr::ranges::rank(*dr::ranges::segments(v).begin());
+  //  dr::ranges::local(rng::begin(dr::ranges::segments(v)[0]));
+  //  dr::mp::local_segments(v);
+};
 
 inline void barrier() { dr::mp::barrier(); }
 inline void fence() { dr::mp::fence(); }