diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index c3ae2f682c..0ad4fb096c 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -80,13 +80,31 @@ add_executable(wave_equation wave_equation.cpp) target_link_libraries(wave_equation cxxopts DR::mpi) target_compile_definitions(wave_equation PRIVATE STANDALONE_BENCHMARK) add_mp_ctest(NAME wave_equation) +add_executable(wave_equation_wide wave_equation_wide.cpp) +target_link_libraries(wave_equation_wide cxxopts DR::mpi) +target_compile_definitions(wave_equation_wide PRIVATE STANDALONE_BENCHMARK) +add_mp_ctest(NAME wave_equation_wide) # add_mp_ctest(TEST_NAME wave_equation_fused NAME wave_equation TARGS -f) # # DRA-92 if(ENABLE_SYCL) add_mp_ctest( - TEST_NAME wave_equation-sycl NAME wave_equation NPROC 2 SYCL) + TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL) add_mp_ctest( - TEST_NAME wave_equation_fused-sycl NAME wave_equation NPROC 2 SYCL TARGS -f) + TEST_NAME wave_equation-sycl-benchmark NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL TARGS -t) + add_mp_ctest( + TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 1000 NPROC 2 SYCL TARGS -f) + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL) + foreach(redundancy RANGE 1 8) + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS -t 100 -r ${redundancy}) + endforeach() + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl-gpu NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory) + foreach(redundancy RANGE 1 8) + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl-gpu-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory -t 100 -r ${redundancy}) + endforeach() endif() add_executable(shallow_water shallow_water.cpp) diff --git a/benchmarks/gbench/mp/wave_equation_wide.cpp b/benchmarks/gbench/mp/wave_equation_wide.cpp new file mode 100644 index 0000000000..51830d9deb --- /dev/null +++ b/benchmarks/gbench/mp/wave_equation_wide.cpp @@ -0,0 +1,454 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cxxopts.hpp" +#include "dr/mp.hpp" +#include "mpi.h" +#include "wave_utils.hpp" +#include +#include +#include + +#ifdef STANDALONE_BENCHMARK + +MPI_Comm comm; +int comm_rank; +int comm_size; + +#else + +#include "../common/dr_bench.hpp" + +#endif + +namespace WaveEquation { + +using T = double; +using Array = dr::mp::distributed_mdarray; + +// gravitational acceleration +constexpr double g = 9.81; +// water depth +constexpr double h = 1.0; + +// Get number of read/write bytes and flops for a single time step +// These numbers correspond to the fused kernel version +void calculate_complexity(std::size_t nx, std::size_t ny, std::size_t &nread, + std::size_t &nwrite, std::size_t &nflop) { + // stage1: 2+2+3 = 7 + // stage2: 3+3+4 = 10 + // stage3: 3+3+4 = 10 + nread = (27 * nx * ny) * sizeof(T); + // stage1: 3 + // stage2: 3 + // stage3: 3 + nwrite = (9 * nx * ny) * sizeof(T); + // stage1: 3+3+6 = 12 + // stage2: 6+6+9 = 21 + // stage3: 6+6+9 = 21 + nflop = 54 * nx * ny; +} + +double exact_elev(double x, double y, double t, double lx, double ly) { + /** + * Exact solution for elevation field. + * + * Returns time-dependent elevation of a 2D standing wave in a + * rectangular domain. + */ + double amp = 0.5; + double c = std::sqrt(g * h); + double sol_x = std::cos(2.0 * M_PI * x / lx); + double sol_y = std::cos(2.0 * M_PI * y / ly); + double omega = c * M_PI * std::hypot(1.0 / lx, 1.0 / ly); + double sol_t = std::cos(2.0 * omega * t); + return amp * sol_x * sol_y * sol_t; +} + +double initial_elev(double x, double y, double lx, double ly) { + return exact_elev(x, y, 0.0, lx, ly); +} + +void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, + double dx_inv, double dy_inv, double dt) { + /** + * Evaluate right hand side of the equations + */ + auto rhs_dedx = [dt, dx_inv](auto v) { + auto [in, out] = v; + out(0, 0) = -dt * g * (in(1, 0) - in(0, 0)) * dx_inv; + }; + stencil_for_each_extended<2>(rhs_dedx, {0, 0}, {1, 0}, e, dudt); + + auto rhs_dedy = [dt, dy_inv](auto v) { + auto [in, out] = v; + out(0, 0) = -dt * g * (in(0, 0) - in(0, -1)) * dy_inv; + }; + stencil_for_each_extended<2>(rhs_dedy, {0, 1}, {0, 0}, e, dvdt); + + auto rhs_div = [dt, dx_inv, dy_inv](auto args) { + auto [u, v, out] = args; + auto dudx = (u(0, 0) - u(-1, 0)) * dx_inv; + auto dvdy = (v(0, 1) - v(0, 0)) * dy_inv; + out(0, 0) = -dt * h * (dudx + dvdy); + }; + stencil_for_each_extended<2>(rhs_div, {1, 0}, {0, 0}, u, v, dedt); +} + +int run( + std::size_t n, std::size_t redundancy, std::size_t steps, + std::function iter_callback = []() {}) { + // construct grid + // number of cells in x, y direction + std::size_t nx = n; + std::size_t ny = n; + const double xmin = -1, xmax = 1; + const double ymin = -1, ymax = 1; + ArakawaCGrid grid(xmin, xmax, ymin, ymax, nx, ny); + + auto dist = dr::mp::distribution().halo(1).redundancy(redundancy); + + // statistics + std::size_t nread, nwrite, nflop; + calculate_complexity(nx, ny, nread, nwrite, nflop); + + if (comm_rank == 0) { + std::cout << "Using backend: dr" << std::endl; + std::cout << "Grid size: " << nx << " x " << ny << std::endl; + std::cout << "Elevation DOFs: " << nx * ny << std::endl; + std::cout << "Velocity DOFs: " << (nx + 1) * ny + nx * (ny + 1) + << std::endl; + std::cout << "Total DOFs: " << nx * ny + (nx + 1) * ny + nx * (ny + 1); + std::cout << std::endl; + } + + // compute time step + double c = std::sqrt(g * h); + double alpha = 0.5; + double dt = alpha * std::min(grid.dx, grid.dy) / c; + std::size_t nt = steps; + dt = 1e-5; + double t_end = static_cast(nt) * dt; + double t_export = 25 * dt; + + if (comm_rank == 0) { + std::cout << "Time step: " << dt << " s" << std::endl; + std::cout << "Total run time: " << std::fixed << std::setprecision(1); + std::cout << t_end << " s, "; + std::cout << nt << " time steps" << std::endl; + std::cout << "Redundancy " << redundancy << std::endl; + } + + std::cout << "before e\n"; + // state variables + // water elevation at T points + Array e({nx + 1, ny}, dist); + std::cout << "after e\n"; + dr::mp::fill(e, 0.0); + std::cout << "after fill e\n"; + // x velocity at U points + Array u({nx + 1, ny}, dist); + dr::mp::fill(u, 0.0); + // y velocity at V points + Array v({nx + 1, ny + 1}, dist); + dr::mp::fill(v, 0.0); + + // state for RK stages + Array e1({nx + 1, ny}, dist); + Array u1({nx + 1, ny}, dist); + Array v1({nx + 1, ny + 1}, dist); + Array e2({nx + 1, ny}, dist); + Array u2({nx + 1, ny}, dist); + Array v2({nx + 1, ny + 1}, dist); + + // time tendencies + // NOTE not needed if rhs kernels are fused with RK stage assignment + Array dedt({nx + 1, ny}, dist); + Array dudt({nx + 1, ny}, dist); + Array dvdt({nx + 1, ny + 1}, dist); + + std::cout << "After all arrays\n"; + + dr::mp::fill(dedt, 0); + dr::mp::fill(dudt, 0); + dr::mp::fill(dvdt, 0); + std::cout << "After fill\n"; + + dr::mp::halo(dedt).exchange(); + dr::mp::halo(dudt).exchange(); + dr::mp::halo(dvdt).exchange(); + std::cout << "After first exchange\n"; + + auto init_op = [xmin, ymin, grid](auto index, auto v) { + auto &[o] = v; + + std::size_t global_i = index[0]; + if (global_i > 0) { + std::size_t global_j = index[1]; + T x = xmin + grid.dx / 2 + static_cast(global_i - 1) * grid.dx; + T y = ymin + grid.dy / 2 + static_cast(global_j) * grid.dy; + o = initial_elev(x, y, grid.lx, grid.ly); + } + }; + dr::mp::for_each(init_op, e); + std::cout << "After mp::for_each\n"; + + auto add = [](auto ops) { return ops.first + ops.second; }; + auto max = [](double x, double y) { return std::max(x, y); }; + auto rk_update2 = [](auto ops) { + return 0.75 * std::get<0>(ops) + + 0.25 * (std::get<1>(ops) + std::get<2>(ops)); + }; + auto rk_update3 = [](auto ops) { + return 1.0 / 3.0 * std::get<0>(ops) + + 2.0 / 3.0 * (std::get<1>(ops) + std::get<2>(ops)); + }; + + std::size_t i_export = 0; + double next_t_export = 0.0; + double t = 0.0; + double initial_v = 0.0; + auto tic = std::chrono::steady_clock::now(); + + // RK stage 1: u1 = u + dt*rhs(u) + auto stage_1 = [&] { + rhs(u, v, e, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); + dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); + dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); + }; + // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) + auto stage_2 = [&] { + rhs(u1, v1, e1, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, u1, dudt), u2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(v, v1, dvdt), v2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(e, e1, dedt), e2.begin(), rk_update2); + }; + // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) + auto stage_3 = [&] { + rhs(u2, v2, e2, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, u2, dudt), u.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(v, v2, dvdt), v.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); + }; + + for (std::size_t i = 0; i < nt + 1; i++) { + std::cout << "i = " << i << "\n"; + t = static_cast(i) * dt; + + if (t >= next_t_export - 1e-8) { + + double elev_max = dr::mp::reduce(e, static_cast(0), max); + double u_max = dr::mp::reduce(u, static_cast(0), max); + + double total_v = (dr::mp::reduce(e, static_cast(0), std::plus{}) + h) * + grid.dx * grid.dy; + if (i == 0) { + initial_v = total_v; + } + double diff_v = total_v - initial_v; + + if (comm_rank == 0) { + printf("%2lu %4lu %.3f ", i_export, i, t); + printf("elev=%7.5f ", elev_max); + printf("u=%7.5f ", u_max); + printf("dV=% 6.3e ", diff_v); + printf("\n"); + } + if (elev_max > 1e3) { + if (comm_rank == 0) { + std::cout << "Invalid elevation value: " << elev_max << std::endl; + } + return 1; + } + i_export += 1; + next_t_export = static_cast(i_export) * t_export; + } + + // step + iter_callback(); + if ((i + 1) % redundancy == 0) { + // phase with communication - once after (redundancy - 1) steps without + // communication + dr::mp::halo(e).exchange(); + dr::mp::halo(u).exchange(); + dr::mp::halo(v).exchange(); + stage_1(); + + dr::mp::halo(u1).exchange(); + dr::mp::halo(v1).exchange(); + dr::mp::halo(e1).exchange(); + stage_2(); + + dr::mp::halo(u2).exchange(); + dr::mp::halo(v2).exchange(); + dr::mp::halo(e2).exchange(); + stage_3(); + } else { + // Phase without communication + stage_1(); + stage_2(); + stage_3(); + } + } + + dr::mp::halo(e).exchange(); + dr::mp::halo(u).exchange(); + dr::mp::halo(v).exchange(); + dr::mp::halo(u1).exchange(); + dr::mp::halo(v1).exchange(); + dr::mp::halo(e1).exchange(); + dr::mp::halo(u2).exchange(); + dr::mp::halo(v2).exchange(); + dr::mp::halo(e2).exchange(); + + auto toc = std::chrono::steady_clock::now(); + std::chrono::duration duration = toc - tic; + if (comm_rank == 0) { + double t_cpu = duration.count(); + double t_step = t_cpu / static_cast(nt); + double read_bw = double(nread) / t_step / (1024 * 1024 * 1024); + double write_bw = double(nwrite) / t_step / (1024 * 1024 * 1024); + double flop_rate = double(nflop) / t_step / (1000 * 1000 * 1000); + double ai = double(nflop) / double(nread + nwrite); + std::cout << "Duration: " << std::setprecision(3) << t_cpu; + std::cout << " s" << std::endl; + std::cout << "Time per step: " << std::setprecision(2) << t_step * 1000; + std::cout << " ms" << std::endl; + std::cout << "Reads : " << std::setprecision(3) << read_bw; + std::cout << " GB/s" << std::endl; + std::cout << "Writes: " << std::setprecision(3) << write_bw; + std::cout << " GB/s" << std::endl; + std::cout << "FLOP/s: " << std::setprecision(3) << flop_rate; + std::cout << " GFLOP/s" << std::endl; + std::cout << "Arithmetic intensity: " << std::setprecision(5) << ai; + std::cout << " FLOP/Byte" << std::endl; + } + + // Compute error against exact solution + Array e_exact({nx + 1, ny}, dist); + dr::mp::fill(e_exact, 0.0); + Array error({nx + 1, ny}, dist); + + auto exact_op = [xmin, ymin, grid, t](auto index, auto v) { + auto &[o] = v; + + std::size_t global_i = index[0]; + if (global_i > 0) { + std::size_t global_j = index[1]; + T x = xmin + grid.dx / 2 + static_cast(global_i - 1) * grid.dx; + T y = ymin + grid.dy / 2 + static_cast(global_j) * grid.dy; + o = exact_elev(x, y, t, grid.lx, grid.ly); + } + }; + dr::mp::for_each(exact_op, e_exact); + dr::mp::halo(e_exact).exchange(); + auto error_kernel = [](auto ops) { + auto err = ops.first - ops.second; + return err * err; + }; + dr::mp::transform(dr::mp::views::zip(e, e_exact), error.begin(), + error_kernel); + double err_L2 = dr::mp::reduce(error, static_cast(0), std::plus{}) * + grid.dx * grid.dy / grid.lx / grid.ly; + err_L2 = std::sqrt(err_L2); + if (comm_rank == 0) { + std::cout << "L2 error: " << std::setw(7) << std::scientific; + std::cout << std::setprecision(5) << err_L2 << std::endl; + } + return 0; +} + +} // namespace WaveEquation + +#ifdef STANDALONE_BENCHMARK + +int main(int argc, char *argv[]) { + + MPI_Init(&argc, &argv); + comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); + + cxxopts::Options options_spec(argv[0], "wave equation"); + // clang-format off + options_spec.add_options() + ("n", "Grid size", cxxopts::value()->default_value("128")) + ("t,steps", "Run a fixed number of time steps.", cxxopts::value()->default_value("100")) + ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value()->default_value("2")) + ("sycl", "Execute on SYCL device") + ("l,log", "enable logging") + ("logprefix", "appended .RANK.log", cxxopts::value()->default_value("dr")) + ("device-memory", "Use device memory") + ("h,help", "Print help"); + // clang-format on + + cxxopts::ParseResult options; + try { + options = options_spec.parse(argc, argv); + } catch (const cxxopts::OptionParseException &e) { + std::cout << options_spec.help() << "\n"; + exit(1); + } + + std::unique_ptr logfile; + if (options.count("log")) { + logfile = + std::make_unique(options["logprefix"].as() + + fmt::format(".{}.log", comm_rank)); + dr::drlog.set_file(*logfile); + } + + if (options.count("sycl")) { +#ifdef SYCL_LANGUAGE_VERSION + sycl::queue q = dr::mp::select_queue(); + std::cout << "Run on: " + << q.get_device().get_info() << "\n"; + dr::mp::init(q, options.count("device-memory") ? sycl::usm::alloc::device + : sycl::usm::alloc::shared); +#else + std::cout << "Sycl support requires icpx\n"; + exit(1); +#endif + } else { + if (comm_rank == 0) { + std::cout << "Run on: CPU\n"; + } + dr::mp::init(); + } + + std::size_t n = options["n"].as(); + std::size_t redundancy = options["r"].as(); + std::size_t steps = options["t"].as(); + + auto error = WaveEquation::run(n, redundancy, steps); + dr::mp::finalize(); + MPI_Finalize(); + return error; +} + +#else + +static void WaveEquation_DR(benchmark::State &state) { + + int n = ::sqrtl(default_vector_size); + + // ugly hack to make it working in reasonable time in benchmarking framework + // drbench.py should specify right size or there should be another size option + // to use here instead of default_vector_size + n /= 4; + + std::size_t nread, nwrite, nflop; + WaveEquation::calculate_complexity(n, n, nread, nwrite, nflop); + Stats stats(state, nread, nwrite, nflop); + + auto iter_callback = [&stats]() { stats.rep(); }; + for (auto _ : state) { + WaveEquation::run(n, true, true, iter_callback); + } +} + +DR_BENCHMARK(WaveEquation_DR); + +#endif diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index 8851208198..2b841be43b 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include namespace dr::mp { @@ -62,4 +64,187 @@ DI for_each_n(DI first, I n, auto op) { return last; } +namespace __detail { +template +using stencil_index_type = dr::__detail::dr_extents; + +void stencil_for_each_extended_1(auto op, stencil_index_type<1> begin, + stencil_index_type<1> end, const auto &segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { return std::get<1>(a) - std::get<0>(a); }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if (rng::empty(distance | std::views::filter(is_zero))) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [begin](auto &&seg) { + return std::make_pair(seg.begin() + seg.begin_stencil(begin)[0], + seg.extents()); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(dr::ranges::local(seg_info.first + index)), + seg_info.second); + }); + op(stencils); + }; + if (mp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + dr::__detail::parallel_for(dr::mp::sycl_queue(), + sycl::range<1>(distance[0]), do_point) + .wait(); +#else + assert(false); +#endif + } else { + for (std::size_t i = 0; i < distance[0]; i++) { + do_point(i); + } + } +} + +void stencil_for_each_extended_2(auto op, stencil_index_type<2> &begin, + stencil_index_type<2> end, const auto &segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { + auto x = std::get<0>(a); + auto y = std::get<1>(a); + return y > x ? y - x : 0; + }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if (rng::empty(distance | std::views::filter(is_zero))) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { + auto ext = seg.root_mdspan().extents(); + auto begin_stencil = seg.begin_stencil(begin); + return std::make_pair(md::mdspan(std::to_address(&seg.mdspan_extended()( + begin_stencil[0], begin_stencil[1])), + ext), + ext); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(&seg_info.first(index[0], index[1])), + seg_info.second); + }); + op(stencils); + }; + if (mp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + dr::__detail::parallel_for(dr::mp::sycl_queue(), + sycl::range<2>(distance[0], distance[1]), + do_point) + .wait(); +#else + assert(false); +#endif + } else { + for (std::size_t i = 0; i < distance[0]; i++) { + for (std::size_t j = 0; j < distance[1]; j++) { + do_point(stencil_index_type<2>{i, j}); + } + } + } +} + +void stencil_for_each_extended_3(auto op, stencil_index_type<3> &begin, + stencil_index_type<3> end, const auto &segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { + auto x = std::get<0>(a); + auto y = std::get<1>(a); + return y > x ? y - x : 0; + }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if (rng::empty(distance | std::views::filter(is_zero))) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { + auto ext = seg.root_mdspan().extents(); + auto begin_stencil = seg.begin_stencil(begin); + return std::make_pair( + md::mdspan(std::to_address(&seg.mdspan_extended()( + begin_stencil[0], begin_stencil[1], begin_stencil[2])), + ext), + ext); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(&seg_info.first(index[0], index[1], index[2])), + seg_info.second); + }); + op(stencils); + }; + if (mp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + dr::__detail::parallel_for( + dr::mp::sycl_queue(), + sycl::range<3>(distance[0], distance[1], distance[2]), do_point) + .wait(); +#else + assert(false); +#endif + } else { + for (std::size_t i = 0; i < distance[0]; i++) { + for (std::size_t j = 0; j < distance[1]; j++) { + for (std::size_t k = 0; k < distance[3]; k++) { + do_point(stencil_index_type<3>{i, j, k}); + } + } + } + } +} +} // namespace __detail + +template +requires(1 <= Rank && Rank <= 3) void stencil_for_each_extended( + auto op, __detail::stencil_index_type begin, + __detail::stencil_index_type end, + dr::distributed_range auto &&...drs) { + auto ranges = std::tie(drs...); + auto &&dr0 = std::get<0>(ranges); + if (rng::empty(dr0)) { + return; + } + + auto all_segments = rng::views::zip(dr::ranges::segments(drs)...); + for (const auto &segs : all_segments) { + if constexpr (Rank == 1) { + __detail::stencil_for_each_extended_1(op, begin, end, segs); + } else if constexpr (Rank == 2) { + __detail::stencil_for_each_extended_2(op, begin, end, segs); + } else if constexpr (Rank == 3) { + __detail::stencil_for_each_extended_3(op, begin, end, segs); + } else { + static_assert(false, "Not supported"); // sycl for_each does not support + // more than 3 dimensions + } + } + barrier(); +} + } // namespace dr::mp diff --git a/include/dr/mp/algorithms/md_for_each.hpp b/include/dr/mp/algorithms/md_for_each.hpp index 9d92f0fe98..e0b29578e5 100644 --- a/include/dr/mp/algorithms/md_for_each.hpp +++ b/include/dr/mp/algorithms/md_for_each.hpp @@ -26,12 +26,12 @@ struct any { template concept one_argument = requires(F &f) { - { f(Arg1{}) }; + {f(Arg1{})}; }; template concept two_arguments = requires(F &f) { - { f(Arg1{}, Arg2{}) }; + {f(Arg1{}, Arg2{})}; }; }; // namespace dr::mp::__detail diff --git a/include/dr/mp/alignment.hpp b/include/dr/mp/alignment.hpp index 71ce55dcc9..790e9620a4 100644 --- a/include/dr/mp/alignment.hpp +++ b/include/dr/mp/alignment.hpp @@ -11,7 +11,9 @@ namespace dr::mp { template -concept has_segments = requires(T &t) { dr::ranges::segments(t); }; +concept has_segments = requires(T &t) { + dr::ranges::segments(t); +}; template concept no_segments = !has_segments; diff --git a/include/dr/mp/containers/distributed_mdarray.hpp b/include/dr/mp/containers/distributed_mdarray.hpp index 14cfdd2945..8050b4b035 100644 --- a/include/dr/mp/containers/distributed_mdarray.hpp +++ b/include/dr/mp/containers/distributed_mdarray.hpp @@ -17,20 +17,21 @@ template class distributed_mdarray { distributed_mdarray(dr::__detail::dr_extents shape, distribution dist = distribution()) : tile_shape_(tile_shape(shape)), dv_(dv_size(), dv_dist(dist, shape)), - md_view_(make_md_view(dv_, shape, tile_shape_)) {} + md_view_(make_md_view(dv_, shape, tile_shape_, dist)), dist_(dist) {} auto begin() const { return rng::begin(md_view_); } auto end() const { return rng::end(md_view_); } auto size() const { return rng::size(md_view_); } auto operator[](auto n) { return md_view_[n]; } - auto segments() { return dr::ranges::segments(md_view_); } + auto segments() const { return dr::ranges::segments(md_view_); } auto &halo() const { return dr::mp::halo(dv_); } auto mdspan() const { return md_view_.mdspan(); } auto extent(std::size_t r) const { return mdspan().extent(r); } auto grid() { return md_view_.grid(); } auto view() const { return md_view_; } + auto dist() const { return dist_; } auto operator==(const distributed_mdarray &other) const { return std::equal(begin(), end(), other.begin()); @@ -70,16 +71,17 @@ template class distributed_mdarray { // This wrapper seems to avoid an issue with template argument // deduction for mdspan_view static auto make_md_view(const DV &dv, shape_type shape, - shape_type tile_shape) { - return views::mdspan(dv, shape, tile_shape); + shape_type tile_shape, distribution dist) { + return views::mdspan(dv, shape, tile_shape, dist); } shape_type tile_shape_; DV dv_; - using mdspan_type = - decltype(make_md_view(std::declval(), std::declval(), - std::declval())); + using mdspan_type = decltype(make_md_view( + std::declval(), std::declval(), + std::declval(), std::declval())); mdspan_type md_view_; + distribution dist_; }; template diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 2611963064..0c8af9d4c0 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -276,6 +276,8 @@ template class distributed_vector { void fence() { backend.fence(); } + const auto &dist() const { return distribution_; } + private: void init(auto size, auto dist) { size_ = size; @@ -292,6 +294,8 @@ template class distributed_vector { segment_size_ = gran * std::max({(size / gran + comm_size - 1) / comm_size, hb.prev / gran, hb.next / gran}); + extended_local_data_distribution ext_dist(segment_size_, size_, hb); + data_size_ = segment_size_ + hb.prev + hb.next; if (size_ > 0) { @@ -303,7 +307,8 @@ template class distributed_vector { std::size_t segment_index = 0; for (std::size_t i = 0; i < size; i += segment_size_) { segments_.emplace_back(this, segment_index++, - std::min(segment_size_, size - i), data_size_); + std::min(segment_size_, size - i), data_size_, + ext_dist); } fence(); diff --git a/include/dr/mp/containers/distribution.hpp b/include/dr/mp/containers/distribution.hpp index 44f4f43eb4..6fa00145a1 100644 --- a/include/dr/mp/containers/distribution.hpp +++ b/include/dr/mp/containers/distribution.hpp @@ -22,7 +22,19 @@ struct distribution { return *this; } - auto halo() const { return halo_bounds_; } + auto halo() const { + halo_bounds halo_bounds_resized = halo_bounds_; + halo_bounds_resized.prev *= redundancy_; + halo_bounds_resized.next *= redundancy_; + return halo_bounds_resized; + } + + distribution &redundancy(std::size_t redundancy) { + redundancy_ = redundancy; + return *this; + } + + auto redundancy() const { return redundancy_; } distribution &periodic(bool periodic) { halo_bounds_.periodic = periodic; @@ -40,7 +52,25 @@ struct distribution { private: halo_bounds halo_bounds_; + std::size_t redundancy_ = 1; std::size_t granularity_ = 1; }; +struct extended_local_data_distribution { + std::size_t begin; + std::size_t end; + std::size_t segment_size; + + extended_local_data_distribution() = default; + extended_local_data_distribution(std::size_t segment_size, std::size_t size, + halo_bounds hb) + : segment_size(segment_size) { + if (default_comm().rank() * segment_size >= hb.prev) + begin = default_comm().rank() * segment_size - hb.prev; + else + begin = 0; + end = std::min((default_comm().rank() + 1) * segment_size + hb.next, size); + } +}; + } // namespace dr::mp diff --git a/include/dr/mp/containers/segment.hpp b/include/dr/mp/containers/segment.hpp index 56724ac61b..69c0ace053 100644 --- a/include/dr/mp/containers/segment.hpp +++ b/include/dr/mp/containers/segment.hpp @@ -215,15 +215,23 @@ template class dv_segment { private: using iterator = dv_segment_iterator; + using stencil_index_type = dr::__detail::dr_extents<1>; + public: using difference_type = std::ptrdiff_t; dv_segment() = default; dv_segment(DV *dv, std::size_t segment_index, std::size_t size, - std::size_t reserved) { + std::size_t reserved, + const extended_local_data_distribution &ext_dist) { dv_ = dv; segment_index_ = segment_index; size_ = size; reserved_ = reserved; + ext_dist_ = ext_dist; + + begin_index_ = segment_index * ext_dist.segment_size; + end_index_ = begin_index_ + size_; + assert(dv_ != nullptr); } @@ -236,6 +244,24 @@ template class dv_segment { auto end() const { return begin() + size(); } auto reserved() const { return reserved_; } + [[nodiscard]] stencil_index_type + begin_stencil(stencil_index_type stencil) const { + return {std::min(std::max(begin_index_, ext_dist_.begin + stencil[0]), + end_index_) - + begin_index_}; + } + [[nodiscard]] stencil_index_type + end_stencil(stencil_index_type stencil) const { + return {std::max(std::min(end_index_, ext_dist_.end - stencil[0]), + begin_index_) - + begin_index_}; + } + [[nodiscard]] std::pair + stencil(stencil_index_type begin, stencil_index_type end) const { + return {begin_stencil(begin), end_stencil(end)}; + } + auto extents() const { return md::extents(reserved_); } + auto operator[](difference_type n) const { return *(begin() + n); } bool is_local() const { return segment_index_ == default_comm().rank(); } @@ -245,6 +271,10 @@ template class dv_segment { std::size_t segment_index_; std::size_t size_; std::size_t reserved_; + + std::size_t begin_index_; + std::size_t end_index_; + extended_local_data_distribution ext_dist_; }; // dv_segment // @@ -253,7 +283,7 @@ template class dv_segment { // template concept has_halo_method = dr::distributed_range && requires(DR &&dr) { - { rng::begin(dr::ranges::segments(dr)[0]).halo() }; + {rng::begin(dr::ranges::segments(dr)[0]).halo()}; }; auto &halo(has_halo_method auto &&dr) { diff --git a/include/dr/mp/halo.hpp b/include/dr/mp/halo.hpp index 7f7b7dbdb1..3d76fdfe7d 100644 --- a/include/dr/mp/halo.hpp +++ b/include/dr/mp/halo.hpp @@ -4,430 +4,7 @@ #pragma once -#include -#include - -namespace dr::mp { - -enum class halo_tag { - invalid, - forward, - reverse, - index, -}; - -template class halo_impl { - using T = typename Group::element_type; - using Memory = typename Group::memory_type; - -public: - using group_type = Group; - - // Destructor frees buffer_, so cannot copy - halo_impl(const halo_impl &) = delete; - halo_impl operator=(const halo_impl &) = delete; - - /// halo constructor - halo_impl(communicator comm, const std::vector &owned_groups, - const std::vector &halo_groups, - const Memory &memory = Memory()) - : comm_(comm), halo_groups_(halo_groups), owned_groups_(owned_groups), - memory_(memory) { - DRLOG("Halo constructed with {}/{} owned/halo", rng::size(owned_groups), - rng::size(halo_groups)); - buffer_size_ = 0; - std::size_t i = 0; - std::vector buffer_index; - for (auto &g : owned_groups_) { - buffer_index.push_back(buffer_size_); - g.request_index = i++; - buffer_size_ += g.buffer_size(); - map_.push_back(&g); - } - for (auto &g : halo_groups_) { - buffer_index.push_back(buffer_size_); - g.request_index = i++; - buffer_size_ += g.buffer_size(); - map_.push_back(&g); - } - buffer_ = memory_.allocate(buffer_size_); - assert(buffer_ != nullptr); - i = 0; - for (auto &g : owned_groups_) { - g.buffer = &buffer_[buffer_index[i++]]; - } - for (auto &g : halo_groups_) { - g.buffer = &buffer_[buffer_index[i++]]; - } - requests_.resize(i); - } - - /// Begin a halo exchange - void exchange_begin() { - DRLOG("Halo exchange receiving"); - receive(halo_groups_); - DRLOG("Halo exchange sending"); - send(owned_groups_); - DRLOG("Halo exchange begin finished"); - } - - /// Complete a halo exchange - void exchange_finalize() { - DRLOG("Halo exchange finalize started"); - reduce_finalize(); - DRLOG("Halo exchange finalize finished"); - } - - void exchange() { - exchange_begin(); - exchange_finalize(); - } - - /// Begin a halo reduction - void reduce_begin() { - receive(owned_groups_); - send(halo_groups_); - } - - /// Complete a halo reduction - void reduce_finalize(const auto &op) { - for (int pending = rng::size(requests_); pending > 0; pending--) { - int completed; - MPI_Waitany(rng::size(requests_), requests_.data(), &completed, - MPI_STATUS_IGNORE); - DRLOG("reduce_finalize(op) waitany completed: {}", completed); - auto &g = *map_[completed]; - if (g.receive && g.buffered) { - g.unpack(op); - } - } - } - - /// Complete a halo reduction - void reduce_finalize() { - for (int pending = rng::size(requests_); pending > 0; pending--) { - int completed; - MPI_Waitany(rng::size(requests_), requests_.data(), &completed, - MPI_STATUS_IGNORE); - DRLOG("reduce_finalize() waitany completed: {}", completed); - auto &g = *map_[completed]; - if (g.receive && g.buffered) { - g.unpack(); - } - } - } - - struct second_op { - T operator()(T &a, T &b) const { return b; } - } second; - - struct plus_op { - T operator()(T &a, T &b) const { return a + b; } - } plus; - - struct max_op { - T operator()(T &a, T &b) const { return std::max(a, b); } - } max; - - struct min_op { - T operator()(T &a, T &b) const { return std::min(a, b); } - } min; - - struct multiplies_op { - T operator()(T &a, T &b) const { return a * b; } - } multiplies; - - ~halo_impl() { - if (buffer_) { - memory_.deallocate(buffer_, buffer_size_); - buffer_ = nullptr; - } - } - -private: - void send(std::vector &sends) { - for (auto &g : sends) { - g.pack(); - g.receive = false; - DRLOG("sending: {}", g.request_index); - comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(), - &requests_[g.request_index]); - } - } - - void receive(std::vector &receives) { - for (auto &g : receives) { - g.receive = true; - DRLOG("receiving: {}", g.request_index); - comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(), - &requests_[g.request_index]); - } - } - - communicator comm_; - std::vector halo_groups_, owned_groups_; - T *buffer_ = nullptr; - std::size_t buffer_size_; - std::vector requests_; - std::vector map_; - Memory memory_; -}; - -template > class index_group { -public: - using element_type = T; - using memory_type = Memory; - T *buffer = nullptr; - std::size_t request_index; - bool receive; - bool buffered; - - /// Constructor - index_group(T *data, std::size_t rank, - const std::vector &indices, const Memory &memory) - : memory_(memory), data_(data), rank_(rank) { - buffered = false; - for (std::size_t i = 0; i < rng::size(indices) - 1; i++) { - buffered = buffered || (indices[i + 1] - indices[i] != 1); - } - indices_size_ = rng::size(indices); - indices_ = memory_.template allocate(indices_size_); - assert(indices_ != nullptr); - memory_.memcpy(indices_, indices.data(), - indices_size_ * sizeof(std::size_t)); - } - - index_group(const index_group &o) - : buffer(o.buffer), request_index(o.request_index), receive(o.receive), - buffered(o.buffered), memory_(o.memory_), data_(o.data_), - rank_(o.rank_), indices_size_(o.indices_size_), tag_(o.tag_) { - indices_ = memory_.template allocate(indices_size_); - assert(indices_ != nullptr); - memory_.memcpy(indices_, o.indices_, indices_size_ * sizeof(std::size_t)); - } - - void unpack(const auto &op) { - T *dpt = data_; - auto n = indices_size_; - auto *ipt = indices_; - auto *b = buffer; - memory_.offload([=]() { - for (std::size_t i = 0; i < n; i++) { - dpt[ipt[i]] = op(dpt[ipt[i]], b[i]); - } - }); - } - - void pack() { - T *dpt = data_; - auto n = indices_size_; - auto *ipt = indices_; - auto *b = buffer; - memory_.offload([=]() { - for (std::size_t i = 0; i < n; i++) { - b[i] = dpt[ipt[i]]; - } - }); - } - - std::size_t buffer_size() { - if (buffered) { - return indices_size_; - } - return 0; - } - - T *data_pointer() { - if (buffered) { - return buffer; - } else { - return &data_[indices_[0]]; - } - } - - std::size_t data_size() { return indices_size_; } - - std::size_t rank() { return rank_; } - auto tag() { return tag_; } - - ~index_group() { - if (indices_) { - memory_.template deallocate(indices_, indices_size_); - indices_ = nullptr; - } - } - -private: - Memory memory_; - T *data_ = nullptr; - std::size_t rank_; - std::size_t indices_size_; - std::size_t *indices_; - halo_tag tag_ = halo_tag::index; -}; - -template -using unstructured_halo_impl = halo_impl>; - -template > -class unstructured_halo : public unstructured_halo_impl { -public: - using group_type = index_group; - using index_map = std::pair>; - - /// - /// Constructor - /// - unstructured_halo(communicator comm, T *data, - const std::vector &owned, - const std::vector &halo, - const Memory &memory = Memory()) - : unstructured_halo_impl( - comm, make_groups(comm, data, owned, memory), - make_groups(comm, data, halo, memory), memory) {} - -private: - static std::vector make_groups(communicator comm, T *data, - const std::vector &map, - const Memory &memory) { - std::vector groups; - for (auto const &[rank, indices] : map) { - groups.emplace_back(data, rank, indices, memory); - } - return groups; - } -}; - -template > class span_group { -public: - using element_type = T; - using memory_type = Memory; - T *buffer = nullptr; - std::size_t request_index = 0; - bool receive = false; - bool buffered = false; - - span_group(std::span data, std::size_t rank, halo_tag tag) - : data_(data), rank_(rank), tag_(tag) { -#ifdef SYCL_LANGUAGE_VERSION - if (use_sycl() && sycl_mem_kind() == sycl::usm::alloc::shared) { - buffered = true; - } -#endif - } - - void unpack() { - if (buffered) { - if (mp::use_sycl()) { - __detail::sycl_copy(buffer, buffer + rng::size(data_), data_.data()); - } else { - std::copy(buffer, buffer + rng::size(data_), data_.data()); - } - } - } - - void pack() { - if (buffered) { - if (mp::use_sycl()) { - __detail::sycl_copy(data_.data(), data_.data() + rng::size(data_), - buffer); - } else { - std::copy(data_.begin(), data_.end(), buffer); - } - } - } - std::size_t buffer_size() { return rng::size(data_); } - - std::size_t data_size() { return rng::size(data_); } - - T *data_pointer() { - if (buffered) { - return buffer; - } else { - return data_.data(); - } - } - - std::size_t rank() { return rank_; } - - auto tag() { return tag_; } - -private: - Memory memory_; - std::span data_; - std::size_t rank_; - halo_tag tag_ = halo_tag::invalid; -}; - -struct halo_bounds { - std::size_t prev = 0, next = 0; - bool periodic = false; -}; - -template -using span_halo_impl = halo_impl>; - -template > -class span_halo : public span_halo_impl { -public: - using group_type = span_group; - - span_halo() : span_halo_impl(communicator(), {}, {}) {} - - span_halo(communicator comm, T *data, std::size_t size, halo_bounds hb) - : span_halo_impl(comm, owned_groups(comm, {data, size}, hb), - halo_groups(comm, {data, size}, hb)) { - check(size, hb); - } - - span_halo(communicator comm, std::span span, halo_bounds hb) - : span_halo_impl(comm, owned_groups(comm, span, hb), - halo_groups(comm, span, hb)) {} - -private: - void check(auto size, auto hb) { - assert(size >= hb.prev + hb.next + std::max(hb.prev, hb.next)); - } - - static std::vector - owned_groups(communicator comm, std::span span, halo_bounds hb) { - std::vector owned; - DRLOG("owned groups {}/{} first/last", comm.first(), comm.last()); - if (hb.next > 0 && (hb.periodic || !comm.first())) { - owned.emplace_back(span.subspan(hb.prev, hb.next), comm.prev(), - halo_tag::reverse); - } - if (hb.prev > 0 && (hb.periodic || !comm.last())) { - owned.emplace_back( - span.subspan(rng::size(span) - (hb.prev + hb.next), hb.prev), - comm.next(), halo_tag::forward); - } - return owned; - } - - static std::vector - halo_groups(communicator comm, std::span span, halo_bounds hb) { - std::vector halo; - if (hb.prev > 0 && (hb.periodic || !comm.first())) { - halo.emplace_back(span.first(hb.prev), comm.prev(), halo_tag::forward); - } - if (hb.next > 0 && (hb.periodic || !comm.last())) { - halo.emplace_back(span.last(hb.next), comm.next(), halo_tag::reverse); - } - return halo; - } -}; - -} // namespace dr::mp - -#ifdef DR_FORMAT - -template <> -struct fmt::formatter : formatter { - template - auto format(dr::mp::halo_bounds hb, FmtContext &ctx) { - return fmt::format_to(ctx.out(), "prev: {} next: {}", hb.prev, hb.next); - } -}; - -#endif +#include +#include +#include +#include diff --git a/include/dr/mp/halo/format.hpp b/include/dr/mp/halo/format.hpp new file mode 100644 index 0000000000..2b602c5db6 --- /dev/null +++ b/include/dr/mp/halo/format.hpp @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +#ifdef DR_FORMAT + +template <> +struct fmt::formatter : formatter { + template + auto format(dr::mp::halo_bounds hb, FmtContext &ctx) { + return fmt::format_to(ctx.out(), "prev: {} next: {}", hb.prev, hb.next); + } +}; + +#endif diff --git a/include/dr/mp/halo/group.hpp b/include/dr/mp/halo/group.hpp new file mode 100644 index 0000000000..bcf24727ad --- /dev/null +++ b/include/dr/mp/halo/group.hpp @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +namespace dr::mp { + + + template > + class index_group { + public: + using element_type = T; + using memory_type = Memory; + T *buffer = nullptr; + std::size_t request_index; + bool receive; + bool buffered; + + /// Constructor + index_group(T *data, std::size_t rank, + const std::vector &indices, const Memory &memory) + : memory_(memory), data_(data), rank_(rank) { + buffered = false; + for (std::size_t i = 0; i < rng::size(indices) - 1; i++) { + buffered = buffered || (indices[i + 1] - indices[i] != 1); + } + indices_size_ = rng::size(indices); + indices_ = memory_.template allocate(indices_size_); + assert(indices_ != nullptr); + memory_.memcpy(indices_, indices.data(), + indices_size_ * sizeof(std::size_t)); + } + + index_group(const index_group &o) + : buffer(o.buffer), request_index(o.request_index), receive(o.receive), + buffered(o.buffered), memory_(o.memory_), data_(o.data_), + rank_(o.rank_), indices_size_(o.indices_size_), tag_(o.tag_) { + indices_ = memory_.template allocate(indices_size_); + assert(indices_ != nullptr); + memory_.memcpy(indices_, o.indices_, indices_size_ * sizeof(std::size_t)); + } + + void unpack(const auto &op) { + T *dpt = data_; + auto n = indices_size_; + auto *ipt = indices_; + auto *b = buffer; + memory_.offload([=]() { + for (std::size_t i = 0; i < n; i++) { + dpt[ipt[i]] = op(dpt[ipt[i]], b[i]); + } + }); + } + + void pack() { + T *dpt = data_; + auto n = indices_size_; + auto *ipt = indices_; + auto *b = buffer; + memory_.offload([=]() { + for (std::size_t i = 0; i < n; i++) { + b[i] = dpt[ipt[i]]; + } + }); + } + + std::size_t buffer_size() { + if (buffered) { + return indices_size_; + } + return 0; + } + + T *data_pointer() { + if (buffered) { + return buffer; + } else { + return &data_[indices_[0]]; + } + } + + std::size_t data_size() { return indices_size_; } + + std::size_t rank() { return rank_; } + auto tag() { return tag_; } + + ~index_group() { + if (indices_) { + memory_.template deallocate(indices_, indices_size_); + indices_ = nullptr; + } + } + + private: + Memory memory_; + T *data_ = nullptr; + std::size_t rank_; + std::size_t indices_size_; + std::size_t *indices_; + halo_tag tag_ = halo_tag::index; + }; + + template > class span_group { + public: + using element_type = T; + using memory_type = Memory; + T *buffer = nullptr; + std::size_t request_index = 0; + bool receive = false; + bool buffered = false; + + span_group(std::span data, std::size_t rank, halo_tag tag) + : data_(data), rank_(rank), tag_(tag) { +#ifdef SYCL_LANGUAGE_VERSION + if (mp::use_sycl() && mp::sycl_mem_kind() == sycl::usm::alloc::shared) { + buffered = true; + } +#endif + } + + /// If span is buffered, push buffer to data + void unpack() { + if (buffered) { + __detail::sycl_copy(buffer, buffer + rng::size(data_), data_.data()); + } + } + + /// If span is buffered, pull data into buffer + void pack() { + if (buffered) { + __detail::sycl_copy(data_.data(), data_.data() + rng::size(data_), buffer); + } + } + + std::size_t buffer_size() { return rng::size(data_); } + + std::size_t data_size() { return rng::size(data_); } + + T *data_pointer() { + if (buffered) { + return buffer; + } else { + return data_.data(); + } + } + + std::size_t rank() { return rank_; } + + auto tag() { return tag_; } + + private: + std::span data_; + std::size_t rank_; + halo_tag tag_ = halo_tag::invalid; + }; +} \ No newline at end of file diff --git a/include/dr/mp/halo/halo.hpp b/include/dr/mp/halo/halo.hpp new file mode 100644 index 0000000000..a8d8b92419 --- /dev/null +++ b/include/dr/mp/halo/halo.hpp @@ -0,0 +1,196 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +namespace dr::mp { + +enum class halo_tag { + invalid, + forward, + reverse, + index, +}; + +struct halo_bounds { + // How many values before and after the data segment are in halo + std::size_t prev = 0, next = 0; + bool periodic = false; +}; + +template class halo_impl { + using T = typename Group::element_type; + using Memory = typename Group::memory_type; + +public: + using group_type = Group; + + // Destructor frees buffer_, so cannot copy + halo_impl(const halo_impl &) = delete; + + halo_impl operator=(const halo_impl &) = delete; + + /// halo constructor + halo_impl(communicator comm, const std::vector &owned_groups, + const std::vector &halo_groups, + const Memory &memory = Memory()) + : comm_(comm), halo_groups_(halo_groups), owned_groups_(owned_groups), + memory_(memory) { + DRLOG("Halo constructed with {}/{} owned/halo", rng::size(owned_groups), + rng::size(halo_groups)); + buffer_size_ = 0; + std::size_t i = 0; + std::vector buffer_index; + for (auto &g : owned_groups_) { + buffer_index.push_back(buffer_size_); + g.request_index = i++; + buffer_size_ += g.buffer_size(); + map_.push_back(&g); + } + for (auto &g : halo_groups_) { + buffer_index.push_back(buffer_size_); + g.request_index = i++; + buffer_size_ += g.buffer_size(); + map_.push_back(&g); + } + buffer_ = memory_.allocate(buffer_size_); + assert(buffer_ != nullptr); + i = 0; + for (auto &g : owned_groups_) { + g.buffer = &buffer_[buffer_index[i++]]; + } + for (auto &g : halo_groups_) { + g.buffer = &buffer_[buffer_index[i++]]; + } + requests_.resize(i); + } + + /// Begin a halo exchange + void exchange_begin() { + DRLOG("Halo exchange receiving"); + receive(halo_groups_); + DRLOG("Halo exchange sending"); + send(owned_groups_); + DRLOG("Halo exchange begin finished"); + } + + /// Complete a halo exchange + void exchange_finalize() { + DRLOG("Halo exchange finalize started"); + reduce_finalize(); + DRLOG("Halo exchange finalize finished"); + } + + void exchange() { + exchange_begin(); + exchange_finalize(); + } + + /// Begin a halo reduction + void reduce_begin() { + receive(owned_groups_); + send(halo_groups_); + } + + /// Complete a halo reduction + void reduce_finalize(const auto &op) { + for (int pending = rng::size(requests_); pending > 0; pending--) { + int completed; + MPI_Waitany(rng::size(requests_), requests_.data(), &completed, + MPI_STATUS_IGNORE); + DRLOG("reduce_finalize(op) waitany completed: {}", completed); + auto &g = *map_[completed]; + if (g.receive && g.buffered) { + g.unpack(op); + } + } + } + + /// Complete a halo reduction + void reduce_finalize() { + for (int pending = rng::size(requests_); pending > 0; pending--) { + int completed; + MPI_Waitany(rng::size(requests_), requests_.data(), &completed, + MPI_STATUS_IGNORE); + DRLOG("reduce_finalize() waitany completed: {}", completed); + auto &g = *map_[completed]; + if (g.receive && g.buffered) { + g.unpack(); + } + } + } + + struct second_op { + T operator()(T &a, T &b) const { return b; } + } second; + + struct plus_op { + T operator()(T &a, T &b) const { return a + b; } + } plus; + + struct max_op { + T operator()(T &a, T &b) const { return std::max(a, b); } + } max; + + struct min_op { + T operator()(T &a, T &b) const { return std::min(a, b); } + } min; + + struct multiplies_op { + T operator()(T &a, T &b) const { return a * b; } + } multiplies; + + ~halo_impl() { + if (buffer_) { + memory_.deallocate(buffer_, buffer_size_); + buffer_ = nullptr; + } + } + +private: + void send(std::vector &sends) { + for (auto &g : sends) { + g.pack(); + g.receive = false; + DRLOG("sending: {}", g.request_index); + // std::cout << "send(" << g.data_pointer() << ", " << + // g.data_size() << ", " << g.rank() << ", , " << + // &requests_[g.request_index] << ")\n"; + comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(), + &requests_[g.request_index]); + } + } + + void receive(std::vector &receives) { + for (auto &g : receives) { + g.receive = true; + DRLOG("receiving: {}", g.request_index); + // std::cout << "recv(" << g.data_pointer() << ", " << + // g.data_size() << ", " << g.rank() << ", , " << + // &requests_[g.request_index] << ")\n"; + comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(), + &requests_[g.request_index]); + } + } + + communicator comm_; + std::vector halo_groups_, owned_groups_; + T *buffer_ = nullptr; + std::size_t buffer_size_; + std::vector requests_; + std::vector map_; + Memory memory_; +}; + +template +void halo_exchange(auto &&f, T &dv, Ts &...dvs) { + for (std::size_t step = 0; step < dv.dist().redundancy(); step++) { + f(dv, dvs...); + } + halo(dv).exchange(); +} +} // namespace dr::mp diff --git a/include/dr/mp/halo/instance.hpp b/include/dr/mp/halo/instance.hpp new file mode 100644 index 0000000000..04be307690 --- /dev/null +++ b/include/dr/mp/halo/instance.hpp @@ -0,0 +1,98 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include +#include + +namespace dr::mp { +template +using unstructured_halo_impl = halo_impl>; + +template > +class unstructured_halo : public unstructured_halo_impl { +public: + using group_type = index_group; + using index_map = std::pair>; + + /// + /// Constructor + /// + unstructured_halo(communicator comm, T *data, + const std::vector &owned, + const std::vector &halo, + const Memory &memory = Memory()) + : unstructured_halo_impl( + comm, make_groups(comm, data, owned, memory), + make_groups(comm, data, halo, memory), memory) {} + +private: + static std::vector make_groups(communicator comm, T *data, + const std::vector &map, + const Memory &memory) { + std::vector groups; + for (auto const &[rank, indices] : map) { + groups.emplace_back(data, rank, indices, memory); + } + return groups; + } +}; + +template +using span_halo_impl = halo_impl>; + +template > +class span_halo : public span_halo_impl { +public: + using group_type = span_group; + + span_halo() : span_halo_impl(communicator(), {}, {}) {} + + span_halo(communicator comm, T *data, std::size_t size, halo_bounds hb) + : span_halo_impl(comm, owned_groups(comm, {data, size}, hb), + halo_groups(comm, {data, size}, hb)) { + check(size, hb); + } + + span_halo(communicator comm, std::span span, halo_bounds hb) + : span_halo_impl(comm, owned_groups(comm, span, hb), + halo_groups(comm, span, hb)) {} + +private: + void check(auto size, auto hb) { + assert(size >= hb.prev + hb.next + std::max(hb.prev, hb.next)); + } + + static std::vector + owned_groups(communicator comm, std::span span, halo_bounds hb) { + std::vector owned; + DRLOG("owned groups {}/{} first/last", comm.first(), comm.last()); + if (hb.next > 0 && (hb.periodic || !comm.first())) { + owned.emplace_back(span.subspan(hb.prev, hb.next), comm.prev(), + halo_tag::reverse); + } + if (hb.prev > 0 && (hb.periodic || !comm.last())) { + owned.emplace_back( + span.subspan(rng::size(span) - (hb.prev + hb.next), hb.prev), + comm.next(), halo_tag::forward); + } + return owned; + } + + static std::vector + halo_groups(communicator comm, std::span span, halo_bounds hb) { + std::vector halo; + if (hb.prev > 0 && (hb.periodic || !comm.first())) { + halo.emplace_back(span.first(hb.prev), comm.prev(), halo_tag::forward); + } + if (hb.next > 0 && (hb.periodic || !comm.last())) { + halo.emplace_back(span.last(hb.next), comm.next(), halo_tag::reverse); + } + return halo; + } +}; +} // namespace dr::mp diff --git a/include/dr/mp/views/mdspan_view.hpp b/include/dr/mp/views/mdspan_view.hpp index 1a67f8d78f..44da9f9bbe 100644 --- a/include/dr/mp/views/mdspan_view.hpp +++ b/include/dr/mp/views/mdspan_view.hpp @@ -25,16 +25,24 @@ namespace dr::mp::__detail { template class md_segment : public rng::view_interface> { private: + using stencil_index_type = dr::__detail::dr_extents; + public: using index_type = dr::__detail::dr_extents; md_segment() {} - md_segment(index_type origin, BaseSegment segment, index_type tile_shape) + md_segment(index_type origin, BaseSegment segment, index_type tile_shape, + extended_local_data_distribution ext_dist) : base_(segment), origin_(origin), - mdspan_(local_tile(segment, tile_shape)) { + mdspan_(local_tile(segment, tile_shape)), + mdspan_extended_(local_tile_extended(segment, tile_shape)), + ext_dist_(ext_dist) { dr::drlog.debug(dr::logger::mdspan_view, "md_segment\n origin: {}\n tile shape: {}\n", origin, tile_shape); + + for (std::size_t i = 0; i < Rank; i++) + end_[i] = origin_[i] + tile_shape[i]; } // view_interface uses below to define everything else @@ -45,8 +53,39 @@ class md_segment : public rng::view_interface> { auto halo() const { return dr::mp::halo(base_); } + [[nodiscard]] stencil_index_type + begin_stencil(stencil_index_type stencil) const { + stencil_index_type out; + // Supports only 1d distribution + for (std::size_t i = 0; i < Rank; i++) { + out[i] = std::min(std::max(origin_[i], + (i == 0 ? ext_dist_.begin : origin_[i]) + + stencil[i]), + end_[i]) - + origin_[i]; + } + return out; + } + [[nodiscard]] stencil_index_type + end_stencil(stencil_index_type stencil) const { + stencil_index_type out; + // Supports only 1d distribution + for (std::size_t i = 0; i < Rank; i++) { + out[i] = std::max(std::min(end_[i], (i == 0 ? ext_dist_.end : end_[i]) - + stencil[i]), + origin_[i]) - + origin_[i]; + } + return out; + } + [[nodiscard]] std::pair + stencil(stencil_index_type begin, stencil_index_type end) const { + return {begin_stencil(begin), end_stencil(end)}; + } + // mdspan-specific methods auto mdspan() const { return mdspan_; } + auto mdspan_extended() const { return mdspan_extended_; } auto origin() const { return origin_; } // for slices, this would be the underlying mdspan auto root_mdspan() const { return mdspan(); } @@ -62,9 +101,20 @@ class md_segment : public rng::view_interface> { return md::mdspan(ptr, tile_shape); } + static auto local_tile_extended(BaseSegment segment, + const index_type &tile_shape) { + T *ptr = std::to_address(dr::ranges::local(rng::begin(segment))); + return md::mdspan(ptr, tile_shape); + } + BaseSegment base_; index_type origin_; + index_type end_; md::mdspan, md::layout_stride> mdspan_; + md::mdspan, md::layout_stride> + mdspan_extended_; + + extended_local_data_distribution ext_dist_; }; } // namespace dr::mp::__detail @@ -107,7 +157,11 @@ struct mdspan_view : public rng::view_interface> { return origin; } - static auto make_segments(auto base, auto full_shape, auto tile_shape) { + static auto make_segments(auto base, auto full_shape, auto tile_shape, + auto dist) { + extended_local_data_distribution ext_dist(tile_shape[0], full_shape[0], + dist.halo()); + auto make_md = [=](auto v) { auto clipped = tile_shape; std::size_t segment_index = std::get<0>(v); @@ -117,7 +171,7 @@ struct mdspan_view : public rng::view_interface> { } return __detail::md_segment( segment_index_to_global_origin(segment_index, full_shape, tile_shape), - std::get<1>(v), clipped); + std::get<1>(v), clipped, ext_dist); }; dr::drlog.debug(dr::logger::mdspan_view, @@ -127,11 +181,12 @@ struct mdspan_view : public rng::view_interface> { return dr::__detail::bounded_enumerate(dr::ranges::segments(base)) | rng::views::transform(make_md); } - using segments_type = decltype(make_segments(std::declval(), - full_shape_, tile_shape_)); + using segments_type = + decltype(make_segments(std::declval(), full_shape_, + tile_shape_, std::declval())); public: - mdspan_view(R r, dr::__detail::dr_extents full_shape) + mdspan_view(R r, dr::__detail::dr_extents full_shape, distribution dist) : base_(rng::views::all(std::forward(r))) { full_shape_ = full_shape; @@ -140,15 +195,15 @@ struct mdspan_view : public rng::view_interface> { tile_shape_[0] = decomp::div; replace_decomp(); - segments_ = make_segments(base_, full_shape_, tile_shape_); + segments_ = make_segments(base_, full_shape_, tile_shape_, dist); } mdspan_view(R r, dr::__detail::dr_extents full_shape, - dr::__detail::dr_extents tile_shape) + dr::__detail::dr_extents tile_shape, distribution dist) : base_(rng::views::all(std::forward(r))), full_shape_(full_shape), tile_shape_(tile_shape) { replace_decomp(); - segments_ = make_segments(base_, full_shape_, tile_shape_); + segments_ = make_segments(base_, full_shape_, tile_shape_, dist); } // Base implements random access range @@ -194,17 +249,18 @@ struct mdspan_view : public rng::view_interface> { }; template -mdspan_view(R &&r, dr::__detail::dr_extents extents) +mdspan_view(R &&r, dr::__detail::dr_extents extents, distribution dist) -> mdspan_view, Rank>; template mdspan_view(R &&r, dr::__detail::dr_extents full_shape, - dr::__detail::dr_extents tile_shape) + dr::__detail::dr_extents tile_shape, distribution dist) -> mdspan_view, Rank>; template -concept is_mdspan_view = - dr::distributed_range && requires(R &r) { r.mdspan(); }; +concept is_mdspan_view = dr::distributed_range && requires(R &r) { + r.mdspan(); +}; } // namespace dr::mp @@ -213,17 +269,20 @@ namespace dr::mp::views { template class mdspan_adapter_closure { public: mdspan_adapter_closure(dr::__detail::dr_extents full_shape, - dr::__detail::dr_extents tile_shape) - : full_shape_(full_shape), tile_shape_(tile_shape), tile_valid_(true) {} + dr::__detail::dr_extents tile_shape, + distribution dist) + : full_shape_(full_shape), tile_shape_(tile_shape), tile_valid_(true), + dist_(dist) {} - mdspan_adapter_closure(dr::__detail::dr_extents full_shape) - : full_shape_(full_shape) {} + mdspan_adapter_closure(dr::__detail::dr_extents full_shape, + distribution dist) + : full_shape_(full_shape), dist_(dist) {} template auto operator()(R &&r) const { if (tile_valid_) { - return mdspan_view(std::forward(r), full_shape_, tile_shape_); + return mdspan_view(std::forward(r), full_shape_, tile_shape_, dist_); } else { - return mdspan_view(std::forward(r), full_shape_); + return mdspan_view(std::forward(r), full_shape_, dist_); } } @@ -236,31 +295,35 @@ template class mdspan_adapter_closure { dr::__detail::dr_extents full_shape_; dr::__detail::dr_extents tile_shape_; bool tile_valid_ = false; + distribution dist_; }; class mdspan_fn_ { public: template - auto operator()(R &&r, Shape &&full_shape, Shape &&tile_shape) const { + auto operator()(R &&r, Shape &&full_shape, Shape &&tile_shape, + distribution dist) const { return mdspan_adapter_closure(std::forward(full_shape), - std::forward(tile_shape))( - std::forward(r)); + std::forward(tile_shape), + dist)(std::forward(r)); } template - auto operator()(R &&r, Shape &&full_shape) const { - return mdspan_adapter_closure(std::forward(full_shape))( - std::forward(r)); + auto operator()(R &&r, Shape &&full_shape, distribution dist) const { + return mdspan_adapter_closure(std::forward(full_shape), + dist)(std::forward(r)); } template - auto operator()(Shape &&full_shape, Shape &&tile_shape) const { + auto operator()(Shape &&full_shape, Shape &&tile_shape, + distribution dist) const { return mdspan_adapter_closure(std::forward(full_shape), - std::forward(tile_shape)); + std::forward(tile_shape), dist); } - template auto operator()(Shape &&full_shape) const { - return mdspan_adapter_closure(std::forward(full_shape)); + template + auto operator()(Shape &&full_shape, distribution dist) const { + return mdspan_adapter_closure(std::forward(full_shape), dist); } }; diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index 32f26d120a..175915a87d 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -37,6 +37,7 @@ add_executable( copy.cpp distributed_vector.cpp halo.cpp + halo-2d.cpp mdstar.cpp mpsort.cpp reduce.cpp @@ -46,38 +47,53 @@ add_executable( wave_kernel.cpp) add_executable( - mp-tests-3 - mp-tests.cpp - communicator-3.cpp - halo-3.cpp - slide_view-3.cpp) + mp-tests-3 + mp-tests.cpp + communicator-3.cpp + halo-3.cpp + slide_view-3.cpp + wide-halo-1d-3.cpp + wide-halo-2d-3.cpp +) -# mp-quick-test is for development. By reducing the number of source files, it +# mp-quick-test and mp-quick-test-3-only is for development. By reducing the number of source files, it # builds much faster. Change the source files to match what you need to test. It # is OK to commit changes to the source file list. - add_executable(mp-quick-test mp-tests.cpp - ../common/count.cpp - ) + halo-2d.cpp +) +add_executable(mp-quick-test-3-only + mp-tests.cpp + wide-halo-2d-3.cpp +) # cmake-format: on target_compile_definitions(mp-quick-test PRIVATE QUICK_TEST) +target_compile_definitions(mp-quick-test-3-only PRIVATE QUICK_TEST) -foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test) +foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test mp-quick-test-3-only) if(ENABLE_ISHMEM) target_link_ishmem(${test-exec}) endif() target_link_libraries(${test-exec} GTest::gtest_main cxxopts DR::mpi) set_property(TARGET ${test-exec} PROPERTY RULE_LAUNCH_COMPILE - "${CMAKE_COMMAND} -E time") + "${CMAKE_COMMAND} -E time") endforeach() +# Game of life +add_executable(game_of_life game_of_life.cpp) +if(ENABLE_ISHMEM) + target_link_ishmem(game_of_life) +endif() +target_link_libraries(game_of_life cxxopts DR::mpi) + # tests without --sycl flag will fail on IshmemBackend TODO: make them be # running somehow if ENABLE_ISHMEM will be default CI config if(NOT ENABLE_ISHMEM) add_mp_ctest(NAME mp-quick-test NPROC 1) add_mp_ctest(NAME mp-quick-test NPROC 2) + add_mp_ctest(NAME mp-quick-test-3-only NPROC 3) cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) @@ -88,8 +104,7 @@ if(NOT ENABLE_ISHMEM) foreach(nproc RANGE 2 4) add_mp_ctest(NAME mp-tests NPROC ${nproc} TIMEOUT 150) endforeach() - add_mp_ctest( - TEST_NAME mp-tests-3-only NAME mp-tests-3 NPROC 3 TIMEOUT 150) + add_mp_ctest(TEST_NAME mp-tests-3-only NAME mp-tests-3 NPROC 3 TIMEOUT 150) endif() if(ENABLE_SYCL) @@ -103,7 +118,7 @@ if(ENABLE_SYCL) # equality of these values: *(--counted_result.end()) Which is: 5, should be # 77 Mdspan, Mdarray hangs sometimes on ISHMEM. set(sycl-exclusions - ${sycl-exclusions}Halo3/*:Sort*:Counted/*:Mdspan*:Mdarray*:) + ${sycl-exclusions}Halo3/*:Sort*:Counted/*:Mdspan*:Mdarray*:) endif() foreach(nproc RANGE 1 4) @@ -111,6 +126,8 @@ if(ENABLE_SYCL) add_mp_pvc_ctest( NAME mp-quick-test NPROC ${nproc} OFFLOAD SYCL TARGS --device-memory) endforeach() + add_mp_ctest(NAME mp-quick-test-3-only NPROC 3 SYCL) + add_mp_ctest(NAME mp-quick-test-3-only NPROC 3 OFFLOAD SYCL TARGS --device-memory) add_mp_pvc_ctest( NAME mp-tests NPROC 2 TIMEOUT 150 OFFLOAD SYCL TARGS --device-memory diff --git a/test/gtest/mp/copy.cpp b/test/gtest/mp/copy.cpp index 66e54def9c..a54d194890 100644 --- a/test/gtest/mp/copy.cpp +++ b/test/gtest/mp/copy.cpp @@ -6,9 +6,7 @@ // Fixture -template class CopyMP : public testing::Test { -public: -}; +template class CopyMP : public testing::Test { public: }; TYPED_TEST_SUITE(CopyMP, AllTypes); diff --git a/test/gtest/mp/game_of_life.cpp b/test/gtest/mp/game_of_life.cpp new file mode 100644 index 0000000000..446338c0b9 --- /dev/null +++ b/test/gtest/mp/game_of_life.cpp @@ -0,0 +1,282 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cxxopts.hpp" +#include "dr/mp.hpp" +#include "mpi.h" + +inline void barrier() { dr::mp::barrier(); } +inline void fence() { dr::mp::fence(); } +inline void fence_on(auto &&obj) { obj.fence(); } + +#include +#include +#include + +struct MPI_data { + MPI_Comm comm; + int rank; + int size; + + bool host() { + return rank == 0; + } +}; + +static MPI_data mpi_data; + +struct Options { + std::size_t width; + std::size_t height; + std::size_t steps; + std::size_t redundancy; + bool debug; + + std::unique_ptr logfile; + + bool sycl; + bool device_memory; +}; + +namespace GameOfLife { + +using T = int; +using Array = dr::mp::distributed_mdarray; + +void init(std::size_t n, Array& out) { + std::vector> in(4, std::vector(4, 0)); + /* + 1 0 0 + 0 1 1 + 1 1 0 + */ + // clang-format off + in[1][1] = 1; in[1][2] = 0; in[1][3] = 0; + in[2][1] = 0; in[2][2] = 1; in[2][3] = 1; + in[3][1] = 1; in[3][2] = 1; in[3][3] = 0; + // clang-format on + std::vector local(n * 4); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + local[i * n + j] = in[i][j]; + } + } + dr::mp::copy(local.begin(), local.end(), out.begin()); +} + +void run(std::size_t n, std::size_t m, std::size_t redundancy, std::size_t steps, bool debug) { + if (mpi_data.host()) { + std::cout << "Using backend: dr" << std::endl; + std::cout << "Grid size: " << n << " x " << m << std::endl; + std::cout << "Time steps:" << steps << std::endl; + std::cout << "Redundancy " << redundancy << std::endl; + std::cout << std::endl; + } + + // construct grid + auto dist = dr::mp::distribution().halo(1).redundancy(redundancy); + Array array({n, m}, dist); + Array array_out({n, m}, dist); + dr::mp::fill(array, 0); + + init(m, array); + + // execute one calculation for one cell in game of life + auto calculate = [](auto stencils) { + auto [x, x_out] = stencils; + // because below we calculate the sum of all 9 cells, + // but we want the output only of 8 neighbourhs, subtract the value of self. + int live_neighbours = -x(0, 0); + for (int i = -1; i <= 1; i++) { + for (int j = -1; j <= 1; j++) { + live_neighbours += x(i, j); // alive == 1, dead == 0, so simple addition works + } + } + + if (x(0, 0) == 1) { // self if alive + if (live_neighbours == 2 || live_neighbours == 3) { + x_out(0, 0) = 1; + } else { + x_out(0, 0) = 0; + } + } + else { // self is dead + if (live_neighbours == 3) { + x_out(0, 0) = 1; + } else { + x_out(0, 0) = 0; + } + } + }; + + // assign values of second array to first array + auto assign = [](auto stencils) { + auto [x, x_out] = stencils; + x(0, 0) = x_out(0, 0); + }; + + auto print = [n, m](const auto &v) { + std::vector local(n * n); + dr::mp::copy(0, v, local.begin()); + if (mpi_data.host()) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < m; j++) { + fmt::print("{}", local[i * m + j] == 1 ? '#' : '.'); + } + fmt::print("\n"); + } + } + }; + + std::chrono::duration exchange_duration; + std::size_t exchange_count = 0; + + auto tic = std::chrono::steady_clock::now(); + for (std::size_t i = 0, next_treshold = 0; i < steps; i++) { + if (i >= next_treshold && mpi_data.host()) { + next_treshold += round(static_cast(steps / 100)); + double percent = round(static_cast(i) * 100 / static_cast(steps)); + fmt::print("Steps done {}% ({} of {} steps)\n", percent, i, steps); + } + + // step + stencil_for_each_extended<2>(calculate, {1, 1}, {1, 1}, array, array_out); + stencil_for_each_extended<2>(assign, {0, 0}, {0, 0}, array, array_out); + + // phase with communication - once after (redundancy - 1) steps without communication + if ((i + 1) % redundancy == 0) { + if (debug && mpi_data.host()) { + fmt::print("Exchange at step {}\n", i); + } + auto exchange_tic = std::chrono::steady_clock::now(); + array.halo().exchange(); + auto exchange_toc = std::chrono::steady_clock::now(); + exchange_duration += exchange_toc - exchange_tic; + exchange_count++; + + // Array_out is a temporary, no need to exchange it + } + + // debug print + if (debug) { + if (mpi_data.host()) { + fmt::print("Array {}:\n", i); + } + // print needs a synchronication accros MPI boundary (dr::mp::copy), each node has to execute it + print(array); + } + } + auto toc = std::chrono::steady_clock::now(); + + std::chrono::duration duration = toc - tic; + + if (mpi_data.host()) { + double t_cpu = duration.count(); + double t_exch = exchange_duration.count(); + double t_step = t_cpu / static_cast(steps); + double t_exch_step = t_exch / static_cast(exchange_count); + + fmt::print("Steps done 100% ({} of {} steps)\n", steps, steps); + fmt::print("Duration {} s, including exchange total time {} s\n", t_cpu, t_exch); + fmt::print("Time per step {} ms\n", t_step * 1000); + fmt::print("Time per exchange {} ms\n", t_exch_step * 1000); + } +} + +} // namespace GameOfLife + +// Initialization functions + +void init_MPI(int argc, char *argv[]) { + MPI_Init(&argc, &argv); + mpi_data.comm = MPI_COMM_WORLD; + MPI_Comm_rank(mpi_data.comm, &mpi_data.rank); + MPI_Comm_size(mpi_data.comm, &mpi_data.size); + + dr::drlog.debug("MPI: rank = {}, size = {}\n", mpi_data.rank, mpi_data.size); +} + +Options parse_options(int argc, char *argv[]) { + Options out; + + cxxopts::Options options_spec(argv[0], "game of life"); + + // clang-format off + options_spec.add_options() + ("drhelp", "Print help") + ("log", "Enable logging") + ("logprefix", "appended .RANK.log", cxxopts::value()->default_value("dr")) + ("log-filter", "Filter the log", cxxopts::value>()) + ("device-memory", "Use device memory") + ("sycl", "Execute on SYCL device") + ("d,debug", "enable debug logging") + ("n,size", "Grid width", cxxopts::value()->default_value("128")) + ("m,height", "Grid height", cxxopts::value()->default_value("128")) + ("t,steps", "Run a fixed number of time steps.", cxxopts::value()->default_value("100")) + ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value()->default_value("2")); + // clang-format on + + cxxopts::ParseResult options; + try { + options = options_spec.parse(argc, argv); + } catch (const cxxopts::OptionParseException &e) { + std::cout << options_spec.help() << "\n"; + exit(1); + } + + out.sycl = options.count("sycl") != 0; + out.device_memory = options.count("device-memory") != 0; + + if (options.count("drhelp")) { + std::cout << options_spec.help() << "\n"; + exit(0); + } + + if (options.count("log")) { + out.logfile.reset(new std::ofstream(options["logprefix"].as() + + fmt::format(".{}.log", mpi_data.rank))); + dr::drlog.set_file(*out.logfile); + if (options.count("log-filter")) { + dr::drlog.filter(options["log-filter"].as>()); + } + } + + out.width = options["n"].as(); + out.height = options.count("m") != 0 ? options["m"].as() : out.width; + out.redundancy = options["r"].as(); + out.steps = options["t"].as(); + + out.debug = options.count("debug") != 0; + + return out; +} + +void dr_init(const Options& options) { +#ifdef SYCL_LANGUAGE_VERSION + if (options.sycl) { + sycl::queue q; + fmt::print("Running on sycl device: {}, memory: {}\n", q.get_device().get_info(), options.device_memory ? "device" : "shared"); + dr::mp::init(q, options.device_memory ? sycl::usm::alloc::device + : sycl::usm::alloc::shared); + return; + } +#endif + fmt::print("Running on CPU\n"); + dr::mp::init(); +} + +// Main loop + +int main(int argc, char *argv[]) { + init_MPI(argc, argv); + Options options = parse_options(argc, argv); + dr_init(options); + + GameOfLife::run(options.width, options.height, options.redundancy, options.steps, options.debug); + + dr::mp::finalize(); + MPI_Finalize(); + + return 0; +} diff --git a/test/gtest/mp/halo-2d.cpp b/test/gtest/mp/halo-2d.cpp new file mode 100644 index 0000000000..5168bd6511 --- /dev/null +++ b/test/gtest/mp/halo-2d.cpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +TEST(Halo2D, exchange_2d_test) { + dr::mp::distributed_mdarray dv({10, 10}, dr::mp::distribution().halo(1)); + + DRLOG("exchange start"); + dv.halo().exchange(); + DRLOG("exchange end"); +} diff --git a/test/gtest/mp/halo-3.cpp b/test/gtest/mp/halo-3.cpp index dfc372ebd5..aa4127f397 100644 --- a/test/gtest/mp/halo-3.cpp +++ b/test/gtest/mp/halo-3.cpp @@ -18,19 +18,27 @@ TYPED_TEST(Halo3, halo_is_visible_after_exchange_not_earlier) { dv.halo().exchange(); fill(dv, 13); + std::cout << "switch to check values: \n"; switch (dr::mp::default_comm().rank()) { case 0: + std::cout << "dv[0] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 0).local(), 13); + std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 1).local(), 7); break; case 1: + std::cout << "dv[0] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 0).local(), 7); + std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 1).local(), 13); + std::cout << "dv[2] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 2).local(), 7); break; case 2: EXPECT_EQ(*(dv.begin() + 1).local(), 7); + std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 2).local(), 13); + std::cout << "dv[2] = " << *(dv.begin() + 0).local() << "\n"; break; } @@ -222,3 +230,72 @@ TYPED_TEST(Halo3, dv_halos_prev_0) { EXPECT_EQ(*(dv.begin() + 5).local(), 6); } } + +TYPED_TEST(Halo3, halo_wide) { + TypeParam dv(9, dr::mp::distribution().halo(3)); + fill(dv, 7); + dv.halo().exchange(); + + fill(dv, 13); + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 7); + EXPECT_EQ(*(dv.begin() + 4).local(), 7); + EXPECT_EQ(*(dv.begin() + 5).local(), 7); + break; + case 1: + EXPECT_EQ(*(dv.begin() + 0).local(), 7); + EXPECT_EQ(*(dv.begin() + 1).local(), 7); + EXPECT_EQ(*(dv.begin() + 2).local(), 7); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 7); + EXPECT_EQ(*(dv.begin() + 7).local(), 7); + EXPECT_EQ(*(dv.begin() + 8).local(), 7); + break; + case 2: + EXPECT_EQ(*(dv.begin() + 3).local(), 7); + EXPECT_EQ(*(dv.begin() + 4).local(), 7); + EXPECT_EQ(*(dv.begin() + 5).local(), 7); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; + } + + dv.halo().exchange(); + + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + break; + case 1: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; + case 2: + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; + } +} \ No newline at end of file diff --git a/test/gtest/mp/mdstar.cpp b/test/gtest/mp/mdstar.cpp index 0e964a0d18..7670a0717e 100644 --- a/test/gtest/mp/mdstar.cpp +++ b/test/gtest/mp/mdstar.cpp @@ -35,7 +35,7 @@ class Mdspan : public ::testing::Test { TEST_F(Mdspan, StaticAssert) { xp::distributed_vector dist(n2d, dist2d_1d); - auto mdspan = xp::views::mdspan(dist, extents2d); + auto mdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); static_assert(rng::forward_range); static_assert(dr::distributed_range); auto segments = dr::ranges::segments(mdspan); @@ -47,7 +47,7 @@ TEST_F(Mdspan, StaticAssert) { TEST_F(Mdspan, Iterator) { xp::distributed_vector dist(n2d, dist2d_1d); - auto mdspan = xp::views::mdspan(dist, extents2d); + auto mdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); *mdspan.begin() = 17; xp::fence(); @@ -57,7 +57,7 @@ TEST_F(Mdspan, Iterator) { TEST_F(Mdspan, Mdindex2D) { xp::distributed_vector dist(n2d, dist2d_1d); - auto dmdspan = xp::views::mdspan(dist, extents2d); + auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); std::size_t i = 1, j = 2; dmdspan.mdspan()(i, j) = 17; @@ -68,7 +68,7 @@ TEST_F(Mdspan, Mdindex2D) { TEST_F(Mdspan, Mdindex3D) { xp::distributed_vector dist(n3d, dist3d_1d); - auto dmdspan = xp::views::mdspan(dist, extents3d); + auto dmdspan = xp::views::mdspan(dist, extents3d, dist3d_1d); std::size_t i = 1, j = 2, k = 0; dmdspan.mdspan()(i, j, k) = 17; @@ -79,7 +79,7 @@ TEST_F(Mdspan, Mdindex3D) { TEST_F(Mdspan, Pipe) { xp::distributed_vector dist(n2d, dist2d_1d); - auto mdspan = dist | xp::views::mdspan(extents2d); + auto mdspan = dist | xp::views::mdspan(extents2d, dist2d_1d); *mdspan.begin() = 17; xp::fence(); @@ -89,7 +89,7 @@ TEST_F(Mdspan, Pipe) { TEST_F(Mdspan, SegmentExtents) { xp::distributed_vector dist(n2d, dist2d_1d); - auto dmdspan = xp::views::mdspan(dist, extents2d); + auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); // Sum of leading dimension matches original std::size_t x = 0; @@ -106,7 +106,7 @@ TEST_F(Mdspan, Subrange) { xp::distributed_vector dist(n2d, dist2d_1d); auto inner = rng::subrange(dist.begin() + ydim, dist.end() - ydim); std::array inner_extents({extents2d[0] - 2, extents2d[1]}); - auto dmdspan = xp::views::mdspan(inner, inner_extents); + auto dmdspan = xp::views::mdspan(inner, inner_extents, dist2d_1d); // Summing up leading dimension size of segments should equal // original minus 2 rows @@ -123,7 +123,7 @@ TEST_F(Mdspan, Subrange) { TEST_F(Mdspan, GridExtents) { xp::distributed_vector dist(n2d, dist2d_1d); xp::iota(dist, 100); - auto dmdspan = xp::views::mdspan(dist, extents2d); + auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); auto grid = dmdspan.grid(); auto x = 0; @@ -147,7 +147,7 @@ TEST_F(Mdspan, GridLocalReference) { xp::distributed_vector dist(n2d, dist2d_1d); xp::iota(dist, 100); - auto dmdspan = xp::views::mdspan(dist, extents2d); + auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); auto grid = dmdspan.grid(); auto tile = grid(0, 0).mdspan(); diff --git a/test/gtest/mp/reduce.cpp b/test/gtest/mp/reduce.cpp index e663188bbd..4fb683138b 100644 --- a/test/gtest/mp/reduce.cpp +++ b/test/gtest/mp/reduce.cpp @@ -6,9 +6,7 @@ // Fixture -template class ReduceMP : public testing::Test { -public: -}; +template class ReduceMP : public testing::Test { public: }; TYPED_TEST_SUITE(ReduceMP, AllTypes); diff --git a/test/gtest/mp/segments.cpp b/test/gtest/mp/segments.cpp index 029a5faaa3..8817bed379 100644 --- a/test/gtest/mp/segments.cpp +++ b/test/gtest/mp/segments.cpp @@ -6,9 +6,7 @@ #include -template class Segmented : public testing::Test { -public: -}; +template class Segmented : public testing::Test { public: }; TYPED_TEST_SUITE(Segmented, AllTypesWithoutIshmem); @@ -27,9 +25,7 @@ TYPED_TEST(Segmented, Basic) { EXPECT_EQ(dr::ranges::segments(ops.dist_vec), segmented); } -template class SegmentUtils : public testing::Test { -public: -}; +template class SegmentUtils : public testing::Test { public: }; // traversing on host over local_segment does not work in case of both: // device_memory and IshmemBackend (which uses device memory) diff --git a/test/gtest/mp/wide-halo-1d-3.cpp b/test/gtest/mp/wide-halo-1d-3.cpp new file mode 100644 index 0000000000..686e09dcf1 --- /dev/null +++ b/test/gtest/mp/wide-halo-1d-3.cpp @@ -0,0 +1,193 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +template class WideHalo3_1D : public testing::Test {}; + +using T = int; +using Array = dr::mp::distributed_vector; + +static const std::size_t redundancy = 2; +static const std::size_t size = 6; + +static dr::mp::distribution get_distribution() { + return dr::mp::distribution().halo(1).redundancy(redundancy); +} + +static int &get(Array &v, std::size_t i) { return *(v.begin() + i).local(); } + +TEST(WideHalo3_1D, suite_works_for_3_processes_only) { + EXPECT_EQ(dr::mp::default_comm().size(), 3); +} + +TEST(WideHalo3_1D, halo_is_visible_after_exchange_not_earlier) { + dr::mp::distribution dist = get_distribution(); + Array dv(size, dist); + Array dv_out(size, dist); + + fill(dv, 1); + fill(dv_out, 1); + dv.halo().exchange(); + dv_out.halo().exchange(); + + auto print = [&](const auto &v) { + for (auto seg : v.segments()) { + for (auto i = seg.begin_stencil({0ul})[0]; i < seg.end_stencil({0ul})[0]; + i++) { + std::cout << *(seg.begin() + i).local() << " "; + } + } + std::cout << "\n"; + }; + + auto transform = [&] { + stencil_for_each_extended<1>( + [](auto stencils) { + auto [x, x_out] = stencils; + x_out(0) = x(-1) + x(0) + x(1); + }, + {1}, {1}, dv, dv_out); + stencil_for_each_extended<1>( + [](auto stencils) { + auto [x, x_out] = stencils; + x(0) = x_out(0); + }, + {0}, {0}, dv, dv_out); + }; + + transform(); + print(dv); + + // after first step, only actually stored values and their neighbours are + // guaranteed to be correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 3); + EXPECT_EQ(get(dv, 2), 3); + EXPECT_EQ(get(dv, 3), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 3); + EXPECT_EQ(get(dv, 2), 3); + EXPECT_EQ(get(dv, 3), 3); + EXPECT_EQ(get(dv, 4), 3); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 1); + EXPECT_EQ(get(dv, 3), 3); + EXPECT_EQ(get(dv, 4), 3); + EXPECT_EQ(get(dv, 5), 1); + break; + } + + // after second step, only actually stored values are guaranteed to be correct + + transform(); + print(dv); + + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 7); + EXPECT_EQ(get(dv, 3), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 1); + EXPECT_EQ(get(dv, 3), 7); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + } + + // after exchange all are correct + dv.halo().exchange(); + print(dv); + + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + } +} + +TEST(WideHalo3_1D, halo_api_works) { + dr::mp::distribution dist = get_distribution(); + Array dv(size, dist); + Array dv_out(size, dist); + + fill(dv, 1); + fill(dv_out, 1); + dv.halo().exchange(); + dv_out.halo().exchange(); + + halo_exchange( + [](Array &dv, Array &dv_out) { + stencil_for_each_extended<1>( + [](auto stencils) { + auto [x, x_out] = stencils; + x_out(0) = x(-1) + x(0) + x(1); + }, + {1}, {1}, dv, dv_out); + stencil_for_each_extended<1>( + [](auto stencils) { + auto [x, x_out] = stencils; + x(0) = x_out(0); + }, + {0}, {0}, dv, dv_out); + }, + dv, dv_out); + // after exchange all are correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + } +} diff --git a/test/gtest/mp/wide-halo-2d-3.cpp b/test/gtest/mp/wide-halo-2d-3.cpp new file mode 100644 index 0000000000..08ce9fcc96 --- /dev/null +++ b/test/gtest/mp/wide-halo-2d-3.cpp @@ -0,0 +1,267 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +template class WideHalo3_2D : public testing::Test {}; + +using T = int; +using Array = dr::mp::distributed_mdarray; + +static const std::size_t redundancy = 2; +static const std::array size = {6, 6}; + +static dr::mp::distribution get_distribution() { + return dr::mp::distribution().halo(1).redundancy(redundancy); +} + +static int &get(Array &v, std::size_t i, std::size_t j) { + return *(v.begin() + i * size[0] + j).local(); +} + +TEST(WideHalo3_2D, suite_works_for_3_processes_only) { + EXPECT_EQ(dr::mp::default_comm().size(), 3); +} + +TEST(WideHalo3_2D, halo2d_is_visible_after_exchange_not_earlier) { + dr::mp::distribution dist = get_distribution(); + Array dv(size, dist); + Array dv_out(size, dist); + + fill(dv, 1); + fill(dv_out, 1); + dv.halo().exchange(); + dv_out.halo().exchange(); + + auto transform = [&] { + stencil_for_each_extended<2>( + [](auto stencils) { + auto [x, x_out] = stencils; + x_out(0, 0) = 0; + for (int i = -1; i <= 1; i++) { + for (int j = -1; j <= 1; j++) { + x_out(0, 0) += x(i, j); + } + } + }, + {1, 1}, {1, 1}, dv, dv_out); + stencil_for_each_extended<2>( + [](auto stencils) { + auto [x, x_out] = stencils; + x(0, 0) = x_out(0, 0); + }, + {0, 0}, {0, 0}, dv, dv_out); + }; + auto print = [](std::string s, const auto &v) { + std::cout << s << "\n"; + for (auto seg : v.segments()) { + auto [beg, end] = seg.stencil({0, 0}, {0, 0}); + for (std::size_t i = beg[0]; i < end[0]; i++) { + for (std::size_t j = beg[1]; j < end[1]; j++) { + std::cout << seg.mdspan_extended()(i, j) << "\t"; + } + std::cout << "\n"; + } + } + std::cout << "\n"; + }; + + print("dv", dv); + + transform(); + print("dv", dv); + // after first step, only actually stored values and their neighbours are + // guaranteed to be correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 9); + EXPECT_EQ(get(dv, 2, 1), 9); + EXPECT_EQ(get(dv, 3, 1), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 9); + EXPECT_EQ(get(dv, 2, 1), 9); + EXPECT_EQ(get(dv, 3, 1), 9); + EXPECT_EQ(get(dv, 4, 1), 9); + EXPECT_EQ(get(dv, 5, 1), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 1); + EXPECT_EQ(get(dv, 3, 1), 9); + EXPECT_EQ(get(dv, 4, 1), 9); + EXPECT_EQ(get(dv, 5, 1), 1); + break; + } + + transform(); + print("dv", dv); + // after second step, only actually stored values are guaranteed to be correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 41); + EXPECT_EQ(get(dv, 3, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 57); + EXPECT_EQ(get(dv, 3, 2), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 1); + EXPECT_EQ(get(dv, 3, 1), 41); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 1); + EXPECT_EQ(get(dv, 3, 2), 57); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + } + + dv.halo().exchange(); + dv_out.halo().exchange(); + print("dv", dv); + // after exchange all are correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + } +} + +TEST(WideHalo3_2D, halo2d_api_works) { + dr::mp::distribution dist = get_distribution(); + Array dv(size, dist); + Array dv_out(size, dist); + + fill(dv, 1); + fill(dv_out, 1); + dv.halo().exchange(); + dv_out.halo().exchange(); + + auto print = [](std::string s, const auto &v) { + std::cout << s << "\n"; + for (auto seg : v.segments()) { + auto [beg, end] = seg.stencil({0, 0}, {0, 0}); + for (std::size_t i = beg[0]; i < end[0]; i++) { + for (std::size_t j = beg[1]; j < end[1]; j++) { + std::cout << seg.mdspan_extended()(i, j) << "\t"; + } + std::cout << "\n"; + } + } + std::cout << "\n"; + }; + + print("dv", dv); + + halo_exchange( + [](Array &dv, Array &dv_out) { + stencil_for_each_extended<2>( + [](auto stencils) { + auto [x, x_out] = stencils; + x_out(0, 0) = 0; + for (int i = -1; i <= 1; i++) { + for (int j = -1; j <= 1; j++) { + x_out(0, 0) += x(i, j); + } + } + }, + {1, 1}, {1, 1}, dv, dv_out); + stencil_for_each_extended<2>( + [](auto stencils) { + auto [x, x_out] = stencils; + x(0, 0) = x_out(0, 0); + }, + {0, 0}, {0, 0}, dv, dv_out); + }, + dv, dv_out); + + print("dv", dv); + // after exchange all are correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + } +} diff --git a/test/gtest/mp/xp-tests.hpp b/test/gtest/mp/xp-tests.hpp index d4c14fc0e5..4705b53baf 100644 --- a/test/gtest/mp/xp-tests.hpp +++ b/test/gtest/mp/xp-tests.hpp @@ -20,15 +20,15 @@ namespace xp = dr::mp; template concept compliant_view = rng::forward_range && rng::random_access_range && - rng::viewable_range && requires(V &v) { - // test one at a time so error is apparent - dr::ranges::segments(v); - dr::ranges::segments(v).begin(); - *dr::ranges::segments(v).begin(); - dr::ranges::rank(*dr::ranges::segments(v).begin()); - // dr::ranges::local(rng::begin(dr::ranges::segments(v)[0])); - // dr::mp::local_segments(v); - }; + rng::viewable_range && requires(V &v) { + // test one at a time so error is apparent + dr::ranges::segments(v); + dr::ranges::segments(v).begin(); + *dr::ranges::segments(v).begin(); + dr::ranges::rank(*dr::ranges::segments(v).begin()); + // dr::ranges::local(rng::begin(dr::ranges::segments(v)[0])); + // dr::mp::local_segments(v); +}; inline void barrier() { dr::mp::barrier(); } inline void fence() { dr::mp::fence(); }