Interleaved computation with communication in halo exchange #881
base: main
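The diff below splits each process's data into two segments (`DUAL_SEGMENTS_PER_PROC`) and drives their halos through a `cyclic_span_halo`, so one segment's halo exchange can be in flight while the other segment is being computed. For orientation, a textbook version of the overlap the title describes looks roughly like this. It is only a sketch with hypothetical names (`halo_t`, `step`, the stencil), not code taken from this PR:

```cpp
#include <cstddef>
#include <vector>

struct halo_t {
  void exchange_begin() { /* post non-blocking transfers of boundary cells */ }
  void exchange_finalize() { /* wait for the outstanding transfers */ }
};

// One stencil step that hides halo communication behind interior work.
void step(const std::vector<double> &in, std::vector<double> &out,
          halo_t &halo) {
  halo.exchange_begin();                            // 1. start halo exchange
  for (std::size_t i = 2; i + 2 < in.size(); ++i)   // 2. interior cells need no remote data
    out[i] = 0.5 * (in[i - 1] + in[i + 1]);
  halo.exchange_finalize();                         // 3. received halo cells are now valid
  out[1] = 0.5 * (in[0] + in[2]);                   // 4. boundary cells last
  out[in.size() - 2] = 0.5 * (in[in.size() - 3] + in[in.size() - 1]);
}
```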
@@ -0,0 +1,338 @@
// SPDX-FileCopyrightText: Intel Corporation
//
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <dr/mp/allocator.hpp>
#include <dr/mp/containers/distribution.hpp>
#include <dr/mp/containers/dual_segment.hpp>

namespace dr::mp {

static constexpr std::size_t DUAL_SEGMENTS_PER_PROC = 2;

class DualMpiBackend {
  dr::rma_window win_;

public:
  void *allocate(std::size_t data_size) {
    assert(data_size > 0);
    void *data = __detail::allocator<std::byte>().allocate(data_size);
    DRLOG("called MPI allocate({}) -> got:{}", data_size, data);
    win_.create(default_comm(), data, data_size);
    active_wins().insert(win_.mpi_win());
    return data;
  }

  void deallocate(void *data, std::size_t data_size) {
    assert(data_size > 0);
    DRLOG("calling MPI deallocate ({}, data_size:{})", data, data_size);
    active_wins().erase(win_.mpi_win());
    win_.free();
    __detail::allocator<std::byte>().deallocate(static_cast<std::byte *>(data),
                                                data_size);
  }

  void getmem(void *dst, std::size_t offset, std::size_t datalen,
              int segment_index) {
    const std::size_t peer = get_peer(segment_index);

    DRLOG("calling MPI get(dst:{}, "
          "segm_offset:{}, size:{}, peer:{})",
          dst, offset, datalen, peer);

#if (MPI_VERSION >= 4) ||                                                      \
    (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))
    // 64-bit API inside
    win_.get(dst, datalen, peer, offset);
#else
    for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) {
      std::size_t s = std::min(remainder, (std::size_t)INT_MAX);
      DRLOG("{}:{} win_.get total {} now {} bytes at off {}, dst offset {}",
            default_comm().rank(), __LINE__, datalen, s, off, offset + off);
      win_.get((uint8_t *)dst + off, s, peer, offset + off);
      off += s;
      remainder -= s;
    }
#endif
  }

  void putmem(void const *src, std::size_t offset, std::size_t datalen,
              int segment_index) {
    const std::size_t peer = get_peer(segment_index);

    DRLOG("calling MPI put(segm_offset:{}, "
          "src:{}, size:{}, peer:{})",
          offset, src, datalen, peer);

#if (MPI_VERSION >= 4) ||                                                      \
    (defined(I_MPI_NUMVERSION) && (I_MPI_NUMVERSION > 20211200000))
    // 64-bit API inside
    win_.put(src, datalen, peer, offset);
#else
    for (std::size_t remainder = datalen, off = 0UL; remainder > 0;) {
      std::size_t s = std::min(remainder, (std::size_t)INT_MAX);
      DRLOG("{}:{} win_.put {} bytes at off {}, dst offset {}",
            default_comm().rank(), __LINE__, s, off, offset + off);
      win_.put((uint8_t *)src + off, s, peer, offset + off);
      off += s;
      remainder -= s;
    }
#endif
  }

  std::size_t getrank() { return win_.communicator().rank(); }

  void fence() { win_.fence(); }

private:
  std::size_t get_peer(const std::size_t segment_index) {
    const auto size = win_.communicator().size();
    return segment_index < size ? segment_index : 2 * size - segment_index - 1;
  }
};
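`get_peer` maps a global segment index to the rank that owns it: with `size` ranks there are `2 * size` segments, the first `size` map directly to ranks `0 .. size - 1`, and the second `size` map back in mirror order, so rank `r` owns segments `r` and `2 * size - 1 - r`. A small standalone check of that mapping, pulled out purely for illustration (it is not part of the diff):

```cpp
#include <cassert>
#include <cstddef>

// Same formula as DualMpiBackend::get_peer, extracted for illustration.
std::size_t peer_of(std::size_t segment_index, std::size_t comm_size) {
  return segment_index < comm_size ? segment_index
                                   : 2 * comm_size - segment_index - 1;
}

int main() {
  // With 4 ranks there are 8 segments; rank r owns segments r and 7 - r.
  assert(peer_of(0, 4) == 0 && peer_of(7, 4) == 0);
  assert(peer_of(1, 4) == 1 && peer_of(6, 4) == 1);
  assert(peer_of(3, 4) == 3 && peer_of(4, 4) == 3);
}
```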
/// distributed vector
template <typename T, class BackendT = DualMpiBackend>
class dual_distributed_vector {

public:
  using value_type = T;
  using size_type = std::size_t;
  using difference_type = std::ptrdiff_t;
  using backend_type = BackendT;

  class iterator {
  public:
    using iterator_category = std::random_access_iterator_tag;
    using value_type = typename dual_distributed_vector::value_type;
    using difference_type = typename dual_distributed_vector::difference_type;

    iterator() {}
    iterator(const dual_distributed_vector *parent, difference_type offset)
        : parent_(parent), offset_(offset) {}

    auto operator+(difference_type n) const {
      return iterator(parent_, offset_ + n);
    }
    friend auto operator+(difference_type n, const iterator &other) {
      return other + n;
    }
    auto operator-(difference_type n) const {
      return iterator(parent_, offset_ - n);
    }
    auto operator-(iterator other) const { return offset_ - other.offset_; }

    auto &operator+=(difference_type n) {
      offset_ += n;
      return *this;
    }
    auto &operator-=(difference_type n) {
      offset_ -= n;
      return *this;
    }
    auto &operator++() {
      offset_++;
      return *this;
    }
    auto operator++(int) {
      auto old = *this;
      offset_++;
      return old;
    }
    auto &operator--() {
      offset_--;
      return *this;
    }
    auto operator--(int) {
      auto old = *this;
      offset_--;
      return old;
    }

    bool operator==(iterator other) const {
      if (parent_ == nullptr || other.parent_ == nullptr) {
        return false;
      } else {
        return offset_ == other.offset_;
      }
    }
    auto operator<=>(iterator other) const {
      assert(parent_ == other.parent_);
      return offset_ <=> other.offset_;
    }

    auto operator*() const {
      auto segment_size = parent_->segment_size_;
      return parent_
          ->segments()[offset_ / segment_size][offset_ % segment_size];
    }
    auto operator[](difference_type n) const { return *(*this + n); }

    auto local() {
      auto segment_size = parent_->segment_size_;
      return (parent_->segments()[offset_ / segment_size].begin() +
              offset_ % segment_size)
          .local();
    }

    //
    // Support for distributed ranges
    //
    // distributed iterator provides segments
    // remote iterator provides local
    //
    auto segments() {
      return dr::__detail::drop_segments(parent_->segments(), offset_);
    }

  private:
    const dual_distributed_vector *parent_ = nullptr;
    difference_type offset_;
  };
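The iterator stores only a parent pointer and a flat offset; dereferencing splits that offset into a segment index (`offset_ / segment_size_`) and a position inside the segment (`offset_ % segment_size_`). A tiny sketch of that split, for illustration only (the helper `locate` is not in the diff):

```cpp
#include <cstddef>
#include <utility>

// Mirrors what iterator::operator*() does with parent_->segment_size_.
std::pair<std::size_t, std::size_t> locate(std::size_t offset,
                                           std::size_t segment_size) {
  return {offset / segment_size, offset % segment_size};
}
// e.g. locate(30, 13) == {2, 4}: third segment, fifth element within it.
```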
  // Do not copy
  // We need a move constructor for the implementation of reduce algorithm
  dual_distributed_vector(const dual_distributed_vector &) = delete;
  dual_distributed_vector &operator=(const dual_distributed_vector &) = delete;
  dual_distributed_vector(dual_distributed_vector &&) { assert(false); }

  /// Constructor
  dual_distributed_vector(std::size_t size = 0,
                          distribution dist = distribution()) {
    init(size, dist);
  }

  /// Constructor
  dual_distributed_vector(std::size_t size, value_type fill_value,
                          distribution dist = distribution()) {
    init(size, dist);
    mp::fill(*this, fill_value);
  }

  ~dual_distributed_vector() {
    if (finalized()) return;

    for (size_t i = 0; i < DUAL_SEGMENTS_PER_PROC; i++) {
      fence(i);

      if (datas_[i] != nullptr) {
        backends_[i].deallocate(datas_[i], data_size_ * sizeof(value_type));
      }

      delete halos_[i];
    }

    delete halo_;
  }

  /// Returns iterator to beginning
  auto begin() const { return iterator(this, 0); }
  /// Returns iterator to end
  auto end() const { return begin() + size_; }

  /// Returns size
  auto size() const { return size_; }
  /// Returns reference using index
  auto operator[](difference_type n) const { return *(begin() + n); }

  auto &halo() const { return *halo_; }

  auto segments() const { return rng::views::all(segments_); }
  auto segments() { return rng::views::all(segments_); }

  __attribute__((unused))
  void fence(const std::size_t i) { backends_[i].fence(); }

  auto res_idx(const std::size_t segment_index) const {
    return segment_index < default_comm().size() ? 0 : 1;
  }

  backend_type &backend(const std::size_t segment_index) {
    return backends_[res_idx(segment_index)];
  }
  const backend_type &backend(const std::size_t segment_index) const {
    return backends_[res_idx(segment_index)];
  }

  T *data(const std::size_t segment_index) {
    return datas_[res_idx(segment_index)];
  }

  std::size_t data_size() const { return data_size_; }
private:
  void init(auto size, auto dist) {
    size_ = size;
    distribution_ = dist;

    // determine the distribution of data
    auto comm_size = default_comm().size(); // dr-style ignore
    auto hb = dist.halo();
    std::size_t gran = dist.granularity();
    // TODO: make this an error that is reported back to user
    assert(size % gran == 0 && "size must be a multiple of the granularity");
    assert(hb.prev % gran == 0 && "size must be a multiple of the granularity");
    assert(hb.next % gran == 0 && "size must be a multiple of the granularity");

    std::size_t segment_count = comm_size * DUAL_SEGMENTS_PER_PROC;
    auto proc_segments_size = gran * std::max({
        (size / gran + segment_count - 1) / segment_count,
        hb.prev / gran,
        hb.next / gran});
    segment_size_ = proc_segments_size;

    std::size_t actual_segment_count_ =
        size_ / segment_size_ + (size_ % segment_size_ == 0 ? 0 : 1);
    assert(actual_segment_count_ <= segment_count
           && "there must be at most 2 segments per process");

    data_size_ = segment_size_ + hb.prev + hb.next;

    for (std::size_t i = 0; i < DUAL_SEGMENTS_PER_PROC; i++) {
      if (size_ > 0) {
        datas_.push_back(static_cast<T *>(
            backends_[i].allocate(data_size_ * sizeof(value_type))));
        std::memset(datas_[i], 69, data_size_ * sizeof(value_type));
        halos_.push_back(new dual_span_halo<T>(default_comm(), datas_[i],
                                               data_size_, hb, i == 1));
      }
    }

    halo_ = new cyclic_span_halo<T>(halos_);

    std::size_t segment_index = 0;
    for (std::size_t i = 0; i < size; i += segment_size_) {
      segments_.emplace_back(this, segment_index++,
                             std::min(segment_size_, size - i), data_size_);
    }

    for (size_t i = 0; i < default_comm().size(); i++) {
      segments_[default_comm().size() + i].swap_state();
    }

    for (size_t i = 0; i < DUAL_SEGMENTS_PER_PROC; i++) {
      fence(i);
    }
  }
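`init()` sizes every segment identically: with `comm_size` ranks there are `2 * comm_size` segments, each at least large enough to hold a halo side, and `data_size_` adds `hb.prev + hb.next` slots for the halo cells; `res_idx()` above then maps a global segment index to one of the two local slots (0 for the first `comm_size` segments, 1 for the mirrored half). A worked example of the sizing arithmetic, using illustrative numbers that are not taken from the PR:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

int main() {
  // Assumed inputs: 100 elements, 4 ranks, granularity 1, halo of 1 per side.
  std::size_t size = 100, comm_size = 4, gran = 1, prev = 1, next = 1;

  std::size_t segment_count = comm_size * 2; // DUAL_SEGMENTS_PER_PROC
  std::size_t segment_size =
      gran * std::max({(size / gran + segment_count - 1) / segment_count,
                       prev / gran, next / gran});
  std::size_t data_size = segment_size + prev + next; // payload + halo cells

  assert(segment_size == 13); // ceil(100 / 8)
  assert(data_size == 15);    // 13 + 1 + 1
  // 8 segments of 13 elements cover the vector; the last one holds only 9.
  assert(size - 7 * segment_size == 9);
}
```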
  friend dual_dv_segment_iterator<dual_distributed_vector>;

  std::size_t segment_size_ = 0;
  std::size_t data_size_ = 0; // size + halo

  std::vector<dual_span_halo<T> *> halos_;
  std::vector<T *> datas_;
  cyclic_span_halo<T> *halo_;

  distribution distribution_;
  std::size_t size_;
  std::vector<dual_dv_segment<dual_distributed_vector>> segments_;

Review comment: if these 2 lines are the only differences between the ordinary and dual vector, then let's pass different template parameters and make the segment type a template parameter.

  std::vector<backend_type> backends_{DUAL_SEGMENTS_PER_PROC};
};
template <typename T, typename B>
auto &halo(const dual_distributed_vector<T, B> &dv) {
  return dv.halo();
}

} // namespace dr::mp
Review comment: what is the difference between the DualMpiBackend and MpiBackend types? If there is none, please use one type.

Author reply: It's a leftover from when I was experimenting with changing some code in the backend, thanks for pointing it out.